debuggers.hg

changeset 16:790c2f0eaf7c

REFRESH to unstable changeset 18414. NO debugger changes.
author Mukesh Rathor
date Tue Sep 02 16:55:55 2008 -0700 (2008-09-02)
parents 5c0bf00e371d
children f875aaa791f0
files Config.mk docs/misc/vtpm.txt docs/src/user.tex docs/xen-api/xenapi-datamodel-graph.dot extras/mini-os/include/posix/dirent.h extras/mini-os/include/posix/err.h extras/mini-os/include/posix/pthread.h extras/mini-os/include/posix/syslog.h extras/mini-os/include/xenbus.h extras/mini-os/lib/sys.c extras/mini-os/main.c extras/mini-os/xenbus/xenbus.c stubdom/Makefile tools/Makefile tools/Rules.mk tools/cross-install tools/examples/xmexample.pv-grub tools/firmware/Rules.mk tools/firmware/hvmloader/32bitbios_support.c tools/firmware/hvmloader/Makefile tools/firmware/hvmloader/acpi/Makefile tools/firmware/hvmloader/acpi/build.c tools/firmware/hvmloader/hvmloader.c tools/firmware/hvmloader/tests.c tools/firmware/hvmloader/util.c tools/firmware/hvmloader/util.h tools/ioemu/hw/cirrus_vga.c tools/ioemu/hw/ide.c tools/ioemu/hw/pass-through.c tools/ioemu/hw/pci.c tools/ioemu/hw/vga.c tools/ioemu/hw/vga_int.h tools/ioemu/hw/xen_platform.c tools/ioemu/vl.c tools/ioemu/vl.h tools/libxc/xc_dom_boot.c tools/libxc/xc_domain.c tools/libxc/xc_domain_save.c tools/libxc/xc_minios.c tools/libxc/xenctrl.h tools/pygrub/src/pygrub tools/python/xen/util/pci.py tools/python/xen/xend/XendConfig.py tools/python/xen/xend/XendDomain.py tools/python/xen/xend/XendDomainInfo.py tools/python/xen/xend/image.py tools/python/xen/xend/server/DevController.py tools/python/xen/xend/server/pciif.py tools/xenmon/Makefile tools/xentrace/formats xen/Makefile xen/arch/ia64/xen/mm.c xen/arch/x86/acpi/power.c xen/arch/x86/cpu/amd.c xen/arch/x86/cpu/amd.h xen/arch/x86/domain.c xen/arch/x86/hvm/emulate.c xen/arch/x86/irq.c xen/arch/x86/microcode.c xen/arch/x86/mm.c xen/arch/x86/platform_hypercall.c xen/arch/x86/time.c xen/arch/x86/traps.c xen/arch/x86/x86_64/compat/mm.c xen/common/softirq.c xen/common/timer.c xen/common/xmalloc.c xen/drivers/passthrough/vtd/intremap.c xen/drivers/passthrough/vtd/iommu.c xen/include/asm-x86/io_apic.h xen/include/asm-x86/mm.h xen/include/asm-x86/msr-index.h xen/include/asm-x86/processor.h xen/include/asm-x86/softirq.h xen/include/public/memory.h xen/include/public/platform.h xen/include/xen/compat.h xen/include/xen/iommu.h xen/include/xen/timer.h xen/include/xlat.lst xen/include/xsm/xsm.h xen/xsm/dummy.c xen/xsm/flask/hooks.c
line diff
     1.1 --- a/Config.mk	Tue Sep 02 16:34:53 2008 -0700
     1.2 +++ b/Config.mk	Tue Sep 02 16:55:55 2008 -0700
     1.3 @@ -93,11 +93,7 @@ QEMU_REMOTE=http://xenbits.xensource.com
     1.4  # Mercurial in-tree version, or a local directory, or a git URL.
     1.5  # CONFIG_QEMU   ?= ioemu
     1.6  # CONFIG_QEMU   ?= ../qemu-xen.git
     1.7 -ifeq ($(XEN_TARGET_ARCH),ia64)
     1.8 -CONFIG_QEMU   ?= ioemu
     1.9 -else
    1.10  CONFIG_QEMU   ?= $(QEMU_REMOTE)
    1.11 -endif
    1.12  
    1.13  # Optional components
    1.14  XENSTAT_XENTOP     ?= y
     2.1 --- a/docs/misc/vtpm.txt	Tue Sep 02 16:34:53 2008 -0700
     2.2 +++ b/docs/misc/vtpm.txt	Tue Sep 02 16:55:55 2008 -0700
     2.3 @@ -92,8 +92,8 @@ the actual instance number that is assig
     2.4  can be different. This is the case if for example that particular
     2.5  instance is already used by another virtual machine. The association
     2.6  of which TPM instance number is used by which virtual machine is
     2.7 -kept in the file /etc/xen/vtpm.db. Associations are maintained by
     2.8 -domain name and instance number.
     2.9 +kept in the file /var/vtpm/vtpm.db. Associations are maintained by
    2.10 +a xend-internal vTPM UUID and vTPM instance number.
    2.11  
    2.12  Note: If you do not want TPM functionality for your user domain simply
    2.13  leave out the 'vtpm' line in the configuration file.
     3.1 --- a/docs/src/user.tex	Tue Sep 02 16:34:53 2008 -0700
     3.2 +++ b/docs/src/user.tex	Tue Sep 02 16:55:55 2008 -0700
     3.3 @@ -22,7 +22,7 @@
     3.4  \vfill
     3.5  \begin{tabular}{l}
     3.6  {\Huge \bf Users' Manual} \\[4mm]
     3.7 -{\huge Xen v3.0} \\[80mm]
     3.8 +{\huge Xen v3.3} \\[80mm]
     3.9  \end{tabular}
    3.10  \end{center}
    3.11  
    3.12 @@ -42,9 +42,7 @@ welcome.}
    3.13  
    3.14  \vspace*{\fill}
    3.15  
    3.16 -Xen is Copyright \copyright  2002-2005, University of Cambridge, UK, XenSource
    3.17 -Inc., IBM Corp., Hewlett-Packard Co., Intel Corp., AMD Inc., and others.  All
    3.18 -rights reserved.
    3.19 +Xen is Copyright \copyright  2002-2008, Citrix Systems, Inc., University of Cambridge, UK, XenSource Inc., IBM Corp., Hewlett-Packard Co., Intel Corp., AMD Inc., and others.  All rights reserved.
    3.20  
    3.21  Xen is an open-source project.  Most portions of Xen are licensed for copying
    3.22  under the terms of the GNU General Public License, version 2.  Other portions
    3.23 @@ -116,16 +114,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    3.24  
    3.25  
    3.26  Xen is an open-source \emph{para-virtualizing} virtual machine monitor
    3.27 -(VMM), or ``hypervisor'', for the x86 processor architecture. Xen can
    3.28 -securely execute multiple virtual machines on a single physical system
    3.29 -with close-to-native performance.  Xen facilitates enterprise-grade
    3.30 -functionality, including:
    3.31 +(VMM), or ``hypervisor'', for a variety of processor architectures including x86. Xen can securely execute multiple virtual machines on a single physical system with near native performance.  Xen facilitates enterprise-grade functionality, including:
    3.32  
    3.33  \begin{itemize}
    3.34  \item Virtual machines with performance close to native hardware.
    3.35  \item Live migration of running virtual machines between physical hosts.
    3.36  \item Up to 32\footnote{IA64 supports up to 64 virtual CPUs per guest virtual machine} virtual CPUs per guest virtual machine, with VCPU hotplug.
    3.37 -\item x86/32, x86/32 with PAE, x86/64, IA64 and Power platform support.
    3.38 +\item x86/32 with PAE, x86/64, and IA64 platform support.
    3.39  \item Intel and AMD Virtualization Technology for unmodified guest operating systems (including Microsoft Windows).
    3.40  \item Excellent hardware support (supports almost all Linux device
    3.41    drivers). 
    3.42 @@ -182,22 +177,20 @@ unmodified guests running natively on th
    3.43  
    3.44  Paravirtualized Xen support is available for increasingly many
    3.45  operating systems: currently, mature Linux support is available and
    3.46 -included in the standard distribution.  Other OS ports---including
    3.47 -NetBSD, FreeBSD and Solaris x86 v10---are nearing completion. 
    3.48 +included in the standard distribution.  Other OS ports, including
    3.49 +NetBSD, FreeBSD and Solaris are also complete. 
    3.50  
    3.51  
    3.52  \section{Hardware Support}
    3.53  
    3.54 -Xen currently runs on the x86 architecture, requiring a ``P6'' or
    3.55 -newer processor (e.g.\ Pentium Pro, Celeron, Pentium~II, Pentium~III,
    3.56 -Pentium~IV, Xeon, AMD~Athlon, AMD~Duron). Multiprocessor machines are
    3.57 -supported, and there is support for HyperThreading (SMT).  In 
    3.58 -addition, ports to IA64 and Power architectures are supported.
    3.59 -
    3.60 -The default 32-bit Xen supports for Intel's Physical Addressing Extensions (PAE), which enable x86/32 machines to address up to 64 GB of physical memory.
    3.61 -It also supports non-PAE 32-bit Xen up to 4GB of memory. 
    3.62 -Xen also supports x86/64 platforms such as Intel EM64T and AMD Opteron
    3.63 -which can currently address up to 1TB of physical memory.
    3.64 +Xen currently runs on the IA64 and x86 architectures. Multiprocessor
    3.65 +machines are supported, and there is support for HyperThreading (SMT).
    3.66 +
    3.67 +The default 32-bit Xen requires processor support for Physical
    3.68 +Addressing Extensions (PAE), which enables the hypervisor to address
    3.69 +up to 16GB of physical memory. Xen also supports x86/64 platforms
    3.70 +such as Intel EM64T and AMD Opteron which can currently address up to
    3.71 +1TB of physical memory.
    3.72  
    3.73  Xen offloads most of the hardware support issues to the guest OS
    3.74  running in the \emph{Domain~0} management virtual machine. Xen itself
    3.75 @@ -253,8 +246,8 @@ with pointers to papers and technical re
    3.76  Xen has grown into a fully-fledged project in its own right, enabling us
    3.77  to investigate interesting research issues regarding the best techniques
    3.78  for virtualizing resources such as the CPU, memory, disk and network.
    3.79 -Project contributors now include XenSource, Intel, IBM, HP, AMD, Novell,
    3.80 -RedHat.
    3.81 +Project contributors now include Citrix, Intel, IBM, HP, AMD, Novell,
    3.82 +RedHat, Sun, Fujitsu, and Samsung.
    3.83  
    3.84  Xen was first described in a paper presented at SOSP in
    3.85  2003\footnote{\tt
    3.86 @@ -265,25 +258,20 @@ sites.
    3.87  
    3.88  \section{What's New}
    3.89  
    3.90 -Xen 3.0.0 offers:
    3.91 +Xen 3.3.0 offers:
    3.92  
    3.93  \begin{itemize}
    3.94 -\item Support for up to 32-way SMP guest operating systems
    3.95 -\item Intel (Physical Addressing Extensions) PAE to support 32-bit
    3.96 -  servers with more than 4GB physical memory
    3.97 -\item x86/64 support (Intel EM64T, AMD Opteron)
    3.98 -\item Intel VT-x support to enable the running of unmodified guest
    3.99 -operating systems (Windows XP/2003, Legacy Linux)
   3.100 -\item Enhanced control tools
   3.101 -\item Improved ACPI support
   3.102 -\item AGP/DRM graphics
   3.103 +\item IO Emulation (stub domains) for HVM IO performance and scalability
   3.104 +\item Replacement of Intel VT vmxassist by new 16b emulation code
   3.105 +\item Improved VT-d device pass-through e.g. for graphics devices
   3.106 +\item Enhanced C and P state power management
   3.107 +\item Exploitation of multi-queue support on modern NICs
   3.108 +\item Removal of domain lock for improved PV guest scalability
   3.109 +\item 2MB page support for HVM and PV guests
   3.110 +\item CPU Portability
   3.111  \end{itemize}
   3.112  
   3.113 -
   3.114 -Xen 3.0 features greatly enhanced hardware support, configuration
   3.115 -flexibility, usability and a larger complement of supported operating
   3.116 -systems.  This latest release takes Xen a step closer to being the 
   3.117 -definitive open source solution for virtualization.
   3.118 +Xen 3.3 delivers the capabilities needed by enterprise customers and gives computing industry leaders a solid, secure platform to build upon for their virtualization solutions. This latest release establishes Xen as the definitive open source solution for virtualization.
   3.119  
   3.120  
   3.121  
   3.122 @@ -295,7 +283,7 @@ definitive open source solution for virt
   3.123  The Xen distribution includes three main components: Xen itself, ports
   3.124  of Linux and NetBSD to run on Xen, and the userspace tools required to
   3.125  manage a Xen-based system. This chapter describes how to install the
   3.126 -Xen~3.0 distribution from source. Alternatively, there may be pre-built
   3.127 +Xen~3.3 distribution from source. Alternatively, there may be pre-built
   3.128  packages available as part of your operating system distribution.
   3.129  
   3.130  
   3.131 @@ -4029,9 +4017,8 @@ files: \path{Config.mk} and \path{Makefi
   3.132  
   3.133  The former allows the overall build target architecture to be 
   3.134  specified. You will typically not need to modify this unless 
   3.135 -you are cross-compiling or if you wish to build a non-PAE  
   3.136 -Xen system. Additional configuration options are documented 
   3.137 -in the \path{Config.mk} file. 
   3.138 +you are cross-compiling. Additional configuration options are
   3.139 +documented in the \path{Config.mk} file. 
   3.140  
   3.141  The top-level \path{Makefile} is chiefly used to customize the set of
   3.142  kernels built. Look for the line: 
     4.1 --- a/docs/xen-api/xenapi-datamodel-graph.dot	Tue Sep 02 16:34:53 2008 -0700
     4.2 +++ b/docs/xen-api/xenapi-datamodel-graph.dot	Tue Sep 02 16:55:55 2008 -0700
     4.3 @@ -14,7 +14,7 @@ fontname="Verdana";
     4.4  
     4.5  node [ shape=box ]; session VM host network VIF PIF SR VDI VBD PBD user XSPolicy ACMPolicy;
     4.6  node [shape=ellipse]; PIF_metrics VIF_metrics VM_metrics VBD_metrics PBD_metrics VM_guest_metrics host_metrics;
     4.7 -node [shape=box]; DPCI PPCI host_cpu console
     4.8 +node [shape=box]; DPCI PPCI host_cpu console VTPM
     4.9  session -> host [ arrowhead="none" ]
    4.10  session -> user [ arrowhead="none" ]
    4.11  VM -> VM_metrics [ arrowhead="none" ]
     5.1 --- a/extras/mini-os/include/posix/dirent.h	Tue Sep 02 16:34:53 2008 -0700
     5.2 +++ b/extras/mini-os/include/posix/dirent.h	Tue Sep 02 16:55:55 2008 -0700
     5.3 @@ -1,7 +1,7 @@
     5.4  #ifndef _POSIX_DIRENT_H
     5.5  #define _POSIX_DIRENT_H
     5.6  
     5.7 -#include <sys/types.h>
     5.8 +#include <stdint.h>
     5.9  
    5.10  struct dirent {
    5.11          char *d_name;
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/extras/mini-os/include/posix/err.h	Tue Sep 02 16:55:55 2008 -0700
     6.3 @@ -0,0 +1,15 @@
     6.4 +#ifndef _POSIX_ERR_H
     6.5 +#define _POSIX_ERR_H
     6.6 +
     6.7 +#include <stdarg.h>
     6.8 +
     6.9 +void err(int eval, const char *fmt, ...);
    6.10 +void errx(int eval, const char *fmt, ...);
    6.11 +void warn(const char *fmt, ...);
    6.12 +void warnx(const char *fmt, ...);
    6.13 +void verr(int eval, const char *fmt, va_list args);
    6.14 +void verrx(int eval, const char *fmt, va_list args);
    6.15 +void vwarn(const char *fmt, va_list args);
    6.16 +void vwarnx(const char *fmt, va_list args);
    6.17 +
    6.18 +#endif /* _POSIX_ERR_H */
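
Note: these declarations mirror the BSD <err.h> interface (the implementations land in extras/mini-os/lib/sys.c later in this changeset), so POSIX-style code ported into a stub domain can report errors unchanged. A minimal usage sketch with a hypothetical caller and file name:

    #include <err.h>
    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("vtpm.conf", "r");   /* hypothetical file name */
        if (f == NULL)
            err(1, "fopen vtpm.conf");       /* appends strerror(errno), exits with status 1 */
        warnx("config loaded");              /* message only: no errno, no exit */
        fclose(f);
        return 0;
    }
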
     7.1 --- a/extras/mini-os/include/posix/pthread.h	Tue Sep 02 16:34:53 2008 -0700
     7.2 +++ b/extras/mini-os/include/posix/pthread.h	Tue Sep 02 16:55:55 2008 -0700
     7.3 @@ -31,8 +31,15 @@ static inline int pthread_key_delete(pth
     7.4  
     7.5  
     7.6  
     7.7 +typedef struct {} pthread_mutexattr_t;
     7.8 +static inline int pthread_mutexattr_init(pthread_mutexattr_t *mattr) { return 0; }
     7.9 +#define PTHREAD_MUTEX_NORMAL 0
    7.10 +#define PTHREAD_MUTEX_RECURSIVE 1
    7.11 +static inline int pthread_mutexattr_settype(pthread_mutexattr_t *mattr, int kind) { return 0; }
    7.12 +static inline int pthread_mutexattr_destroy(pthread_mutexattr_t *mattr) { return 0; }
    7.13  typedef struct {} pthread_mutex_t;
    7.14  #define PTHREAD_MUTEX_INITIALIZER {}
    7.15 +static inline int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *mattr) { return 0; }
    7.16  static inline int pthread_mutex_lock(pthread_mutex_t *mutex) { return 0; }
    7.17  static inline int pthread_mutex_unlock(pthread_mutex_t *mutex) { return 0; }
    7.18  
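
Note: Mini-OS runs its application single-threaded, so these new attribute stubs exist only to satisfy callers (such as ioemu) that set up recursive mutexes; every call returns 0 without doing anything. A sketch of the calling pattern this enables, assuming nothing beyond the header above:

    #include <pthread.h>

    /* Hypothetical caller: create a recursive mutex the portable way.
     * Under Mini-OS all four calls below are no-ops returning success. */
    static pthread_mutex_t lock;

    static void init_lock(void)
    {
        pthread_mutexattr_t attr;
        pthread_mutexattr_init(&attr);
        pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
        pthread_mutex_init(&lock, &attr);
        pthread_mutexattr_destroy(&attr);
    }
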
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/extras/mini-os/include/posix/syslog.h	Tue Sep 02 16:55:55 2008 -0700
     8.3 @@ -0,0 +1,37 @@
     8.4 +#ifndef _POSIX_SYSLOG_H
     8.5 +#define _POSIX_SYSLOG_H
     8.6 +
     8.7 +#include <stdarg.h>
     8.8 +
     8.9 +#define LOG_PID 0
    8.10 +#define LOG_CONS 0
    8.11 +#define LOG_NDELAY 0
    8.12 +#define LOG_ODELAY 0
    8.13 +#define LOG_NOWAIT 0
    8.14 +
    8.15 +#define LOG_KERN 0
    8.16 +#define LOG_USER 0
    8.17 +#define LOG_MAIL 0
    8.18 +#define LOG_NEWS 0
    8.19 +#define LOG_UUCP 0
    8.20 +#define LOG_DAEMON 0
    8.21 +#define LOG_AUTH 0
    8.22 +#define LOG_CRON 0
    8.23 +#define LOG_LPR 0
    8.24 +
    8.25 +/* TODO: support */
    8.26 +#define LOG_EMERG 0
    8.27 +#define LOG_ALERT 1
    8.28 +#define LOG_CRIT 2
    8.29 +#define LOG_ERR 3
    8.30 +#define LOG_WARNING 4
    8.31 +#define LOG_NOTICE 5
    8.32 +#define LOG_INFO 6
    8.33 +#define LOG_DEBUG 7
    8.34 +
    8.35 +void openlog(const char *ident, int option, int facility);
    8.36 +void syslog(int priority, const char *format, ...);
    8.37 +void closelog(void);
    8.38 +void vsyslog(int priority, const char *format, va_list ap);
    8.39 +
    8.40 +#endif /* _POSIX_SYSLOG_H */
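
Note: every facility and option constant is defined to 0 because there is no syslog daemon in a stub domain; the implementation added to extras/mini-os/lib/sys.c below simply prefixes messages with the openlog() ident and prints them via printk, ignoring the priority. A hypothetical call sequence:

    openlog("qemu-dm", LOG_PID | LOG_CONS, LOG_DAEMON);  /* options accepted, ignored */
    syslog(LOG_INFO, "disk %s attached", "hda");         /* prints "qemu-dm: disk hda attached" */
    closelog();
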
     9.1 --- a/extras/mini-os/include/xenbus.h	Tue Sep 02 16:34:53 2008 -0700
     9.2 +++ b/extras/mini-os/include/xenbus.h	Tue Sep 02 16:55:55 2008 -0700
     9.3 @@ -83,12 +83,13 @@ char *xenbus_transaction_end(xenbus_tran
     9.4  			     int *retry);
     9.5  
     9.6  /* Read path and parse it as an integer.  Returns -1 on error. */
     9.7 -int xenbus_read_integer(char *path);
     9.8 +int xenbus_read_integer(const char *path);
     9.9  
    9.10  /* Contraction of snprintf and xenbus_write(path/node). */
    9.11  char* xenbus_printf(xenbus_transaction_t xbt,
    9.12 -                                  char* node, char* path,
    9.13 -                                  char* fmt, ...);
    9.14 +                                  const char* node, const char* path,
    9.15 +                                  const char* fmt, ...)
    9.16 +                   __attribute__((__format__(printf, 4, 5)));
    9.17  
    9.18  /* Reset the XenBus system. */
    9.19  void fini_xenbus(void);
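
Note: besides constifying the string parameters, the new __attribute__((__format__(printf, 4, 5))) tells gcc that argument 4 is a printf-style format string consumed by the varargs starting at argument 5, so mismatches are caught at compile time. An illustrative fragment, assuming Mini-OS's XBT_NIL nil-transaction handle and a hypothetical node:

    const char *nodename = "device/vbd/768";   /* hypothetical */
    unsigned int state = 4;

    /* Fine: %u matches the unsigned int argument. */
    xenbus_printf(XBT_NIL, nodename, "state", "%u", state);

    /* gcc -Wformat now warns here: '%s' expects char *, got unsigned int. */
    xenbus_printf(XBT_NIL, nodename, "state", "%s", state);
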
    10.1 --- a/extras/mini-os/lib/sys.c	Tue Sep 02 16:34:53 2008 -0700
    10.2 +++ b/extras/mini-os/lib/sys.c	Tue Sep 02 16:55:55 2008 -0700
    10.3 @@ -1007,6 +1007,96 @@ LWIP_STUB(ssize_t, sendto, (int s, void 
    10.4  LWIP_STUB(int, getsockname, (int s, struct sockaddr *name, socklen_t *namelen), (s, name, namelen))
    10.5  #endif
    10.6  
    10.7 +static char *syslog_ident;
    10.8 +void openlog(const char *ident, int option, int facility)
    10.9 +{
   10.10 +    if (syslog_ident)
   10.11 +        free(syslog_ident);
   10.12 +    syslog_ident = strdup(ident);
   10.13 +}
   10.14 +
   10.15 +void vsyslog(int priority, const char *format, va_list ap)
   10.16 +{
   10.17 +    printk("%s: ", syslog_ident);
   10.18 +    print(0, format, ap);
   10.19 +}
   10.20 +
   10.21 +void syslog(int priority, const char *format, ...)
   10.22 +{
   10.23 +    va_list ap;
   10.24 +    va_start(ap, format);
   10.25 +    vsyslog(priority, format, ap);
   10.26 +    va_end(ap);
   10.27 +}
   10.28 +
   10.29 +void closelog(void)
   10.30 +{
   10.31 +    free(syslog_ident);
   10.32 +    syslog_ident = NULL;
   10.33 +}
   10.34 +
   10.35 +void vwarn(const char *format, va_list ap)
   10.36 +{
   10.37 +    int the_errno = errno;
   10.38 +    printk("stubdom: ");
   10.39 +    if (format) {
   10.40 +        print(0, format, ap);
   10.41 +        printk(", ");
   10.42 +    }
   10.43 +    printk("%s", strerror(the_errno));
   10.44 +}
   10.45 +
   10.46 +void warn(const char *format, ...)
   10.47 +{
   10.48 +    va_list ap;
   10.49 +    va_start(ap, format);
   10.50 +    vwarn(format, ap);
   10.51 +    va_end(ap);
   10.52 +}
   10.53 +
   10.54 +void verr(int eval, const char *format, va_list ap)
   10.55 +{
   10.56 +    vwarn(format, ap);
   10.57 +    exit(eval);
   10.58 +}
   10.59 +
   10.60 +void err(int eval, const char *format, ...)
   10.61 +{
   10.62 +    va_list ap;
   10.63 +    va_start(ap, format);
   10.64 +    verr(eval, format, ap);
   10.65 +    va_end(ap);
   10.66 +}
   10.67 +
   10.68 +void vwarnx(const char *format, va_list ap)
   10.69 +{
   10.70 +    printk("stubdom: ");
   10.71 +    if (format)
   10.72 +        print(0, format, ap);
   10.73 +}
   10.74 +
   10.75 +void warnx(const char *format, ...)
   10.76 +{
   10.77 +    va_list ap;
   10.78 +    va_start(ap, format);
   10.79 +    vwarnx(format, ap);
   10.80 +    va_end(ap);
   10.81 +}
   10.82 +
   10.83 +void verrx(int eval, const char *format, va_list ap)
   10.84 +{
   10.85 +    vwarnx(format, ap);
   10.86 +    exit(eval);
   10.87 +}
   10.88 +
   10.89 +void errx(int eval, const char *format, ...)
   10.90 +{
   10.91 +    va_list ap;
   10.92 +    va_start(ap, format);
   10.93 +    verrx(eval, format, ap);
   10.94 +    va_end(ap);
   10.95 +}
   10.96 +
   10.97  int nanosleep(const struct timespec *req, struct timespec *rem)
   10.98  {
   10.99      s_time_t start = NOW();
  10.100 @@ -1115,34 +1205,47 @@ void *mmap(void *start, size_t length, i
  10.101      } else ASSERT(0);
  10.102  }
  10.103  
  10.104 +#define UNMAP_BATCH ((STACK_SIZE / 2) / sizeof(multicall_entry_t))
  10.105  int munmap(void *start, size_t length)
  10.106  {
  10.107 -    int i, n = length / PAGE_SIZE;
  10.108 -    multicall_entry_t call[n];
  10.109 -    unsigned char (*data)[PAGE_SIZE] = start;
  10.110 -    int ret;
  10.111 +    int total = length / PAGE_SIZE;
  10.112      ASSERT(!((unsigned long)start & ~PAGE_MASK));
  10.113 -    ASSERT(!(length & ~PAGE_MASK));
  10.114 +    while (total) {
  10.115 +        int n = UNMAP_BATCH;
  10.116 +        if (n > total)
  10.117 +            n = total;
  10.118 +        {
  10.119 +            int i;
  10.120 +            multicall_entry_t call[n];
  10.121 +            unsigned char (*data)[PAGE_SIZE] = start;
  10.122 +            int ret;
  10.123  
  10.124 -    for (i = 0; i < n; i++) {
  10.125 -	call[i].op = __HYPERVISOR_update_va_mapping;
  10.126 -	call[i].args[0] = (unsigned long) &data[i];
  10.127 -	call[i].args[1] = 0;
  10.128 -	call[i].args[2] = 0;
  10.129 -	call[i].args[3] = UVMF_INVLPG;
  10.130 -    }
  10.131 +            for (i = 0; i < n; i++) {
  10.132 +                int arg = 0;
  10.133 +                call[i].op = __HYPERVISOR_update_va_mapping;
  10.134 +                call[i].args[arg++] = (unsigned long) &data[i];
  10.135 +                call[i].args[arg++] = 0;
  10.136 +#ifdef __i386__
  10.137 +                call[i].args[arg++] = 0;
  10.138 +#endif
  10.139 +                call[i].args[arg++] = UVMF_INVLPG;
  10.140 +            }
  10.141  
  10.142 -    ret = HYPERVISOR_multicall(call, n);
  10.143 -    if (ret) {
  10.144 -	errno = -ret;
  10.145 -	return -1;
  10.146 -    }
  10.147 +            ret = HYPERVISOR_multicall(call, n);
  10.148 +            if (ret) {
  10.149 +                errno = -ret;
  10.150 +                return -1;
  10.151 +            }
  10.152  
  10.153 -    for (i = 0; i < n; i++) {
  10.154 -	if (call[i].result) {
  10.155 -	    errno = call[i].result;
  10.156 -	    return -1;
  10.157 -	}
  10.158 +            for (i = 0; i < n; i++) {
  10.159 +                if (call[i].result) {
  10.160 +                    errno = call[i].result;
  10.161 +                    return -1;
  10.162 +                }
  10.163 +            }
  10.164 +        }
  10.165 +        start = (char *)start + n * PAGE_SIZE;
  10.166 +        total -= n;
  10.167      }
  10.168      return 0;
  10.169  }
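
Note: the multicall array in the rewritten munmap() is a variable-length array on the current stack, which is why the batch size is capped at half the stack. Illustrative arithmetic under assumed sizes, not taken from the changeset:

    /* Assume an 8kB stack and a 64-byte multicall_entry_t
     * (x86_64: op + result + args[6], 8 bytes each):
     *
     *   UNMAP_BATCH = (8192 / 2) / 64 = 64 pages per batch
     *
     * Unmapping 1MB (256 pages) then issues 4 HYPERVISOR_multicall
     * hypercalls, instead of building one 16kB VLA that could
     * overflow the stack as the old single-shot code did.
     */
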
    11.1 --- a/extras/mini-os/main.c	Tue Sep 02 16:34:53 2008 -0700
    11.2 +++ b/extras/mini-os/main.c	Tue Sep 02 16:55:55 2008 -0700
    11.3 @@ -42,7 +42,7 @@ void _fini(void)
    11.4  extern char __app_bss_start, __app_bss_end;
    11.5  static void call_main(void *p)
    11.6  {
    11.7 -    char *c;
    11.8 +    char *c, quote;
    11.9  #ifdef CONFIG_QEMU
   11.10      char *domargs, *msg;
   11.11  #endif
   11.12 @@ -101,32 +101,53 @@ static void call_main(void *p)
   11.13  
   11.14      argc = 1;
   11.15  
   11.16 -#define PARSE_ARGS(ARGS,START,END) \
   11.17 +#define PARSE_ARGS(ARGS,START,QUOTE,END) \
   11.18      c = ARGS; \
   11.19 +    quote = 0; \
   11.20      while (*c) { \
   11.21  	if (*c != ' ') { \
   11.22  	    START; \
   11.23 -	    while (*c && *c != ' ') \
   11.24 +	    while (*c) { \
   11.25 +		if (quote) { \
   11.26 +		    if (*c == quote) { \
   11.27 +			quote = 0; \
   11.28 +			QUOTE; \
   11.29 +			continue; \
   11.30 +		    } \
   11.31 +		} else if (*c == ' ') \
   11.32 +		    break; \
   11.33 +		if (*c == '"' || *c == '\'') { \
   11.34 +		    quote = *c; \
   11.35 +		    QUOTE; \
   11.36 +		    continue; \
   11.37 +		} \
   11.38  		c++; \
   11.39 +	    } \
   11.40  	} else { \
   11.41              END; \
   11.42  	    while (*c == ' ') \
   11.43  		c++; \
   11.44  	} \
   11.45 +    } \
   11.46 +    if (quote) {\
   11.47 +	printk("Warning: unterminated quotation %c\n", quote); \
   11.48 +	quote = 0; \
   11.49      }
   11.50 +#define PARSE_ARGS_COUNT(ARGS) PARSE_ARGS(ARGS, argc++, c++, )
   11.51 +#define PARSE_ARGS_STORE(ARGS) PARSE_ARGS(ARGS, argv[argc++] = c, memmove(c, c + 1, strlen(c + 1) + 1), *c++ = 0)
   11.52  
   11.53 -    PARSE_ARGS((char*)start_info.cmd_line, argc++, );
   11.54 +    PARSE_ARGS_COUNT((char*)start_info.cmd_line);
   11.55  #ifdef CONFIG_QEMU
   11.56 -    PARSE_ARGS(domargs, argc++, );
   11.57 +    PARSE_ARGS_COUNT(domargs);
   11.58  #endif
   11.59  
   11.60      argv = alloca((argc + 1) * sizeof(char *));
   11.61      argv[0] = "main";
   11.62      argc = 1;
   11.63  
   11.64 -    PARSE_ARGS((char*)start_info.cmd_line, argv[argc++] = c, *c++ = 0)
   11.65 +    PARSE_ARGS_STORE((char*)start_info.cmd_line)
   11.66  #ifdef CONFIG_QEMU
   11.67 -    PARSE_ARGS(domargs, argv[argc++] = c, *c++ = 0)
   11.68 +    PARSE_ARGS_STORE(domargs)
   11.69  #endif
   11.70  
   11.71      argv[argc] = NULL;
    12.1 --- a/extras/mini-os/xenbus/xenbus.c	Tue Sep 02 16:34:53 2008 -0700
    12.2 +++ b/extras/mini-os/xenbus/xenbus.c	Tue Sep 02 16:55:55 2008 -0700
    12.3 @@ -633,7 +633,7 @@ xenbus_transaction_end(xenbus_transactio
    12.4      return NULL;
    12.5  }
    12.6  
    12.7 -int xenbus_read_integer(char *path)
    12.8 +int xenbus_read_integer(const char *path)
    12.9  {
   12.10      char *res, *buf;
   12.11      int t;
   12.12 @@ -650,8 +650,8 @@ int xenbus_read_integer(char *path)
   12.13  }
   12.14  
   12.15  char* xenbus_printf(xenbus_transaction_t xbt,
   12.16 -                                  char* node, char* path,
   12.17 -                                  char* fmt, ...)
   12.18 +                                  const char* node, const char* path,
   12.19 +                                  const char* fmt, ...)
   12.20  {
   12.21  #define BUFFER_SIZE 256
   12.22      char fullpath[BUFFER_SIZE];
    13.1 --- a/stubdom/Makefile	Tue Sep 02 16:34:53 2008 -0700
    13.2 +++ b/stubdom/Makefile	Tue Sep 02 16:55:55 2008 -0700
    13.3 @@ -9,7 +9,7 @@ include $(XEN_ROOT)/Config.mk
    13.4  
    13.5  override CONFIG_QEMU=ioemu
    13.6  
    13.7 -IOEMU_OPTIONS=--disable-sdl --disable-opengl --disable-gfx-check --disable-vnc-tls --disable-brlapi --disable-kqemu
    13.8 +IOEMU_OPTIONS=--disable-sdl --disable-opengl --disable-vnc-tls --disable-brlapi --disable-kqemu
    13.9  ZLIB_URL?=http://www.zlib.net
   13.10  ZLIB_VERSION=1.2.3
   13.11  LIBPCI_URL?=http://www.kernel.org/pub/software/utils/pciutils
   13.12 @@ -53,7 +53,7 @@ TARGET_CFLAGS += $(call cc-option,$(CC),
   13.13  TARGET_CFLAGS += $(call cc-option,$(CC),-fno-stack-protector-all,)
   13.14  
   13.15  # Do not use host headers and libs
   13.16 -GCC_INSTALL = $(shell gcc -print-search-dirs | sed -n -e 's/install: \(.*\)/\1/p')
   13.17 +GCC_INSTALL = $(shell LANG=C gcc -print-search-dirs | sed -n -e 's/install: \(.*\)/\1/p')
   13.18  TARGET_CPPFLAGS += -U __linux__ -U __FreeBSD__ -U __sun__
   13.19  TARGET_CPPFLAGS += -nostdinc
   13.20  TARGET_CPPFLAGS += -isystem $(CURDIR)/$(MINI_OS)/include/posix
    14.1 --- a/tools/Makefile	Tue Sep 02 16:34:53 2008 -0700
    14.2 +++ b/tools/Makefile	Tue Sep 02 16:55:55 2008 -0700
    14.3 @@ -59,8 +59,7 @@ clean distclean: subdirs-clean
    14.4  ifneq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_ARCH))
    14.5  IOEMU_CONFIGURE_CROSS ?= --cpu=$(XEN_TARGET_ARCH) \
    14.6  			 --cross-prefix=$(CROSS_COMPILE) \
    14.7 -			 --interp-prefix=$(CROSS_SYS_ROOT) \
    14.8 -			 --install=$(CURDIR)/cross-install
    14.9 +			 --interp-prefix=$(CROSS_SYS_ROOT)
   14.10  endif
   14.11  
   14.12  ioemu/config-host.mak:
    15.1 --- a/tools/Rules.mk	Tue Sep 02 16:34:53 2008 -0700
    15.2 +++ b/tools/Rules.mk	Tue Sep 02 16:55:55 2008 -0700
    15.3 @@ -5,6 +5,9 @@ all:
    15.4  
    15.5  include $(XEN_ROOT)/Config.mk
    15.6  
    15.7 +export _INSTALL := $(INSTALL)
    15.8 +INSTALL = $(XEN_ROOT)/tools/cross-install
    15.9 +
   15.10  XEN_INCLUDE        = $(XEN_ROOT)/tools/include
   15.11  XEN_XC             = $(XEN_ROOT)/tools/python/xen/lowlevel/xc
   15.12  XEN_LIBXC          = $(XEN_ROOT)/tools/libxc
    16.1 --- a/tools/cross-install	Tue Sep 02 16:34:53 2008 -0700
    16.2 +++ b/tools/cross-install	Tue Sep 02 16:55:55 2008 -0700
    16.3 @@ -5,4 +5,4 @@ if [ -n "$CROSS_BIN_PATH" ]; then
    16.4      PATH="$CROSS_BIN_PATH:$PATH"
    16.5  fi
    16.6  
    16.7 -exec install "$@"
    16.8 +exec $_INSTALL "$@"
    17.1 --- a/tools/examples/xmexample.pv-grub	Tue Sep 02 16:34:53 2008 -0700
    17.2 +++ b/tools/examples/xmexample.pv-grub	Tue Sep 02 16:55:55 2008 -0700
    17.3 @@ -25,7 +25,7 @@ extra = "(hd0,0)/boot/grub/menu.lst"
    17.4  # WARNING: Creating a domain with insufficient memory may cause out of
    17.5  #          memory errors. The domain needs enough memory to boot kernel
    17.6  #          and modules. Allocating less than 32MBs is not recommended.
    17.7 -memory = 64
    17.8 +memory = 128
    17.9  
   17.10  # A name for your domain. All domains must have different names.
   17.11  name = "ExampleDomain"
   17.12 @@ -119,32 +119,6 @@ disk = [ 'phy:hda1,hda1,w' ]
   17.13  #vtpm = [ 'instance=1,backend=0' ]
   17.14  
   17.15  #----------------------------------------------------------------------------
   17.16 -# Set the kernel command line for the new domain.
   17.17 -# You only need to define the IP parameters and hostname if the domain's
   17.18 -# IP config doesn't, e.g. in ifcfg-eth0 or via DHCP.
   17.19 -# You can use 'extra' to set the runlevel and custom environment
   17.20 -# variables used by custom rc scripts (e.g. VMID=, usr= ).
   17.21 -
   17.22 -# Set if you want dhcp to allocate the IP address.
   17.23 -#dhcp="dhcp"
   17.24 -# Set netmask.
   17.25 -#netmask=
   17.26 -# Set default gateway.
   17.27 -#gateway=
   17.28 -# Set the hostname.
   17.29 -#hostname= "vm%d" % vmid
   17.30 -
   17.31 -# Set root device.
   17.32 -root = "/dev/hda1 ro"
   17.33 -
   17.34 -# Root device for nfs.
   17.35 -#root = "/dev/nfs"
   17.36 -# The nfs server.
   17.37 -#nfs_server = '192.0.2.1'  
   17.38 -# Root directory on the nfs server.
   17.39 -#nfs_root   = '/full/path/to/root/directory'
   17.40 -
   17.41 -#----------------------------------------------------------------------------
   17.42  # Configure the behaviour when a domain exits.  There are three 'reasons'
   17.43  # for a domain to stop: poweroff, reboot, and crash.  For each of these you
   17.44  # may specify:
    18.1 --- a/tools/firmware/Rules.mk	Tue Sep 02 16:34:53 2008 -0700
    18.2 +++ b/tools/firmware/Rules.mk	Tue Sep 02 16:55:55 2008 -0700
    18.3 @@ -6,6 +6,10 @@ CFLAGS :=
    18.4  
    18.5  include $(XEN_ROOT)/tools/Rules.mk
    18.6  
    18.7 +ifneq ($(debug),y)
    18.8 +CFLAGS += -DNDEBUG
    18.9 +endif
   18.10 +
   18.11  CFLAGS += -Werror
   18.12  
   18.13  # Disable PIE/SSP if GCC supports them. They can break us.
    19.1 --- a/tools/firmware/hvmloader/32bitbios_support.c	Tue Sep 02 16:34:53 2008 -0700
    19.2 +++ b/tools/firmware/hvmloader/32bitbios_support.c	Tue Sep 02 16:55:55 2008 -0700
    19.3 @@ -76,7 +76,7 @@ static void relocate_32bitbios(char *elf
    19.4       */
    19.5      reloc_size = reloc_off;
    19.6      printf("%d bytes of ROMBIOS high-memory extensions:\n", reloc_size);
    19.7 -    highbiosarea = (char *)(long)e820_malloc(reloc_size);
    19.8 +    highbiosarea = (char *)(long)e820_malloc(reloc_size, 0);
    19.9      BUG_ON(highbiosarea == NULL);
   19.10      printf("  Relocating to 0x%x-0x%x ... ",
   19.11             (uint32_t)&highbiosarea[0],
    20.1 --- a/tools/firmware/hvmloader/Makefile	Tue Sep 02 16:34:53 2008 -0700
    20.2 +++ b/tools/firmware/hvmloader/Makefile	Tue Sep 02 16:55:55 2008 -0700
    20.3 @@ -30,6 +30,9 @@ CFLAGS += $(CFLAGS_include) -I.
    20.4  
    20.5  SRCS  = hvmloader.c mp_tables.c util.c smbios.c 
    20.6  SRCS += 32bitbios_support.c smp.c cacheattr.c
    20.7 +ifeq ($(debug),y)
    20.8 +SRCS += tests.c
    20.9 +endif
   20.10  OBJS  = $(patsubst %.c,%.o,$(SRCS))
   20.11  
   20.12  .PHONY: all
    21.1 --- a/tools/firmware/hvmloader/acpi/Makefile	Tue Sep 02 16:34:53 2008 -0700
    21.2 +++ b/tools/firmware/hvmloader/acpi/Makefile	Tue Sep 02 16:55:55 2008 -0700
    21.3 @@ -22,8 +22,8 @@ C_SRC = build.c dsdt.c static_tables.c
    21.4  H_SRC = $(wildcard *.h)
    21.5  OBJS  = $(patsubst %.c,%.o,$(C_SRC))
    21.6  
    21.7 -IASL_VER = acpica-unix-20060707
    21.8 -IASL_URL = http://developer.intel.com/technology/iapc/acpi/downloads/$(IASL_VER).tar.gz
    21.9 +IASL_VER = acpica-unix-20080729
   21.10 +IASL_URL = http://acpica.org/download/$(IASL_VER).tar.gz
   21.11  
   21.12  CFLAGS += -I. -I.. $(CFLAGS_include)
   21.13  
   21.14 @@ -48,7 +48,7 @@ iasl:
   21.15  	@echo "ACPI ASL compiler(iasl) is needed"
   21.16  	@echo "Download Intel ACPI CA"
   21.17  	@echo "If wget failed, please download and compile manually from"
   21.18 -	@echo "http://developer.intel.com/technology/iapc/acpi/downloads.htm"
   21.19 +	@echo "http://acpica.org/downloads/"
   21.20  	@echo 
   21.21  	wget $(IASL_URL)
   21.22  	tar xzf $(IASL_VER).tar.gz
    22.1 --- a/tools/firmware/hvmloader/acpi/build.c	Tue Sep 02 16:34:53 2008 -0700
    22.2 +++ b/tools/firmware/hvmloader/acpi/build.c	Tue Sep 02 16:55:55 2008 -0700
    22.3 @@ -233,7 +233,7 @@ static int construct_secondary_tables(ui
    22.4          tcpa->header.oem_revision = ACPI_OEM_REVISION;
    22.5          tcpa->header.creator_id   = ACPI_CREATOR_ID;
    22.6          tcpa->header.creator_revision = ACPI_CREATOR_REVISION;
    22.7 -        tcpa->lasa = e820_malloc(ACPI_2_0_TCPA_LAML_SIZE);
    22.8 +        tcpa->lasa = e820_malloc(ACPI_2_0_TCPA_LAML_SIZE, 0);
    22.9          if ( tcpa->lasa )
   22.10          {
   22.11              tcpa->laml = ACPI_2_0_TCPA_LAML_SIZE;
   22.12 @@ -363,7 +363,7 @@ void acpi_build_tables(void)
   22.13      memset(buf, 0, high_sz);
   22.14  
   22.15      /* Allocate data area and set up ACPI tables there. */
   22.16 -    buf = (uint8_t *)e820_malloc(high_sz);
   22.17 +    buf = (uint8_t *)e820_malloc(high_sz, 0);
   22.18      __acpi_build_tables(buf, &low_sz, &high_sz);
   22.19  
   22.20      printf(" - Lo data: %08lx-%08lx\n"
    23.1 --- a/tools/firmware/hvmloader/hvmloader.c	Tue Sep 02 16:34:53 2008 -0700
    23.2 +++ b/tools/firmware/hvmloader/hvmloader.c	Tue Sep 02 16:55:55 2008 -0700
    23.3 @@ -243,6 +243,13 @@ static void pci_setup(void)
    23.4              bars[i].bar_sz  = bar_sz;
    23.5  
    23.6              nr_bars++;
    23.7 +
    23.8 +            /* Skip the upper-half of the address for a 64-bit BAR. */
    23.9 +            if ( (bar_data & (PCI_BASE_ADDRESS_SPACE |
   23.10 +                              PCI_BASE_ADDRESS_MEM_TYPE_MASK)) == 
   23.11 +                 (PCI_BASE_ADDRESS_SPACE_MEMORY | 
   23.12 +                  PCI_BASE_ADDRESS_MEM_TYPE_64) )
   23.13 +                bar++;
   23.14          }
   23.15  
   23.16          /* Map the interrupt. */
   23.17 @@ -430,12 +437,14 @@ static void cmos_write_memory_size(void)
   23.18      cmos_outb(0x35, (uint8_t)( alt_mem >> 8));
   23.19  }
   23.20  
   23.21 -static void init_xen_platform_io_base(void)
   23.22 +static uint16_t init_xen_platform_io_base(void)
   23.23  {
   23.24      struct bios_info *bios_info = (struct bios_info *)ACPI_PHYSICAL_ADDRESS;
   23.25      uint32_t devfn, bar_data;
   23.26      uint16_t vendor_id, device_id;
   23.27  
   23.28 +    bios_info->xen_pfiob = 0;
   23.29 +
   23.30      for ( devfn = 0; devfn < 128; devfn++ )
   23.31      {
   23.32          vendor_id = pci_readw(devfn, PCI_VENDOR_ID);
   23.33 @@ -445,12 +454,16 @@ static void init_xen_platform_io_base(vo
   23.34          bar_data = pci_readl(devfn, PCI_BASE_ADDRESS_0);
   23.35          bios_info->xen_pfiob = bar_data & PCI_BASE_ADDRESS_IO_MASK;
   23.36      }
   23.37 +
   23.38 +    return bios_info->xen_pfiob;
   23.39  }
   23.40  
   23.41  int main(void)
   23.42  {
   23.43      int vgabios_sz = 0, etherboot_sz = 0, rombios_sz, smbios_sz;
   23.44      int extboot_sz = 0;
   23.45 +    uint32_t vga_ram = 0;
   23.46 +    uint16_t xen_pfiob;
   23.47  
   23.48      printf("HVM Loader\n");
   23.49  
   23.50 @@ -460,6 +473,8 @@ int main(void)
   23.51  
   23.52      smp_initialise();
   23.53  
   23.54 +    perform_tests();
   23.55 +
   23.56      printf("Writing SMBIOS tables ...\n");
   23.57      smbios_sz = hvm_write_smbios_tables();
   23.58  
   23.59 @@ -495,6 +510,12 @@ int main(void)
   23.60          break;
   23.61      }
   23.62  
   23.63 +    if ( virtual_vga != VGA_none )
   23.64 +    {
   23.65 +        vga_ram = e820_malloc(8 << 20, 4096);
   23.66 +        printf("VGA RAM at %08x\n", vga_ram);
   23.67 +    }
   23.68 +
   23.69      etherboot_sz = scan_etherboot_nic((void*)ETHERBOOT_PHYSICAL_ADDRESS);
   23.70  
   23.71      if ( must_load_extboot() )
   23.72 @@ -535,7 +556,9 @@ int main(void)
   23.73                 ROMBIOS_PHYSICAL_ADDRESS,
   23.74                 ROMBIOS_PHYSICAL_ADDRESS + rombios_sz - 1);
   23.75  
   23.76 -    init_xen_platform_io_base();
   23.77 +    xen_pfiob = init_xen_platform_io_base();
   23.78 +    if ( xen_pfiob && vga_ram )
   23.79 +        outl(xen_pfiob + 4, vga_ram);
   23.80  
   23.81      printf("Invoking ROMBIOS ...\n");
   23.82      return 0;
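
Note on the BAR-skip hunk above: in PCI config space a 64-bit memory BAR occupies two consecutive 32-bit BAR registers, and the low dword carries the type bits (bit 0 = I/O vs. memory, bits 2:1 = memory type, where 10b means 64-bit). A standalone sketch of the same decode, with a hypothetical raw value:

    uint32_t bar_data = 0xf0000004;   /* hypothetical 64-bit memory BAR, low dword */
    unsigned int bar = 0;

    /* bit 0 clear -> memory space; bits 2:1 == 10b -> 64-bit type,
     * so the next BAR register holds address bits 63:32 and the
     * scan must advance one extra slot, exactly as pci_setup() now does. */
    if ((bar_data & 0x7) == 0x4)
        bar++;
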
    24.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    24.2 +++ b/tools/firmware/hvmloader/tests.c	Tue Sep 02 16:55:55 2008 -0700
    24.3 @@ -0,0 +1,164 @@
    24.4 +/*
    24.5 + * tests.c: HVM environment tests.
    24.6 + *
    24.7 + * Copyright (c) 2008, Citrix Systems, Inc.
    24.8 + * 
    24.9 + * Authors:
   24.10 + *    Keir Fraser <keir.fraser@citrix.com>
   24.11 + * 
   24.12 + * This program is free software; you can redistribute it and/or modify it
   24.13 + * under the terms and conditions of the GNU General Public License,
   24.14 + * version 2, as published by the Free Software Foundation.
   24.15 + *
   24.16 + * This program is distributed in the hope it will be useful, but WITHOUT
   24.17 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   24.18 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   24.19 + * more details.
   24.20 + *
   24.21 + * You should have received a copy of the GNU General Public License along with
   24.22 + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
   24.23 + * Place - Suite 330, Boston, MA 02111-1307 USA.
   24.24 + */
   24.25 +
   24.26 +#include "util.h"
   24.27 +
   24.28 +/*
   24.29 + * Memory layout during tests:
   24.30 + *  4MB to 8MB is cleared.
   24.31 + *  Page directory resides at 8MB.
   24.32 + *  4 page table pages reside at 8MB+4kB to 8MB+20kB.
   24.33 + *  Pagetables identity-map 0-16MB, except 4kB at va 6MB maps to pa 5MB.
   24.34 + */
   24.35 +#define PD_START (8ul << 20)
   24.36 +#define PT_START (PD_START + 4096)
   24.37 +
   24.38 +static void setup_paging(void)
   24.39 +{
   24.40 +    uint32_t *pd = (uint32_t *)PD_START;
   24.41 +    uint32_t *pt = (uint32_t *)PT_START;
   24.42 +    uint32_t i;
   24.43 +
   24.44 +    /* Identity map 0-16MB. */
   24.45 +    for ( i = 0; i < 4; i++ )
   24.46 +        pd[i] = (unsigned long)pt + (i<<12) + 3;
   24.47 +    for ( i = 0; i < (4*1024); i++ )
   24.48 +        pt[i] = (i << 12) + 3;
   24.49 +
   24.50 +    /* Page at virtual 6MB maps to physical 5MB. */
   24.51 +    pt[6u<<8] -= 0x100000u;
   24.52 +}
   24.53 +
   24.54 +static void start_paging(void)
   24.55 +{
   24.56 +    asm volatile (
   24.57 +        "mov %%eax,%%cr3; mov %%cr0,%%eax; "
   24.58 +        "orl $0x80000000,%%eax; mov %%eax,%%cr0; "
   24.59 +        "jmp 1f; 1:"
   24.60 +        : : "a" (PD_START) : "memory" );
   24.61 +}
   24.62 +
   24.63 +static void stop_paging(void)
   24.64 +{
   24.65 +    asm volatile (
   24.66 +        "mov %%cr0,%%eax; andl $0x7fffffff,%%eax; mov %%eax,%%cr0; "
   24.67 +        "jmp 1f; 1:"
   24.68 +        : : : "eax", "memory" );
   24.69 +}
   24.70 +
   24.71 +/*
   24.72 + * rep_io_test: Tests REP INSB both forwards and backwards (EF.DF={0,1}) across
   24.73 + * a discontiguous page boundary.
   24.74 + */
   24.75 +static int rep_io_test(void)
   24.76 +{
   24.77 +    uint32_t *p;
   24.78 +    uint32_t i, p0, p1, p2;
   24.79 +    int okay = 1;
   24.80 +
   24.81 +    static const struct {
   24.82 +        unsigned long addr;
   24.83 +        uint32_t expected;
   24.84 +    } check[] = {
   24.85 +        { 0x00500000, 0x987654ff },
   24.86 +        { 0x00500ffc, 0xff000000 },
   24.87 +        { 0x005ffffc, 0xff000000 },
   24.88 +        { 0x00601000, 0x000000ff },
   24.89 +        { 0, 0 }
   24.90 +    };
   24.91 +
   24.92 +    start_paging();
   24.93 +
   24.94 +    /* Phys 5MB = 0xdeadbeef */
   24.95 +    *(uint32_t *)0x500000ul = 0xdeadbeef;
   24.96 +
   24.97 +    /* Phys 5MB = 0x98765432 */
   24.98 +    *(uint32_t *)0x600000ul = 0x98765432;
   24.99 +
  24.100 +    /* Phys 0x5fffff = Phys 0x500000 = 0xff (byte) */
  24.101 +    asm volatile (
  24.102 +        "rep insb"
  24.103 +        : "=d" (p0), "=c" (p1), "=D" (p2)
  24.104 +        : "0" (0x5f), "1" (2), "2" (0x5ffffful) : "memory" );
  24.105 +
  24.106 +    /* Phys 0x500fff = Phys 0x601000 = 0xff (byte) */
  24.107 +    asm volatile (
  24.108 +        "std ; rep insb ; cld"
  24.109 +        : "=d" (p0), "=c" (p1), "=D" (p2)
  24.110 +        : "0" (0x5f), "1" (2), "2" (0x601000ul) : "memory" );
  24.111 +
  24.112 +    stop_paging();
  24.113 +
  24.114 +    i = 0;
  24.115 +    for ( p = (uint32_t *)0x400000ul; p < (uint32_t *)0x700000ul; p++ )
  24.116 +    {
  24.117 +        uint32_t expected = 0;
  24.118 +        if ( check[i].addr == (unsigned long)p )
  24.119 +        {
  24.120 +            expected = check[i].expected;
  24.121 +            i++;
  24.122 +        }
  24.123 +        if ( *p != expected )
  24.124 +        {
  24.125 +            printf("Bad value at 0x%08lx: saw %08x expected %08x\n",
  24.126 +                   (unsigned long)p, *p, expected);
  24.127 +            okay = 0;
  24.128 +        }
  24.129 +    }
  24.130 +
  24.131 +    return okay;
  24.132 +}
  24.133 +
  24.134 +void perform_tests(void)
  24.135 +{
  24.136 +    int i, passed;
  24.137 +
  24.138 +    static struct {
  24.139 +        int (* const test)(void);
  24.140 +        const char *description;
  24.141 +    } tests[] = {
  24.142 +        { rep_io_test, "REP INSB across page boundaries" },
  24.143 +        { NULL, NULL }
  24.144 +    };
  24.145 +
  24.146 +    printf("Testing HVM environment:\n");
  24.147 +
  24.148 +    passed = 0;
  24.149 +    for ( i = 0; tests[i].test; i++ )
  24.150 +    {
  24.151 +        printf(" - %s ... ", tests[i].description);
  24.152 +        memset((char *)(4ul << 20), 0, 4ul << 20);
  24.153 +        setup_paging();
  24.154 +        if ( (*tests[i].test)() )
  24.155 +        {
  24.156 +            printf("passed\n");
  24.157 +            passed++;
  24.158 +        }
  24.159 +        else
  24.160 +        {
  24.161 +            printf("failed\n");
  24.162 +        }
  24.163 +    }
  24.164 +
  24.165 +    printf("Passed %d/%d tests\n", passed, i);
  24.166 +    BUG_ON(passed != i);
  24.167 +}
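
Note: the test page tables use 32-bit non-PAE entries, where an entry is a frame address ORed with flag bits (bit 0 = Present, bit 1 = Read/Write, hence the "+ 3"). The 6MB fix-up in setup_paging() works out as follows:

    /* Identity entry for virtual 6MB (flat PT index 6u<<8 == 0x600):
     *   pt[0x600] = (0x600 << 12) + 3 = 0x00600003
     * After pt[6u<<8] -= 0x100000:
     *   pt[0x600] = 0x00500003
     * so virtual 0x600000 maps physical 0x500000, giving rep_io_test()
     * its discontiguous page boundary at virtual 0x600000.
     */
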
    25.1 --- a/tools/firmware/hvmloader/util.c	Tue Sep 02 16:34:53 2008 -0700
    25.2 +++ b/tools/firmware/hvmloader/util.c	Tue Sep 02 16:55:55 2008 -0700
    25.3 @@ -325,35 +325,34 @@ static void e820_collapse(void)
    25.4      }
    25.5  }
    25.6  
    25.7 -uint32_t e820_malloc(uint32_t size)
    25.8 +uint32_t e820_malloc(uint32_t size, uint32_t align)
    25.9  {
   25.10      uint32_t addr;
   25.11      int i;
   25.12      struct e820entry *ent = (struct e820entry *)HVM_E820;
   25.13  
   25.14 -    /* Align allocation request to a reasonable boundary (1kB). */
   25.15 -    size = (size + 1023) & ~1023;
   25.16 +    /* Align to at least one kilobyte. */
   25.17 +    if ( align < 1024 )
   25.18 +        align = 1024;
   25.19  
   25.20      for ( i = *HVM_E820_NR - 1; i >= 0; i-- )
   25.21      {
   25.22 -        addr = ent[i].addr;
   25.23 +        addr = (ent[i].addr + ent[i].size - size) & ~(align-1);
   25.24          if ( (ent[i].type != E820_RAM) || /* not ram? */
   25.25 -             (ent[i].size < size) ||      /* too small? */
   25.26 -             (addr != ent[i].addr) ||     /* starts above 4gb? */
   25.27 +             (addr < ent[i].addr) ||      /* too small or starts above 4gb? */
   25.28               ((addr + size) < addr) )     /* ends above 4gb? */
   25.29              continue;
   25.30  
   25.31 -        if ( ent[i].size != size )
   25.32 +        if ( addr != ent[i].addr )
   25.33          {
   25.34              memmove(&ent[i+1], &ent[i], (*HVM_E820_NR-i) * sizeof(*ent));
   25.35              (*HVM_E820_NR)++;
   25.36 -            ent[i].size -= size;
   25.37 -            addr += ent[i].size;
   25.38 +            ent[i].size = addr - ent[i].addr;
   25.39 +            ent[i+1].addr = addr;
   25.40 +            ent[i+1].size -= ent[i].size;
   25.41              i++;
   25.42          }
   25.43  
   25.44 -        ent[i].addr = addr;
   25.45 -        ent[i].size = size;
   25.46          ent[i].type = E820_RESERVED;
   25.47  
   25.48          e820_collapse();
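
Note: e820_malloc() now carves the block from the top of a suitable RAM region, rounded down to the requested alignment (with a 1kB minimum), and splits the region in place. A worked example with a hypothetical E820 entry:

    /* RAM entry: addr = 0x00100000, size = 0x07f00000 (1MB to 128MB).
     * Request: e820_malloc(8 << 20, 4096).
     *
     *   addr = (0x00100000 + 0x07f00000 - 0x00800000) & ~0xfff
     *        = 0x07800000
     *
     * The entry splits in two: [0x00100000, 0x07800000) stays E820_RAM,
     * [0x07800000, 0x08000000) becomes E820_RESERVED and is returned.
     */
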
    26.1 --- a/tools/firmware/hvmloader/util.h	Tue Sep 02 16:34:53 2008 -0700
    26.2 +++ b/tools/firmware/hvmloader/util.h	Tue Sep 02 16:55:55 2008 -0700
    26.3 @@ -132,7 +132,7 @@ int printf(const char *fmt, ...) __attri
    26.4  int vprintf(const char *fmt, va_list ap);
    26.5  
    26.6  /* Reserve a RAM region in the e820 table. */
    26.7 -uint32_t e820_malloc(uint32_t size);
    26.8 +uint32_t e820_malloc(uint32_t size, uint32_t align);
    26.9  
   26.10  /* Prepare the 32bit BIOS */
   26.11  void highbios_setup(void);
   26.12 @@ -143,6 +143,12 @@ void create_mp_tables(void);
   26.13  int hvm_write_smbios_tables(void);
   26.14  void smp_initialise(void);
   26.15  
   26.16 +#ifndef NDEBUG
   26.17 +void perform_tests(void);
   26.18 +#else
   26.19 +#define perform_tests() ((void)0)
   26.20 +#endif
   26.21 +
   26.22  #define isdigit(c) ((c) >= '0' && (c) <= '9')
   26.23  
   26.24  extern char _start[], _end[];
    27.1 --- a/tools/ioemu/hw/cirrus_vga.c	Tue Sep 02 16:34:53 2008 -0700
    27.2 +++ b/tools/ioemu/hw/cirrus_vga.c	Tue Sep 02 16:55:55 2008 -0700
    27.3 @@ -2543,34 +2543,28 @@ static CPUWriteMemoryFunc *cirrus_linear
    27.4      cirrus_linear_bitblt_writel,
    27.5  };
    27.6  
    27.7 -static void *set_vram_mapping(unsigned long begin, unsigned long end)
    27.8 +static void set_vram_mapping(CirrusVGAState *s, unsigned long begin, unsigned long end)
    27.9  {
   27.10 -    xen_pfn_t *extent_start = NULL;
   27.11 -    unsigned long nr_extents;
   27.12 -    void *vram_pointer = NULL;
   27.13 -    int i;
   27.14 -
   27.15 -    /* align begin and end address */
   27.16 -    begin = begin & TARGET_PAGE_MASK;
   27.17 -    end = begin + VGA_RAM_SIZE;
   27.18 -    end = (end + TARGET_PAGE_SIZE -1 ) & TARGET_PAGE_MASK;
   27.19 -    nr_extents = (end - begin) >> TARGET_PAGE_BITS;
   27.20 -
   27.21 -    extent_start = malloc(sizeof(xen_pfn_t) * nr_extents);
   27.22 -    if (extent_start == NULL) {
   27.23 -        fprintf(stderr, "Failed malloc on set_vram_mapping\n");
   27.24 -        return NULL;
   27.25 -    }
   27.26 -
   27.27 -    memset(extent_start, 0, sizeof(xen_pfn_t) * nr_extents);
   27.28 -
   27.29 -    for (i = 0; i < nr_extents; i++)
   27.30 -        extent_start[i] = (begin + i * TARGET_PAGE_SIZE) >> TARGET_PAGE_BITS;
   27.31 -
   27.32 -    if (set_mm_mapping(xc_handle, domid, nr_extents, 0, extent_start) < 0) {
   27.33 -        fprintf(logfile, "Failed set_mm_mapping\n");
   27.34 -        free(extent_start);
   27.35 -        return NULL;
   27.36 +    unsigned long i;
   27.37 +    struct xen_add_to_physmap xatp;
   27.38 +    int rc;
   27.39 +
   27.40 +    if (end > begin + VGA_RAM_SIZE)
   27.41 +        end = begin + VGA_RAM_SIZE;
   27.42 +
   27.43 +    fprintf(logfile,"mapping vram to %lx - %lx\n", begin, end);
   27.44 +
   27.45 +    xatp.domid = domid;
   27.46 +    xatp.space = XENMAPSPACE_mfn;
   27.47 +
   27.48 +    for (i = 0; i < (end - begin) >> TARGET_PAGE_BITS; i++) {
   27.49 +        xatp.idx = s->vram_mfns[i];
   27.50 +        xatp.gpfn = (begin >> TARGET_PAGE_BITS) + i;
   27.51 +        rc = xc_memory_op(xc_handle, XENMEM_add_to_physmap, &xatp);
   27.52 +        if (rc) {
   27.53 +            fprintf(stderr, "add_to_physmap MFN %"PRI_xen_pfn" to PFN %"PRI_xen_pfn" failed: %d\n", xatp.idx, xatp.gpfn, rc);
   27.54 +            return;
   27.55 +        }
   27.56      }
   27.57  
   27.58      (void)xc_domain_pin_memory_cacheattr(
   27.59 @@ -2578,61 +2572,42 @@ static void *set_vram_mapping(unsigned l
   27.60          begin >> TARGET_PAGE_BITS,
   27.61          end >> TARGET_PAGE_BITS,
   27.62          XEN_DOMCTL_MEM_CACHEATTR_WB);
   27.63 -
   27.64 -    vram_pointer = xc_map_foreign_pages(xc_handle, domid,
   27.65 -                                        PROT_READ|PROT_WRITE,
   27.66 -                                        extent_start, nr_extents);
   27.67 -    if (vram_pointer == NULL) {
   27.68 -        fprintf(logfile, "xc_map_foreign_batch vgaram returned error %d\n",
   27.69 -                errno);
   27.70 -        free(extent_start);
   27.71 -        return NULL;
   27.72 -    }
   27.73 -
   27.74 -    memset(vram_pointer, 0, nr_extents * TARGET_PAGE_SIZE);
   27.75 -
   27.76 -#ifdef CONFIG_STUBDOM
   27.77 -    xenfb_pv_display_start(vram_pointer);
   27.78 -#endif
   27.79 -
   27.80 -    free(extent_start);
   27.81 -
   27.82 -    return vram_pointer;
   27.83  }
   27.84  
   27.85 -static int unset_vram_mapping(unsigned long begin, unsigned long end, 
   27.86 -                              void *mapping)
   27.87 +static void unset_vram_mapping(CirrusVGAState *s, unsigned long begin, unsigned long end)
   27.88  {
   27.89 -    xen_pfn_t *extent_start = NULL;
   27.90 -    unsigned long nr_extents;
   27.91 -    int i;
   27.92 -
   27.93 -    /* align begin and end address */
   27.94 -
   27.95 -    end = begin + VGA_RAM_SIZE;
   27.96 -    begin = begin & TARGET_PAGE_MASK;
   27.97 -    end = (end + TARGET_PAGE_SIZE -1 ) & TARGET_PAGE_MASK;
   27.98 -    nr_extents = (end - begin) >> TARGET_PAGE_BITS;
   27.99 -
  27.100 -    extent_start = malloc(sizeof(xen_pfn_t) * nr_extents);
  27.101 -
  27.102 -    if (extent_start == NULL) {
  27.103 -        fprintf(stderr, "Failed malloc on set_mm_mapping\n");
  27.104 -        return -1;
  27.105 +    if (s->stolen_vram_addr) {
  27.106 +        /* We can put it there for xend to save it efficiently */
  27.107 +        set_vram_mapping(s, s->stolen_vram_addr, s->stolen_vram_addr + VGA_RAM_SIZE);
  27.108 +    } else {
  27.109 +        /* Old image, we have to unmap them completely */
  27.110 +        struct xen_remove_from_physmap xrfp;
  27.111 +        unsigned long i;
  27.112 +        int rc;
  27.113 +
  27.114 +        if (end > begin + VGA_RAM_SIZE)
  27.115 +            end = begin + VGA_RAM_SIZE;
  27.116 +
  27.117 +        fprintf(logfile,"unmapping vram from %lx - %lx\n", begin, end);
  27.118 +
  27.119 +        xrfp.domid = domid;
  27.120 +
  27.121 +        for (i = 0; i < (end - begin) >> TARGET_PAGE_BITS; i++) {
  27.122 +            xrfp.gpfn = (begin >> TARGET_PAGE_BITS) + i;
  27.123 +            rc = xc_memory_op(xc_handle, XENMEM_remove_from_physmap, &xrfp);
  27.124 +            if (rc) {
  27.125 +                fprintf(stderr, "remove_from_physmap PFN %"PRI_xen_pfn" failed: %d\n", xrfp.gpfn, rc);
  27.126 +                return;
  27.127 +            }
  27.128 +        }
  27.129      }
  27.130 -
  27.131 -    /* Drop our own references to the vram pages */
  27.132 -    munmap(mapping, nr_extents * TARGET_PAGE_SIZE);
  27.133 -
  27.134 -    /* Now drop the guest's mappings */
  27.135 -    memset(extent_start, 0, sizeof(xen_pfn_t) * nr_extents);
  27.136 -    for (i = 0; i < nr_extents; i++)
  27.137 -        extent_start[i] = (begin + (i * TARGET_PAGE_SIZE)) >> TARGET_PAGE_BITS;
  27.138 -    unset_mm_mapping(xc_handle, domid, nr_extents, 0, extent_start);
  27.139 -
  27.140 -    free(extent_start);
  27.141 -
  27.142 -    return 0;
  27.143 +}
  27.144 +
  27.145 +void cirrus_restart_acc(CirrusVGAState *s)
  27.146 +{
  27.147 +    set_vram_mapping(s, s->lfb_addr, s->lfb_end);
  27.148 +    s->map_addr = s->lfb_addr;
  27.149 +    s->map_end = s->lfb_end;
  27.150  }
  27.151  
  27.152  /* Compute the memory access functions */
  27.153 @@ -2654,17 +2629,7 @@ static void cirrus_update_memory_access(
  27.154  	mode = s->gr[0x05] & 0x7;
  27.155  	if (mode < 4 || mode > 5 || ((s->gr[0x0B] & 0x4) == 0)) {
  27.156              if (s->lfb_addr && s->lfb_end && !s->map_addr) {
  27.157 -                void *vram_pointer, *old_vram;
  27.158 -
  27.159 -                vram_pointer = set_vram_mapping(s->lfb_addr,
  27.160 -                                                s->lfb_end);
  27.161 -                if (!vram_pointer)
  27.162 -                    fprintf(stderr, "NULL vram_pointer\n");
  27.163 -                else {
  27.164 -                    old_vram = vga_update_vram((VGAState *)s, vram_pointer,
  27.165 -                                               VGA_RAM_SIZE);
  27.166 -                    qemu_free(old_vram);
  27.167 -                }
  27.168 +                set_vram_mapping(s, s->lfb_addr, s->lfb_end);
  27.169                  s->map_addr = s->lfb_addr;
  27.170                  s->map_end = s->lfb_end;
  27.171              }
  27.172 @@ -2674,14 +2639,7 @@ static void cirrus_update_memory_access(
  27.173          } else {
  27.174          generic_io:
  27.175              if (s->lfb_addr && s->lfb_end && s->map_addr) {
  27.176 -                void *old_vram;
  27.177 -
  27.178 -                old_vram = vga_update_vram((VGAState *)s, NULL, VGA_RAM_SIZE);
  27.179 -
  27.180 -                unset_vram_mapping(s->lfb_addr,
  27.181 -                                   s->lfb_end, 
  27.182 -                                   old_vram);
  27.183 -
  27.184 +                unset_vram_mapping(s, s->map_addr, s->map_end);
  27.185                  s->map_addr = s->map_end = 0;
  27.186              }
  27.187              s->cirrus_linear_write[0] = cirrus_linear_writeb;
  27.188 @@ -3040,36 +2998,6 @@ static CPUWriteMemoryFunc *cirrus_mmio_w
  27.189      cirrus_mmio_writel,
  27.190  };
  27.191  
  27.192 -void cirrus_stop_acc(CirrusVGAState *s)
  27.193 -{
  27.194 -    if (s->map_addr){
  27.195 -        int error;
  27.196 -        s->map_addr = 0;
  27.197 -        error = unset_vram_mapping(s->lfb_addr,
  27.198 -                s->lfb_end, s->vram_ptr);
  27.199 -        fprintf(stderr, "cirrus_stop_acc:unset_vram_mapping.\n");
  27.200 -    }
  27.201 -}
  27.202 -
  27.203 -void cirrus_restart_acc(CirrusVGAState *s)
  27.204 -{
  27.205 -    if (s->lfb_addr && s->lfb_end) {
  27.206 -        void *vram_pointer, *old_vram;
  27.207 -        fprintf(stderr, "cirrus_vga_load:re-enable vga acc.lfb_addr=0x%lx, lfb_end=0x%lx.\n",
  27.208 -                s->lfb_addr, s->lfb_end);
  27.209 -        vram_pointer = set_vram_mapping(s->lfb_addr ,s->lfb_end);
  27.210 -        if (!vram_pointer){
  27.211 -            fprintf(stderr, "cirrus_vga_load:NULL vram_pointer\n");
  27.212 -        } else {
  27.213 -            old_vram = vga_update_vram((VGAState *)s, vram_pointer,
  27.214 -                    VGA_RAM_SIZE);
  27.215 -            qemu_free(old_vram);
  27.216 -            s->map_addr = s->lfb_addr;
  27.217 -            s->map_end = s->lfb_end;
  27.218 -        }
  27.219 -    }
  27.220 -}
  27.221 -
  27.222  /* load/save state */
  27.223  
  27.224  static void cirrus_vga_save(QEMUFile *f, void *opaque)
  27.225 @@ -3118,7 +3046,10 @@ static void cirrus_vga_save(QEMUFile *f,
  27.226      qemu_put_8s(f, &vga_acc);
  27.227      qemu_put_be64s(f, (uint64_t*)&s->lfb_addr);
  27.228      qemu_put_be64s(f, (uint64_t*)&s->lfb_end);
  27.229 -    qemu_put_buffer(f, s->vram_ptr, VGA_RAM_SIZE); 
  27.230 +    qemu_put_be64s(f, &s->stolen_vram_addr);
  27.231 +    if (!s->stolen_vram_addr && !vga_acc)
  27.232 +        /* Old guest: VRAM is not mapped, we have to save it ourselves */
  27.233 +        qemu_put_buffer(f, s->vram_ptr, VGA_RAM_SIZE);
  27.234  }
  27.235  
  27.236  static int cirrus_vga_load(QEMUFile *f, void *opaque, int version_id)
  27.237 @@ -3127,7 +3058,7 @@ static int cirrus_vga_load(QEMUFile *f, 
  27.238      uint8_t vga_acc = 0;
  27.239      int ret;
  27.240  
  27.241 -    if (version_id > 2)
  27.242 +    if (version_id > 3)
  27.243          return -EINVAL;
  27.244  
  27.245      if (s->pci_dev && version_id >= 2) {
  27.246 @@ -3173,9 +3104,20 @@ static int cirrus_vga_load(QEMUFile *f, 
  27.247      qemu_get_8s(f, &vga_acc);
  27.248      qemu_get_be64s(f, (uint64_t*)&s->lfb_addr);
  27.249      qemu_get_be64s(f, (uint64_t*)&s->lfb_end);
  27.250 -    qemu_get_buffer(f, s->vram_ptr, VGA_RAM_SIZE); 
  27.251 -    if (vga_acc){
  27.252 -        cirrus_restart_acc(s);
  27.253 +    if (version_id >= 3) {
  27.254 +        qemu_get_be64s(f, &s->stolen_vram_addr);
  27.255 +        if (!s->stolen_vram_addr && !vga_acc) {
  27.256 +            /* Old guest, VRAM is not mapped, we have to restore it ourselves */
  27.257 +            qemu_get_buffer(f, s->vram_ptr, VGA_RAM_SIZE);
  27.258 +            xen_vga_populate_vram(s->lfb_addr);
  27.259 +        } else
  27.260 +            xen_vga_vram_map(vga_acc ? s->lfb_addr : s->stolen_vram_addr, 0);
  27.261 +    } else {
  27.262 +        /* Old image, we have to populate and restore VRAM ourselves */
  27.263 +        xen_vga_populate_vram(s->lfb_addr);
  27.264 +        qemu_get_buffer(f, s->vram_ptr, VGA_RAM_SIZE); 
  27.265 +        if (vga_acc)
  27.266 +            cirrus_restart_acc(s);
  27.267      }
  27.268  
  27.269      /* force refresh */
  27.270 @@ -3297,7 +3239,7 @@ static void cirrus_init_common(CirrusVGA
  27.271      s->cursor_invalidate = cirrus_cursor_invalidate;
  27.272      s->cursor_draw_line = cirrus_cursor_draw_line;
  27.273  
  27.274 -    register_savevm("cirrus_vga", 0, 2, cirrus_vga_save, cirrus_vga_load, s);
  27.275 +    register_savevm("cirrus_vga", 0, 3, cirrus_vga_save, cirrus_vga_load, s);
  27.276  }
  27.277  
  27.278  /***************************************
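
Between the save and load hunks above, the cirrus savevm format moves from version 2 to 3: the stolen-VRAM address is appended after lfb_end, and the raw VRAM dump becomes conditional. A sketch of the resulting record layout; the struct is purely illustrative (the real code streams individual fields with qemu_put_*), only the field order and the condition come from the hunks above:

    #include <stdint.h>

    /* Illustrative layout of a version-3 "cirrus_vga" savevm record.
     * Field order mirrors cirrus_vga_save(); the struct itself is hypothetical. */
    struct cirrus_vga_record_v3 {
        /* ... common VGA/PCI state, unchanged from version 2 ... */
        uint8_t  vga_acc;            /* LFB acceleration enabled at save time */
        uint64_t lfb_addr, lfb_end;  /* guest-physical linear framebuffer window */
        uint64_t stolen_vram_addr;   /* new in v3: BIOS-allocated VRAM address */
        /* VGA_RAM_SIZE raw bytes follow only when stolen_vram_addr == 0 and
         * vga_acc == 0, i.e. an old guest whose VRAM qemu still owns. */
    };
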
    28.1 --- a/tools/ioemu/hw/ide.c	Tue Sep 02 16:34:53 2008 -0700
    28.2 +++ b/tools/ioemu/hw/ide.c	Tue Sep 02 16:55:55 2008 -0700
    28.3 @@ -1108,14 +1108,14 @@ static void ide_flush_cb(void *opaque, i
    28.4  	return;
    28.5      }
    28.6      else
    28.7 -        s->status = READY_STAT;
    28.8 +        s->status = READY_STAT | SEEK_STAT;
    28.9      ide_set_irq(s);
   28.10  }
   28.11  
   28.12  static void ide_atapi_cmd_ok(IDEState *s)
   28.13  {
   28.14      s->error = 0;
   28.15 -    s->status = READY_STAT;
   28.16 +    s->status = READY_STAT | SEEK_STAT;
   28.17      s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
   28.18      ide_set_irq(s);
   28.19  }
   28.20 @@ -1229,7 +1229,7 @@ static void ide_atapi_cmd_reply_end(IDES
   28.21      if (s->packet_transfer_size <= 0) {
   28.22          /* end of transfer */
   28.23          ide_transfer_stop(s);
   28.24 -        s->status = READY_STAT;
   28.25 +        s->status = READY_STAT | SEEK_STAT;
   28.26          s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
   28.27          ide_set_irq(s);
   28.28  #ifdef DEBUG_IDE_ATAPI
   28.29 @@ -1307,10 +1307,10 @@ static void ide_atapi_cmd_reply(IDEState
   28.30      s->io_buffer_index = 0;
   28.31  
   28.32      if (s->atapi_dma) {
   28.33 -    	s->status = READY_STAT | DRQ_STAT;
   28.34 +    	s->status = READY_STAT | SEEK_STAT | DRQ_STAT;
   28.35  	ide_dma_start(s, ide_atapi_cmd_read_dma_cb);
   28.36      } else {
   28.37 -    	s->status = READY_STAT;
   28.38 +    	s->status = READY_STAT | SEEK_STAT;
   28.39      	ide_atapi_cmd_reply_end(s);
   28.40      }
   28.41  }
   28.42 @@ -1325,7 +1325,7 @@ static void ide_atapi_cmd_read_pio(IDESt
   28.43      s->io_buffer_index = sector_size;
   28.44      s->cd_sector_size = sector_size;
   28.45  
   28.46 -    s->status = READY_STAT;
   28.47 +    s->status = READY_STAT | SEEK_STAT;
   28.48      ide_atapi_cmd_reply_end(s);
   28.49  }
   28.50  
   28.51 @@ -1368,7 +1368,7 @@ static void ide_atapi_cmd_read_dma_cb(vo
   28.52      }
   28.53  
   28.54      if (s->packet_transfer_size <= 0) {
   28.55 -        s->status = READY_STAT;
   28.56 +        s->status = READY_STAT | SEEK_STAT;
   28.57          s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
   28.58          ide_set_irq(s);
   28.59      eot:
   28.60 @@ -1418,7 +1418,7 @@ static void ide_atapi_cmd_read_dma(IDESt
   28.61      s->cd_sector_size = sector_size;
   28.62  
   28.63      /* XXX: check if BUSY_STAT should be set */
   28.64 -    s->status = READY_STAT | DRQ_STAT | BUSY_STAT;
   28.65 +    s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
   28.66      ide_dma_start(s, ide_atapi_cmd_read_dma_cb);
   28.67  }
   28.68  
   28.69 @@ -1886,7 +1886,7 @@ static void ide_ioport_write(void *opaqu
   28.70                  ide_abort_command(s);
   28.71              } else {
   28.72                  s->mult_sectors = s->nsector;
   28.73 -                s->status = READY_STAT;
   28.74 +                s->status = READY_STAT | SEEK_STAT;
   28.75              }
   28.76              ide_set_irq(s);
   28.77              break;
   28.78 @@ -1896,7 +1896,7 @@ static void ide_ioport_write(void *opaqu
   28.79          case WIN_VERIFY_ONCE:
   28.80              /* do sector number check ? */
   28.81  	    ide_cmd_lba48_transform(s, lba48);
   28.82 -            s->status = READY_STAT;
   28.83 +            s->status = READY_STAT | SEEK_STAT;
   28.84              ide_set_irq(s);
   28.85              break;
   28.86  	case WIN_READ_EXT:
   28.87 @@ -1965,12 +1965,12 @@ static void ide_ioport_write(void *opaqu
   28.88          case WIN_READ_NATIVE_MAX:
   28.89  	    ide_cmd_lba48_transform(s, lba48);
   28.90              ide_set_sector(s, s->nb_sectors - 1);
   28.91 -            s->status = READY_STAT;
   28.92 +            s->status = READY_STAT | SEEK_STAT;
   28.93              ide_set_irq(s);
   28.94              break;
   28.95          case WIN_CHECKPOWERMODE1:
   28.96              s->nsector = 0xff; /* device active or idle */
   28.97 -            s->status = READY_STAT;
   28.98 +            s->status = READY_STAT | SEEK_STAT;
   28.99              ide_set_irq(s);
  28.100              break;
  28.101          case WIN_SETFEATURES:
  28.102 @@ -2070,7 +2070,7 @@ static void ide_ioport_write(void *opaqu
  28.103              /* overlapping commands not supported */
  28.104              if (s->feature & 0x02)
  28.105                  goto abort_cmd;
  28.106 -            s->status = READY_STAT;
  28.107 +            s->status = READY_STAT | SEEK_STAT;
  28.108              s->atapi_dma = s->feature & 1;
  28.109              s->nsector = 1;
  28.110              ide_transfer_start(s, s->io_buffer, ATAPI_PACKET_SIZE, 
  28.111 @@ -2289,7 +2289,7 @@ static void ide_reset(IDEState *s)
  28.112      s->mult_sectors = MAX_MULT_SECTORS;
  28.113      s->cur_drive = s;
  28.114      s->select = 0xa0;
  28.115 -    s->status = READY_STAT;
  28.116 +    s->status = READY_STAT | SEEK_STAT;
  28.117      ide_set_signature(s);
  28.118      /* init the transfer handler so that 0xffff is returned on data
  28.119         accesses */
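
The ide.c hunks all make the same change: every path that previously reported only DRDY now also sets DSC, which real ATA devices keep asserted once spun up and which some guest drivers poll for. The macro names are the ones ide.c already uses; the values below are the standard ATA status-register bits:

    /* Standard ATA status register bits (names as used in ide.c) */
    #define ERR_STAT   0x01   /* ERR:  command completed with error */
    #define DRQ_STAT   0x08   /* DRQ:  data transfer requested */
    #define SEEK_STAT  0x10   /* DSC:  device seek complete */
    #define READY_STAT 0x40   /* DRDY: device ready */
    #define BUSY_STAT  0x80   /* BSY:  device busy */
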
    29.1 --- a/tools/ioemu/hw/pass-through.c	Tue Sep 02 16:34:53 2008 -0700
    29.2 +++ b/tools/ioemu/hw/pass-through.c	Tue Sep 02 16:55:55 2008 -0700
    29.3 @@ -2340,11 +2340,6 @@ static int pt_bar_reg_write(struct pt_de
    29.4                  return -1;
    29.5          }
    29.6  
    29.7 -        /* always keep the emulate register value to 0,
    29.8 -         * because hvmloader does not support high MMIO for now.
    29.9 -         */
   29.10 -        cfg_entry->data = 0;
   29.11 -
   29.12          /* never mapping the 'empty' upper region,
   29.13           * because we'll do it enough for the lower region.
   29.14           */
    30.1 --- a/tools/ioemu/hw/pci.c	Tue Sep 02 16:34:53 2008 -0700
    30.2 +++ b/tools/ioemu/hw/pci.c	Tue Sep 02 16:55:55 2008 -0700
    30.3 @@ -45,7 +45,6 @@ struct PCIBus {
    30.4  static void pci_update_mappings(PCIDevice *d);
    30.5  
    30.6  target_phys_addr_t pci_mem_base;
    30.7 -static int pci_irq_index;
    30.8  static PCIBus *first_bus;
    30.9  
   30.10  PCIBus *pci_register_bus(pci_set_irq_fn set_irq, pci_map_irq_fn map_irq,
   30.11 @@ -114,9 +113,6 @@ PCIDevice *pci_register_device(PCIBus *b
   30.12  {
   30.13      PCIDevice *pci_dev;
   30.14  
   30.15 -    if (pci_irq_index >= PCI_DEVICES_MAX)
   30.16 -        return NULL;
   30.17 -    
   30.18      if (devfn < 0) {
   30.19          for(devfn = bus->devfn_min ; devfn < 256; devfn += 8) {
   30.20              if ( !bus->devices[devfn] &&
   30.21 @@ -140,7 +136,6 @@ PCIDevice *pci_register_device(PCIBus *b
   30.22          config_write = pci_default_write_config;
   30.23      pci_dev->config_read = config_read;
   30.24      pci_dev->config_write = config_write;
   30.25 -    pci_dev->irq_index = pci_irq_index++;
   30.26      bus->devices[devfn] = pci_dev;
   30.27      return pci_dev;
   30.28  }
    31.1 --- a/tools/ioemu/hw/vga.c	Tue Sep 02 16:34:53 2008 -0700
    31.2 +++ b/tools/ioemu/hw/vga.c	Tue Sep 02 16:55:55 2008 -0700
    31.3 @@ -23,6 +23,7 @@
    31.4   */
    31.5  #include "vl.h"
    31.6  #include "vga_int.h"
    31.7 +#include <sys/mman.h>
    31.8  
    31.9  //#define DEBUG_VGA
   31.10  //#define DEBUG_VGA_MEM
   31.11 @@ -1776,7 +1777,10 @@ static void vga_save(QEMUFile *f, void *
   31.12  #endif
   31.13      vram_size = s->vram_size;
   31.14      qemu_put_be32s(f, &vram_size); 
   31.15 -    qemu_put_buffer(f, s->vram_ptr, s->vram_size); 
   31.16 +    qemu_put_be64s(f, &s->stolen_vram_addr);
   31.17 +    if (!s->stolen_vram_addr)
   31.18 +        /* Old guest: VRAM is not mapped, we have to save it ourselves */
   31.19 +        qemu_put_buffer(f, s->vram_ptr, VGA_RAM_SIZE);
   31.20  }
   31.21  
   31.22  static int vga_load(QEMUFile *f, void *opaque, int version_id)
   31.23 @@ -1788,7 +1792,7 @@ static int vga_load(QEMUFile *f, void *o
   31.24      int i;
   31.25  #endif
   31.26  
   31.27 -    if (version_id > 3)
   31.28 +    if (version_id > 4)
   31.29          return -EINVAL;
   31.30  
   31.31      if (s->pci_dev && version_id >= 2) {
   31.32 @@ -1839,7 +1843,14 @@ static int vga_load(QEMUFile *f, void *o
   31.33  	qemu_get_be32s(f, &vram_size);
   31.34  	if (vram_size != s->vram_size)
   31.35  	    return -EINVAL;
   31.36 -	qemu_get_buffer(f, s->vram_ptr, s->vram_size); 
   31.37 +        if (version_id >= 4) {
   31.38 +            qemu_get_be64s(f, &s->stolen_vram_addr);
   31.39 +            if (s->stolen_vram_addr)
   31.40 +                xen_vga_vram_map(s->stolen_vram_addr, 0);
   31.41 +        }
   31.42 +        /* Old guest, VRAM is not mapped, we have to restore it ourselves */
   31.43 +        if (!s->stolen_vram_addr)
   31.44 +            qemu_get_buffer(f, s->vram_ptr, s->vram_size); 
   31.45      }
   31.46  
   31.47      /* force refresh */
   31.48 @@ -1994,6 +2005,100 @@ void vga_bios_init(VGAState *s)
   31.49      /* TODO: add vbe support if enabled */
   31.50  }
   31.51  
   31.52 +
   31.53 +static VGAState *xen_vga_state;
   31.54 +
   31.55 +/* When loading old images we have to populate the video ram ourselves */
   31.56 +void xen_vga_populate_vram(uint64_t vram_addr)
   31.57 +{
   31.58 +    unsigned long nr_pfn;
   31.59 +    struct xen_remove_from_physmap xrfp;
   31.60 +    xen_pfn_t *pfn_list;
   31.61 +    int i;
   31.62 +    int rc;
   31.63 +
    31.64 +    fprintf(logfile, "populating video RAM at %"PRIx64"\n", vram_addr);
   31.65 +
   31.66 +    nr_pfn = VGA_RAM_SIZE >> TARGET_PAGE_BITS;
   31.67 +
   31.68 +    pfn_list = malloc(sizeof(*pfn_list) * nr_pfn);
   31.69 +
   31.70 +    for (i = 0; i < nr_pfn; i++)
   31.71 +        pfn_list[i] = (vram_addr >> TARGET_PAGE_BITS) + i;
   31.72 +
   31.73 +    if (xc_domain_memory_populate_physmap(xc_handle, domid, nr_pfn, 0, 0, pfn_list)) {
   31.74 +        fprintf(stderr, "Failed to populate video ram\n");
   31.75 +        exit(1);
   31.76 +    }
   31.77 +    free(pfn_list);
   31.78 +
   31.79 +    xen_vga_vram_map(vram_addr, 0);
   31.80 +
   31.81 +    /* Unmap them from the guest for now. */
   31.82 +    xrfp.domid = domid;
   31.83 +    for (i = 0; i < nr_pfn; i++) {
   31.84 +        xrfp.gpfn = (vram_addr >> TARGET_PAGE_BITS) + i;
   31.85 +        rc = xc_memory_op(xc_handle, XENMEM_remove_from_physmap, &xrfp);
   31.86 +        if (rc) {
   31.87 +            fprintf(stderr, "remove_from_physmap PFN %"PRI_xen_pfn" failed: %d\n", xrfp.gpfn, rc);
   31.88 +            break;
   31.89 +        }
   31.90 +    }
   31.91 +}
   31.92 +
   31.93 +/* Called once video memory has been allocated in the GPFN space */
   31.94 +void xen_vga_vram_map(uint64_t vram_addr, int copy)
   31.95 +{
   31.96 +    unsigned long nr_pfn;
   31.97 +    xen_pfn_t *pfn_list;
   31.98 +    int i;
   31.99 +    void *vram;
  31.100 +
   31.101 +    fprintf(logfile, "mapping video RAM from %"PRIx64"\n", vram_addr);
  31.102 +
  31.103 +    nr_pfn = VGA_RAM_SIZE >> TARGET_PAGE_BITS;
  31.104 +
  31.105 +    pfn_list = malloc(sizeof(*pfn_list) * nr_pfn);
  31.106 +
  31.107 +    for (i = 0; i < nr_pfn; i++)
  31.108 +        pfn_list[i] = (vram_addr >> TARGET_PAGE_BITS) + i;
  31.109 +
  31.110 +    vram = xc_map_foreign_pages(xc_handle, domid,
  31.111 +                                        PROT_READ|PROT_WRITE,
  31.112 +                                        pfn_list, nr_pfn);
  31.113 +
  31.114 +    if (!vram) {
  31.115 +        fprintf(stderr, "Failed to map vram\n");
  31.116 +        exit(1);
  31.117 +    }
  31.118 +
  31.119 +    if (xc_domain_memory_translate_gpfn_list(xc_handle, domid, nr_pfn,
  31.120 +                pfn_list, pfn_list)) {
  31.121 +        fprintf(stderr, "Failed translation in xen_vga_vram_addr\n");
  31.122 +        exit(1);
  31.123 +    }
  31.124 +
  31.125 +    if (copy)
  31.126 +        memcpy(vram, xen_vga_state->vram_ptr, VGA_RAM_SIZE);
  31.127 +    qemu_free(xen_vga_state->vram_ptr);
  31.128 +    xen_vga_state->vram_ptr = vram;
  31.129 +    xen_vga_state->vram_mfns = pfn_list;
  31.130 +#ifdef CONFIG_STUBDOM
  31.131 +    xenfb_pv_display_start(vram);
  31.132 +#endif
  31.133 +}
  31.134 +
  31.135 +/* Called at boot time when the BIOS has allocated video RAM */
  31.136 +void xen_vga_stolen_vram_addr(uint64_t stolen_vram_addr)
  31.137 +{
   31.138 +    fprintf(logfile, "stolen video RAM at %"PRIx64"\n", stolen_vram_addr);
  31.139 +
  31.140 +    xen_vga_state->stolen_vram_addr = stolen_vram_addr;
  31.141 +
  31.142 +    /* And copy from the initialization value */
  31.143 +    xen_vga_vram_map(stolen_vram_addr, 1);
  31.144 +}
  31.145 +
  31.146  /* when used on xen environment, the vga_ram_base is not used */
  31.147  void vga_common_init(VGAState *s, DisplayState *ds, uint8_t *vga_ram_base, 
  31.148                       unsigned long vga_ram_offset, int vga_ram_size)
  31.149 @@ -2025,13 +2130,9 @@ void vga_common_init(VGAState *s, Displa
  31.150  
  31.151      vga_reset(s);
  31.152  
  31.153 -    /* Video RAM must be page-aligned for PVFB memory sharing */
  31.154 -    s->vram_ptr = s->vram_alloc = qemu_memalign(TARGET_PAGE_SIZE, vga_ram_size);
  31.155 -
  31.156 -#ifdef CONFIG_STUBDOM
  31.157 -    if (!cirrus_vga_enabled)
  31.158 -        xenfb_pv_display_start(s->vram_ptr);
  31.159 -#endif
  31.160 +    s->vram_ptr = qemu_malloc(vga_ram_size);
  31.161 +    s->vram_mfns = NULL;
  31.162 +    xen_vga_state = s;
  31.163  
  31.164      s->vram_offset = vga_ram_offset;
  31.165      s->vram_size = vga_ram_size;
  31.166 @@ -2051,7 +2152,7 @@ static void vga_init(VGAState *s)
  31.167  {
  31.168      int vga_io_memory;
  31.169  
  31.170 -    register_savevm("vga", 0, 3, vga_save, vga_load, s);
  31.171 +    register_savevm("vga", 0, 4, vga_save, vga_load, s);
  31.172  
  31.173      register_ioport_write(0x3c0, 16, 1, vga_ioport_write, s);
  31.174  
  31.175 @@ -2163,33 +2264,6 @@ int pci_vga_init(PCIBus *bus, DisplaySta
  31.176      return 0;
  31.177  }
  31.178  
  31.179 -void *vga_update_vram(VGAState *s, void *vga_ram_base, int vga_ram_size)
  31.180 -{
  31.181 -    uint8_t *old_pointer;
  31.182 -
  31.183 -    if (s->vram_size != vga_ram_size) {
  31.184 -        fprintf(stderr, "No support to change vga_ram_size\n");
  31.185 -        return NULL;
  31.186 -    }
  31.187 -
  31.188 -    if (!vga_ram_base) {
  31.189 -        vga_ram_base = qemu_memalign(TARGET_PAGE_SIZE, vga_ram_size + TARGET_PAGE_SIZE + 1);
  31.190 -        if (!vga_ram_base) {
  31.191 -            fprintf(stderr, "reallocate error\n");
  31.192 -            return NULL;
  31.193 -        }
  31.194 -    }
  31.195 -
  31.196 -    /* XXX lock needed? */
  31.197 -    old_pointer = s->vram_alloc;
  31.198 -    s->vram_alloc = vga_ram_base;
  31.199 -    vga_ram_base = (uint8_t *)((long)(vga_ram_base + 15) & ~15L);
  31.200 -    memcpy(vga_ram_base, s->vram_ptr, vga_ram_size);
  31.201 -    s->vram_ptr = vga_ram_base;
  31.202 -
  31.203 -    return old_pointer;
  31.204 -}
  31.205 -
  31.206  /********************************************************/
  31.207  /* vga screen dump */
  31.208  
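
Taken together, the vga.c changes replace the old allocate-and-copy scheme (vga_update_vram) with mapping the guest's own VRAM pages into qemu. A minimal sketch of when each entry point runs; the wrapper function and its flag are illustrative, the prototypes are the ones this changeset adds to vl.h:

    #include <stdint.h>

    /* Prototypes added to vl.h in this changeset. */
    void xen_vga_stolen_vram_addr(uint64_t vram_addr);
    void xen_vga_vram_map(uint64_t vram_addr, int copy);

    /* Sketch: the two ways VRAM becomes a foreign mapping after this patch. */
    static void vram_handover_example(uint64_t stolen_addr, int restoring)
    {
        if (!restoring)
            /* Fresh boot: the BIOS announces its allocation via the platform
             * ioport; the handler maps the guest pages and copies the
             * boot-time contents into them (copy = 1 internally). */
            xen_vga_stolen_vram_addr(stolen_addr);
        else
            /* Restore of a new-format image: the pages already hold the
             * framebuffer contents, so map without copying. */
            xen_vga_vram_map(stolen_addr, 0);
    }
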
    32.1 --- a/tools/ioemu/hw/vga_int.h	Tue Sep 02 16:34:53 2008 -0700
    32.2 +++ b/tools/ioemu/hw/vga_int.h	Tue Sep 02 16:55:55 2008 -0700
    32.3 @@ -80,9 +80,9 @@
    32.4  #define VGA_MAX_HEIGHT 2048
    32.5  
    32.6  #define VGA_STATE_COMMON                                                \
    32.7 -    uint8_t *vram_alloc;                                                \
    32.8      uint8_t *vram_ptr;                                                  \
    32.9 -    uint8_t *vram_shadow;                                               \
   32.10 +    xen_pfn_t *vram_mfns;                                               \
   32.11 +    uint64_t stolen_vram_addr; /* Address of stolen RAM */              \
   32.12      unsigned long vram_offset;                                          \
   32.13      unsigned int vram_size;                                             \
   32.14      unsigned long bios_offset;                                          \
    33.1 --- a/tools/ioemu/hw/xen_platform.c	Tue Sep 02 16:34:53 2008 -0700
    33.2 +++ b/tools/ioemu/hw/xen_platform.c	Tue Sep 02 16:55:55 2008 -0700
    33.3 @@ -34,6 +34,7 @@ typedef struct PCIXenPlatformState
    33.4  {
    33.5    PCIDevice  pci_dev;
    33.6    uint8_t    platform_flags;
    33.7 +  uint64_t   vga_stolen_ram;
    33.8  } PCIXenPlatformState;
    33.9  
   33.10  static uint32_t xen_platform_ioport_readb(void *opaque, uint32_t addr)
   33.11 @@ -69,11 +70,46 @@ static void xen_platform_ioport_writeb(v
   33.12  }
   33.13  
   33.14  
   33.15 +static uint32_t xen_platform_ioport_readl(void *opaque, uint32_t addr)
   33.16 +{
   33.17 +    PCIXenPlatformState *d = opaque;
   33.18 +
   33.19 +    addr  &= 0xff;
   33.20 +
   33.21 +    switch (addr) {
   33.22 +    case 4: /* VGA stolen memory address */
   33.23 +        return d->vga_stolen_ram;
   33.24 +    default:
   33.25 +        return ~0u;
   33.26 +    }
   33.27 +}
   33.28 +
   33.29 +static void xen_platform_ioport_writel(void *opaque, uint32_t addr, uint32_t val)
   33.30 +{
   33.31 +    PCIXenPlatformState *d = opaque;
   33.32 +
   33.33 +    addr &= 0xff;
   33.34 +    val  &= 0xffffffff;
   33.35 +
   33.36 +    switch (addr) {
   33.37 +    case 4: /* VGA stolen memory address */
   33.38 +        d->vga_stolen_ram = val;
   33.39 +        xen_vga_stolen_vram_addr(val);
   33.40 +        break;
   33.41 +    default:
   33.42 +        break;
   33.43 +    }
   33.44 +}
   33.45 +
   33.46 +
   33.47 +
   33.48  static void platform_ioport_map(PCIDevice *pci_dev, int region_num, uint32_t addr, uint32_t size, int type)
   33.49  {
   33.50      PCIXenPlatformState *d = (PCIXenPlatformState *)pci_dev;
   33.51      register_ioport_write(addr, size, 1, xen_platform_ioport_writeb, d);
   33.52 +    register_ioport_write(addr, size, 4, xen_platform_ioport_writel, d);
   33.53      register_ioport_read(addr, size, 1, xen_platform_ioport_readb, d);
   33.54 +    register_ioport_read(addr, size, 4, xen_platform_ioport_readl, d);
   33.55  }
   33.56  
   33.57  static uint32_t platform_mmio_read(void *opaque, target_phys_addr_t addr)
   33.58 @@ -155,6 +191,7 @@ void xen_pci_save(QEMUFile *f, void *opa
   33.59  
   33.60      pci_device_save(&d->pci_dev, f);
   33.61      qemu_put_8s(f, &d->platform_flags);
   33.62 +    qemu_put_be64s(f, &d->vga_stolen_ram);
   33.63  }
   33.64  
   33.65  int xen_pci_load(QEMUFile *f, void *opaque, int version_id)
   33.66 @@ -173,6 +210,7 @@ int xen_pci_load(QEMUFile *f, void *opaq
   33.67          uint8_t flags;
   33.68          qemu_get_8s(f, &flags);
   33.69          xen_platform_ioport_writeb(d, 0, flags);
   33.70 +        qemu_get_be64s(f, &d->vga_stolen_ram);
   33.71      }
   33.72  
   33.73      return 0;
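
The guest-side counterpart of the new ioport is a single 32-bit write of the allocated address to offset 4 of the platform device's I/O BAR. A hypothetical hvmloader-style sketch; the I/O-base symbol and the helper are assumptions for illustration, not code from this changeset:

    #include <stdint.h>

    static inline void outl(uint16_t port, uint32_t val)
    {
        asm volatile ("outl %0, %w1" : : "a" (val), "Nd" (port));
    }

    /* Announce the stolen VRAM address to qemu.  'platform_io_base' stands in
     * for the xen platform PCI device's I/O BAR, discovered elsewhere. */
    static void announce_vram(uint16_t platform_io_base, uint32_t vram_addr)
    {
        outl(platform_io_base + 4, vram_addr);
    }
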
    34.1 --- a/tools/ioemu/vl.c	Tue Sep 02 16:34:53 2008 -0700
    34.2 +++ b/tools/ioemu/vl.c	Tue Sep 02 16:55:55 2008 -0700
    34.3 @@ -7023,38 +7023,6 @@ static BOOL WINAPI qemu_ctrl_handler(DWO
    34.4  
    34.5  #include <xg_private.h>
    34.6  
    34.7 -/* FIXME Flush the shadow page */
    34.8 -int unset_mm_mapping(int xc_handle, uint32_t domid,
    34.9 -                     unsigned long nr_pages, unsigned int address_bits,
   34.10 -                     xen_pfn_t *extent_start)
   34.11 -{
   34.12 -    int err = 0;
   34.13 -
   34.14 -    err = xc_domain_memory_decrease_reservation(xc_handle, domid,
   34.15 -                                                nr_pages, 0, extent_start);
   34.16 -    if (err)
   34.17 -        fprintf(stderr, "Failed to decrease physmap\n");
   34.18 -
   34.19 -    return err;
   34.20 -}
   34.21 -
   34.22 -int set_mm_mapping(int xc_handle, uint32_t domid,
   34.23 -                   unsigned long nr_pages, unsigned int address_bits,
   34.24 -                   xen_pfn_t *extent_start)
   34.25 -{
   34.26 -    int err = 0;
   34.27 -
   34.28 -    err = xc_domain_memory_populate_physmap(
   34.29 -        xc_handle, domid, nr_pages, 0,
   34.30 -        XENMEMF_address_bits(address_bits), extent_start);
   34.31 -    if (err) {
   34.32 -        fprintf(stderr, "Failed to populate physmap\n");
   34.33 -        return -1;
   34.34 -    }
   34.35 -
   34.36 -    return 0;
   34.37 -}
   34.38 -
   34.39  
   34.40  int main(int argc, char **argv)
   34.41  {
    35.1 --- a/tools/ioemu/vl.h	Tue Sep 02 16:34:53 2008 -0700
    35.2 +++ b/tools/ioemu/vl.h	Tue Sep 02 16:55:55 2008 -0700
    35.3 @@ -812,8 +812,6 @@ struct PCIDevice {
    35.4      /* do not access the following fields */
    35.5      PCIConfigReadFunc *config_read;
    35.6      PCIConfigWriteFunc *config_write;
    35.7 -    /* ??? This is a PC-specific hack, and should be removed.  */
    35.8 -    int irq_index;
    35.9  
   35.10      /* Current IRQ levels.  Used internally by the generic PCI code.  */
   35.11      int irq_state[4];
   35.12 @@ -1560,6 +1558,9 @@ void timeoffset_get(void);
   35.13  /* xen_platform.c */
   35.14  #ifndef QEMU_TOOL
   35.15  void pci_xen_platform_init(PCIBus *bus);
   35.16 +void xen_vga_stolen_vram_addr(uint64_t vram_addr);
   35.17 +void xen_vga_populate_vram(uint64_t vram_addr);
   35.18 +void xen_vga_vram_map(uint64_t vram_addr, int copy);
   35.19  #endif
   35.20  
   35.21  /* pci_emulation.c */
    36.1 --- a/tools/libxc/xc_dom_boot.c	Tue Sep 02 16:34:53 2008 -0700
    36.2 +++ b/tools/libxc/xc_dom_boot.c	Tue Sep 02 16:55:55 2008 -0700
    36.3 @@ -187,7 +187,7 @@ void *xc_dom_boot_domU_map(struct xc_dom
    36.4  int xc_dom_boot_image(struct xc_dom_image *dom)
    36.5  {
    36.6      DECLARE_DOMCTL;
    36.7 -    void *ctxt;
    36.8 +    vcpu_guest_context_any_t ctxt;
    36.9      int rc;
   36.10  
   36.11      xc_dom_printf("%s: called\n", __FUNCTION__);
   36.12 @@ -245,12 +245,11 @@ int xc_dom_boot_image(struct xc_dom_imag
   36.13          return rc;
   36.14  
   36.15      /* let the vm run */
   36.16 -    ctxt = xc_dom_malloc(dom, PAGE_SIZE * 2 /* FIXME */ );
   36.17 -    memset(ctxt, 0, PAGE_SIZE * 2);
   36.18 -    if ( (rc = dom->arch_hooks->vcpu(dom, ctxt)) != 0 )
   36.19 +    memset(&ctxt, 0, sizeof(ctxt));
   36.20 +    if ( (rc = dom->arch_hooks->vcpu(dom, &ctxt)) != 0 )
   36.21          return rc;
   36.22      xc_dom_unmap_all(dom);
   36.23 -    rc = launch_vm(dom->guest_xc, dom->guest_domid, ctxt);
   36.24 +    rc = launch_vm(dom->guest_xc, dom->guest_domid, &ctxt);
   36.25  
   36.26      return rc;
   36.27  }
    37.1 --- a/tools/libxc/xc_domain.c	Tue Sep 02 16:34:53 2008 -0700
    37.2 +++ b/tools/libxc/xc_domain.c	Tue Sep 02 16:55:55 2008 -0700
    37.3 @@ -537,6 +537,33 @@ int xc_domain_memory_populate_physmap(in
    37.4      return err;
    37.5  }
    37.6  
    37.7 +int xc_domain_memory_translate_gpfn_list(int xc_handle,
    37.8 +                                         uint32_t domid,
    37.9 +                                         unsigned long nr_gpfns,
   37.10 +                                         xen_pfn_t *gpfn_list,
   37.11 +                                         xen_pfn_t *mfn_list)
   37.12 +{
   37.13 +    int err;
   37.14 +    struct xen_translate_gpfn_list translate_gpfn_list = {
   37.15 +        .domid    = domid,
   37.16 +        .nr_gpfns = nr_gpfns,
   37.17 +    };
   37.18 +    set_xen_guest_handle(translate_gpfn_list.gpfn_list, gpfn_list);
   37.19 +    set_xen_guest_handle(translate_gpfn_list.mfn_list, mfn_list);
   37.20 +
   37.21 +    err = xc_memory_op(xc_handle, XENMEM_translate_gpfn_list, &translate_gpfn_list);
   37.22 +
   37.23 +    if ( err != 0 )
   37.24 +    {
   37.25 +        DPRINTF("Failed translation for dom %d (%ld PFNs)\n",
   37.26 +                domid, nr_gpfns);
   37.27 +        errno = -err;
   37.28 +        err = -1;
   37.29 +    }
   37.30 +
   37.31 +    return err;
   37.32 +}
   37.33 +
   37.34  int xc_domain_max_vcpus(int xc_handle, uint32_t domid, unsigned int max)
   37.35  {
   37.36      DECLARE_DOMCTL;
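
A minimal caller of the new libxc wrapper, mirroring the in-place use in vga.c above, where one array serves as both the GPFN input and the MFN output; the helper and its parameters are illustrative:

    #include <stdio.h>
    #include "xenctrl.h"

    /* Sketch: translate 'count' guest PFNs starting at 'base_gpfn' into
     * machine frames, reusing one array for input and output as
     * xen_vga_vram_map() does. */
    static int translate_example(int xc_handle, uint32_t domid,
                                 xen_pfn_t base_gpfn, unsigned long count,
                                 xen_pfn_t *pfns)
    {
        unsigned long i;

        for (i = 0; i < count; i++)
            pfns[i] = base_gpfn + i;

        if (xc_domain_memory_translate_gpfn_list(xc_handle, domid,
                                                 count, pfns, pfns)) {
            fprintf(stderr, "translate_gpfn_list failed\n");
            return -1;
        }
        return 0;   /* pfns[] now holds MFNs */
    }
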
    38.1 --- a/tools/libxc/xc_domain_save.c	Tue Sep 02 16:34:53 2008 -0700
    38.2 +++ b/tools/libxc/xc_domain_save.c	Tue Sep 02 16:55:55 2008 -0700
    38.3 @@ -1111,12 +1111,6 @@ int xc_domain_save(int xc_handle, int io
    38.4                         (test_bit(n, to_fix)  && last_iter)) )
    38.5                      continue;
    38.6  
    38.7 -                /* Skip PFNs that aren't really there */
    38.8 -                if ( hvm && ((n >= 0xa0 && n < 0xc0) /* VGA hole */
    38.9 -                             || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) 
   38.10 -                                 && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ )
   38.11 -                    continue;
   38.12 -
   38.13                  /*
   38.14                  ** we get here if:
   38.15                  **  1. page is marked to_send & hasn't already been re-dirtied
    39.1 --- a/tools/libxc/xc_minios.c	Tue Sep 02 16:34:53 2008 -0700
    39.2 +++ b/tools/libxc/xc_minios.c	Tue Sep 02 16:55:55 2008 -0700
    39.3 @@ -64,7 +64,6 @@ void *xc_map_foreign_range(int xc_handle
    39.4                             unsigned long mfn)
    39.5  {
    39.6      unsigned long pt_prot = 0;
    39.7 -    printf("xc_map_foreign_range(%lx, %d)\n", mfn, size);
    39.8  #ifdef __ia64__
    39.9      /* TODO */
   39.10  #else
   39.11 @@ -81,9 +80,10 @@ void *xc_map_foreign_ranges(int xc_handl
   39.12                              size_t size, int prot, size_t chunksize,
   39.13                              privcmd_mmap_entry_t entries[], int nentries)
   39.14  {
   39.15 -    unsigned long mfns[size / PAGE_SIZE];
   39.16 +    unsigned long *mfns;
   39.17      int i, j, n;
   39.18      unsigned long pt_prot = 0;
   39.19 +    void *ret;
   39.20  #ifdef __ia64__
   39.21      /* TODO */
   39.22  #else
   39.23 @@ -93,12 +93,16 @@ void *xc_map_foreign_ranges(int xc_handl
   39.24  	pt_prot = L1_PROT;
   39.25  #endif
   39.26  
   39.27 +    mfns = malloc((size / PAGE_SIZE) * sizeof(*mfns));
   39.28 +
   39.29      n = 0;
   39.30      for (i = 0; i < nentries; i++)
   39.31          for (j = 0; j < chunksize / PAGE_SIZE; j++)
   39.32              mfns[n++] = entries[i].mfn + j;
   39.33  
   39.34 -    return map_frames_ex(mfns, n, 1, 0, 1, dom, 0, pt_prot);
   39.35 +    ret = map_frames_ex(mfns, n, 1, 0, 1, dom, 0, pt_prot);
   39.36 +    free(mfns);
   39.37 +    return ret;
   39.38  }
   39.39  
   39.40  
    40.1 --- a/tools/libxc/xenctrl.h	Tue Sep 02 16:34:53 2008 -0700
    40.2 +++ b/tools/libxc/xenctrl.h	Tue Sep 02 16:55:55 2008 -0700
    40.3 @@ -628,6 +628,12 @@ int xc_domain_memory_populate_physmap(in
    40.4                                        unsigned int mem_flags,
    40.5                                        xen_pfn_t *extent_start);
    40.6  
    40.7 +int xc_domain_memory_translate_gpfn_list(int xc_handle,
    40.8 +                                         uint32_t domid,
    40.9 +                                         unsigned long nr_gpfns,
   40.10 +                                         xen_pfn_t *gpfn_list,
   40.11 +                                         xen_pfn_t *mfn_list);
   40.12 +
   40.13  int xc_domain_ioport_permission(int xc_handle,
   40.14                                  uint32_t domid,
   40.15                                  uint32_t first_port,
    41.1 --- a/tools/pygrub/src/pygrub	Tue Sep 02 16:34:53 2008 -0700
    41.2 +++ b/tools/pygrub/src/pygrub	Tue Sep 02 16:55:55 2008 -0700
    41.3 @@ -124,7 +124,7 @@ def get_fs_offset(file):
    41.4  class GrubLineEditor(curses.textpad.Textbox):
    41.5      def __init__(self, screen, startx, starty, line = ""):
    41.6          screen.addstr(startx, starty, "> ")
    41.7 -        screen.refresh()
    41.8 +        screen.noutrefresh()
    41.9          win = curses.newwin(1, 74, startx, starty + 2)
   41.10          curses.textpad.Textbox.__init__(self, win)
   41.11          
   41.12 @@ -137,7 +137,7 @@ class GrubLineEditor(curses.textpad.Text
   41.13          """Show the text.  One of our advantages over standard textboxes
   41.14          is that we can handle lines longer than the window."""
   41.15  
   41.16 -        self.win.clear()
   41.17 +        self.win.erase()
   41.18          p = self.pos
   41.19          off = 0
   41.20          while p > 70:
   41.21 @@ -188,6 +188,7 @@ class GrubLineEditor(curses.textpad.Text
   41.22          return 1
   41.23  
   41.24      def edit(self):
   41.25 +        curses.doupdate()
   41.26          r = curses.textpad.Textbox.edit(self)
   41.27          if self.cancelled:
   41.28              return None
   41.29 @@ -217,16 +218,15 @@ class Grub:
   41.30              curses.def_prog_mode()
   41.31          
   41.32          curses.reset_prog_mode()
   41.33 -        self.screen.clear()
   41.34 -        self.screen.refresh()
   41.35 +        self.screen.erase()
   41.36  
   41.37          # create basic grub screen with a box of entries and a textbox
   41.38          self.screen.addstr(1, 4, "pyGRUB  version %s" %(PYGRUB_VER,))
   41.39          self.entry_win.box()
   41.40 -        self.screen.refresh()
   41.41 +        self.screen.noutrefresh()
   41.42  
   41.43      def fill_entry_list(self):
   41.44 -        self.entry_win.clear()
   41.45 +        self.entry_win.erase()
   41.46          self.entry_win.box()
   41.47  
   41.48          maxy = self.entry_win.getmaxyx()[0]-3 # maxy - 2 for the frame + index
   41.49 @@ -244,7 +244,7 @@ class Grub:
   41.50              self.entry_win.addstr(y + 1 - self.start_image, 2, i.title.ljust(70))
   41.51              if y == self.selected_image:
   41.52                  self.entry_win.attroff(curses.A_REVERSE)
   41.53 -        self.entry_win.refresh()
   41.54 +        self.entry_win.noutrefresh()
   41.55  
   41.56      def edit_entry(self, origimg):
   41.57          def draw():
   41.58 @@ -259,13 +259,13 @@ class Grub:
   41.59              self.text_win.addch(0, 14, curses.ACS_DARROW)
   41.60              (y, x) = self.text_win.getmaxyx()
   41.61              self.text_win.move(y - 1, x - 1)
   41.62 -            self.text_win.refresh()
   41.63 +            self.text_win.noutrefresh()
   41.64  
   41.65          curline = 1
   41.66          img = copy.deepcopy(origimg)
   41.67          while 1:
   41.68              draw()
   41.69 -            self.entry_win.clear()
   41.70 +            self.entry_win.erase()
   41.71              self.entry_win.box()
   41.72              for idx in range(1, len(img.lines)):
   41.73                  # current line should be highlighted
   41.74 @@ -280,7 +280,8 @@ class Grub:
   41.75                  self.entry_win.addstr(idx, 2, l)
   41.76                  if idx == curline:
   41.77                      self.entry_win.attroff(curses.A_REVERSE)
   41.78 -            self.entry_win.refresh()
   41.79 +            self.entry_win.noutrefresh()
   41.80 +            curses.doupdate()
   41.81  
   41.82              c = self.screen.getch()
   41.83              if c in (ord('q'), 27): # 27 == esc
   41.84 @@ -318,10 +319,10 @@ class Grub:
   41.85              origimg.reset(img.lines)
   41.86  
   41.87      def edit_line(self, line):
   41.88 -        self.screen.clear()
   41.89 +        self.screen.erase()
   41.90          self.screen.addstr(1, 2, "[ Minimal BASH-like line editing is supported.  ")
   41.91          self.screen.addstr(2, 2, "  ESC at any time cancels.  ENTER at any time accepts your changes. ]")
   41.92 -        self.screen.refresh()
   41.93 +        self.screen.noutrefresh()
   41.94  
   41.95          t = GrubLineEditor(self.screen, 5, 2, line)
   41.96          enable_cursor(True)
   41.97 @@ -331,10 +332,10 @@ class Grub:
   41.98          return None
   41.99  
  41.100      def command_line_mode(self):
  41.101 -        self.screen.clear()
  41.102 +        self.screen.erase()
  41.103          self.screen.addstr(1, 2, "[ Minimal BASH-like line editing is supported.  ESC at any time ")
  41.104          self.screen.addstr(2, 2, "  exits.  Typing 'boot' will boot with your entered commands. ] ")
  41.105 -        self.screen.refresh()
  41.106 +        self.screen.noutrefresh()
  41.107  
  41.108          y = 5
  41.109          lines = []
  41.110 @@ -420,7 +421,7 @@ class Grub:
  41.111              self.text_win.addch(0, 14, curses.ACS_DARROW)
  41.112              (y, x) = self.text_win.getmaxyx()
  41.113              self.text_win.move(y - 1, x - 1)
  41.114 -            self.text_win.refresh()
  41.115 +            self.text_win.noutrefresh()
  41.116  
  41.117          # now loop until we hit the timeout or get a go from the user
  41.118          mytime = 0
  41.119 @@ -433,6 +434,7 @@ class Grub:
  41.120              else:
  41.121                  self.screen.addstr(20, 5, " " * 80)
  41.122              self.fill_entry_list()
  41.123 +            curses.doupdate()
  41.124  
  41.125              c = self.screen.getch()
  41.126              if c == -1:
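
The pygrub hunks apply the standard curses anti-flicker pattern: erase() instead of clear() (clear() schedules a full terminal repaint), per-window noutrefresh() to stage changes, and a single doupdate() per frame to flush them. The underlying C curses calls behave identically; a sketch:

    #include <curses.h>

    /* Batch several windows' changes into one terminal write. */
    static void redraw(WINDOW *win_a, WINDOW *win_b)
    {
        werase(win_a);        /* clear contents without forcing a full repaint */
        /* ... draw into win_a ... */
        wnoutrefresh(win_a);  /* stage changes in the virtual screen only */

        werase(win_b);
        /* ... draw into win_b ... */
        wnoutrefresh(win_b);

        doupdate();           /* one pass: virtual screen -> real terminal */
    }
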
    42.1 --- a/tools/python/xen/util/pci.py	Tue Sep 02 16:34:53 2008 -0700
    42.2 +++ b/tools/python/xen/util/pci.py	Tue Sep 02 16:55:55 2008 -0700
    42.3 @@ -40,6 +40,7 @@ DEV_TYPE_PCIe_BRIDGE    = 1
    42.4  DEV_TYPE_PCI_BRIDGE     = 2
    42.5  DEV_TYPE_PCI            = 3    
    42.6  
    42.7 +PCI_VENDOR_ID = 0x0
    42.8  PCI_STATUS = 0x6
    42.9  PCI_CLASS_DEVICE = 0x0a
   42.10  PCI_CLASS_BRIDGE_PCI = 0x0604
   42.11 @@ -69,6 +70,11 @@ PCI_PM_CTRL_NO_SOFT_RESET = 0x0004
   42.12  PCI_PM_CTRL_STATE_MASK = 0x0003
   42.13  PCI_D3hot = 3
   42.14  
   42.15 +VENDOR_INTEL  = 0x8086
   42.16 +PCI_CAP_ID_VENDOR_SPECIFIC_CAP = 0x09
   42.17 +PCI_CLASS_ID_USB = 0x0c03
   42.18 +PCI_USB_FLRCTRL = 0x4
   42.19 +
   42.20  PCI_CAP_ID_AF = 0x13
   42.21  PCI_AF_CAPs   = 0x3
   42.22  PCI_AF_CAPs_TP_FLR = 0x3
   42.23 @@ -487,7 +493,7 @@ class PciDevice:
   42.24      def do_Dstate_transition(self):
   42.25          pos = self.find_cap_offset(PCI_CAP_ID_PM)
   42.26          if pos == 0:
   42.27 -            return 
   42.28 +            return False
   42.29          
   42.30          (pci_list, cfg_list) = save_pci_conf_space([self.name])
   42.31          
   42.32 @@ -504,6 +510,31 @@ class PciDevice:
   42.33          time.sleep(0.010)
   42.34  
   42.35          restore_pci_conf_space((pci_list, cfg_list))
   42.36 +        return True
   42.37 +
   42.38 +    def do_vendor_specific_FLR_method(self):
   42.39 +        pos = self.find_cap_offset(PCI_CAP_ID_VENDOR_SPECIFIC_CAP)
   42.40 +        if pos == 0:
   42.41 +            return
   42.42 +
   42.43 +        vendor_id = self.pci_conf_read16(PCI_VENDOR_ID)
   42.44 +        if vendor_id != VENDOR_INTEL:
   42.45 +            return
   42.46 +
   42.47 +        class_id = self.pci_conf_read16(PCI_CLASS_DEVICE)
   42.48 +        if class_id != PCI_CLASS_ID_USB:
   42.49 +            return
   42.50 +
   42.51 +        (pci_list, cfg_list) = save_pci_conf_space([self.name])
   42.52 +
   42.53 +        self.pci_conf_write8(pos + PCI_USB_FLRCTRL, 1)
   42.54 +        time.sleep(0.010)
   42.55 +
   42.56 +        restore_pci_conf_space((pci_list, cfg_list))
   42.57 +
   42.58 +    def do_FLR_for_integrated_device(self):
   42.59 +        if not self.do_Dstate_transition():
   42.60 +            self.do_vendor_specific_FLR_method()
   42.61  
   42.62      def find_all_the_multi_functions(self):
   42.63          sysfs_mnt = find_sysfs_mnt()
   42.64 @@ -676,7 +707,7 @@ class PciDevice:
   42.65                  restore_pci_conf_space((pci_list, cfg_list))
   42.66              else:
   42.67                  if self.bus == 0:
   42.68 -                    self.do_Dstate_transition()
   42.69 +                    self.do_FLR_for_integrated_device()
   42.70                  else:
   42.71                      funcs = self.find_all_the_multi_functions()
   42.72                      self.devs_check_driver(funcs)
   42.73 @@ -697,7 +728,7 @@ class PciDevice:
   42.74                  restore_pci_conf_space((pci_list, cfg_list))
   42.75              else:
   42.76                  if self.bus == 0:
   42.77 -                    self.do_Dstate_transition()
   42.78 +                    self.do_FLR_for_integrated_device()
   42.79                  else:
   42.80                      devs = self.find_coassigned_devices(False)
   42.81                      # Remove the element 0 which is a bridge
    43.1 --- a/tools/python/xen/xend/XendConfig.py	Tue Sep 02 16:34:53 2008 -0700
    43.2 +++ b/tools/python/xen/xend/XendConfig.py	Tue Sep 02 16:55:55 2008 -0700
    43.3 @@ -1538,9 +1538,9 @@ class XendConfig(dict):
    43.4                      pci_dev_info[opt] = val
    43.5                  except TypeError:
    43.6                      pass
    43.7 -                # append uuid for each pci device.
    43.8 -                dpci_uuid = pci_dev_info.get('uuid', uuid.createString())
    43.9 -                pci_dev_info['uuid'] = dpci_uuid
   43.10 +            # append uuid for each pci device.
   43.11 +            dpci_uuid = pci_dev_info.get('uuid', uuid.createString())
   43.12 +            pci_dev_info['uuid'] = dpci_uuid
   43.13              pci_devs.append(pci_dev_info)
   43.14          dev_config['devs'] = pci_devs 
   43.15  
    44.1 --- a/tools/python/xen/xend/XendDomain.py	Tue Sep 02 16:34:53 2008 -0700
    44.2 +++ b/tools/python/xen/xend/XendDomain.py	Tue Sep 02 16:55:55 2008 -0700
    44.3 @@ -419,6 +419,8 @@ class XendDomain:
    44.4                  except VmError:
    44.5                      log.exception("Unable to recreate domain")
    44.6                      try:
    44.7 +                        xc.domain_pause(domid)
    44.8 +                        do_FLR(domid)
    44.9                          xc.domain_destroy(domid)
   44.10                      except:
   44.11                          log.exception("Hard destruction of domain failed: %d" %
   44.12 @@ -1255,6 +1257,8 @@ class XendDomain:
   44.13              val = dominfo.destroy()
   44.14          else:
   44.15              try:
   44.16 +                xc.domain_pause(int(domid))
   44.17 +                do_FLR(int(domid))
   44.18                  val = xc.domain_destroy(int(domid))
   44.19              except ValueError:
   44.20                  raise XendInvalidDomain(domid)
    45.1 --- a/tools/python/xen/xend/XendDomainInfo.py	Tue Sep 02 16:34:53 2008 -0700
    45.2 +++ b/tools/python/xen/xend/XendDomainInfo.py	Tue Sep 02 16:55:55 2008 -0700
    45.3 @@ -287,6 +287,28 @@ def dom_get(dom):
    45.4          log.trace("domain_getinfo(%d) failed, ignoring: %s", dom, str(err))
    45.5      return None
    45.6  
    45.7 +def do_FLR(domid):
    45.8 +    from xen.xend.server.pciif import parse_pci_name, PciDevice
    45.9 +    path = '/local/domain/0/backend/pci/%u/0/' % domid
    45.10 +    num_devs = xstransact.Read(path + 'num_devs')
    45.11 +    if num_devs is None or num_devs == "":
    45.12 +        return
    45.13 +
    45.14 +    num_devs = int(num_devs)
   45.15 +
   45.16 +    dev_str_list = []
   45.17 +    for i in range(num_devs):
   45.18 +        dev_str = xstransact.Read(path + 'dev-%i' % i)
   45.19 +        dev_str_list = dev_str_list + [dev_str]
   45.20 +
   45.21 +    for dev_str in dev_str_list:
   45.22 +        (dom, b, d, f) = parse_pci_name(dev_str)
   45.23 +        try:
   45.24 +            dev = PciDevice(dom, b, d, f)
   45.25 +        except Exception, e:
    45.26 +            raise VmError("pci: failed to locate device and "+
    45.27 +                    "parse its resources - "+str(e))
   45.28 +        dev.do_FLR()
   45.29  
   45.30  class XendDomainInfo:
   45.31      """An object represents a domain.
   45.32 @@ -2386,44 +2408,34 @@ class XendDomainInfo:
   45.33          if self.domid is None:
   45.34              return
   45.35  
   45.36 +        from xen.xend import XendDomain
   45.37          log.debug("XendDomainInfo.destroy: domid=%s", str(self.domid))
   45.38  
   45.39          paths = self._prepare_phantom_paths()
   45.40  
   45.41          self._cleanupVm()
   45.42          if self.dompath is not None:
   45.43 -            self.destroyDomain()
   45.44 +            try:
   45.45 +                xc.domain_destroy_hook(self.domid)
   45.46 +                xc.domain_pause(self.domid)
   45.47 +                do_FLR(self.domid)
   45.48 +                xc.domain_destroy(self.domid)
   45.49 +                for state in DOM_STATES_OLD:
   45.50 +                    self.info[state] = 0
   45.51 +                self._stateSet(DOM_STATE_HALTED)
   45.52 +            except:
   45.53 +                log.exception("XendDomainInfo.destroy: domain destruction failed.")
   45.54 +
   45.55 +            XendDomain.instance().remove_domain(self)
   45.56 +            self.cleanupDomain()
   45.57  
   45.58          self._cleanup_phantom_devs(paths)
   45.59  
   45.60          if "transient" in self.info["other_config"] \
   45.61             and bool(self.info["other_config"]["transient"]):
   45.62 -            from xen.xend import XendDomain
   45.63              XendDomain.instance().domain_delete_by_dominfo(self)
   45.64  
   45.65  
   45.66 -    def destroyDomain(self):
   45.67 -        log.debug("XendDomainInfo.destroyDomain(%s)", str(self.domid))
   45.68 -
   45.69 -        paths = self._prepare_phantom_paths()
   45.70 -
   45.71 -        try:
   45.72 -            if self.domid is not None:
   45.73 -                xc.domain_destroy_hook(self.domid)
   45.74 -                xc.domain_destroy(self.domid)
   45.75 -                for state in DOM_STATES_OLD:
   45.76 -                    self.info[state] = 0
   45.77 -                self._stateSet(DOM_STATE_HALTED)
   45.78 -        except:
   45.79 -            log.exception("XendDomainInfo.destroy: xc.domain_destroy failed.")
   45.80 -
   45.81 -        from xen.xend import XendDomain
   45.82 -        XendDomain.instance().remove_domain(self)
   45.83 -
   45.84 -        self.cleanupDomain()
   45.85 -        self._cleanup_phantom_devs(paths)
   45.86 -
   45.87 -
   45.88      def resetDomain(self):
   45.89          log.debug("XendDomainInfo.resetDomain(%s)", str(self.domid))
   45.90  
    46.1 --- a/tools/python/xen/xend/image.py	Tue Sep 02 16:34:53 2008 -0700
    46.2 +++ b/tools/python/xen/xend/image.py	Tue Sep 02 16:55:55 2008 -0700
    46.3 @@ -637,8 +637,9 @@ class LinuxImageHandler(ImageHandler):
    46.4          log.debug("ramdisk        = %s", self.ramdisk)
    46.5          log.debug("vcpus          = %d", self.vm.getVCpuCount())
    46.6          log.debug("features       = %s", self.vm.getFeatures())
    46.7 +        log.debug("flags          = %d", self.flags)
    46.8          if arch.type == "ia64":
    46.9 -            log.debug("vhpt          = %d", self.flags)
   46.10 +            log.debug("vhpt          = %d", self.vhpt)
   46.11  
   46.12          return xc.linux_build(domid          = self.vm.getDomid(),
   46.13                                memsize        = mem_mb,
    47.1 --- a/tools/python/xen/xend/server/DevController.py	Tue Sep 02 16:34:53 2008 -0700
    47.2 +++ b/tools/python/xen/xend/server/DevController.py	Tue Sep 02 16:55:55 2008 -0700
    47.3 @@ -223,12 +223,6 @@ class DevController:
    47.4          raise VmError('%s devices may not be reconfigured' % self.deviceClass)
    47.5  
    47.6  
    47.7 -    def cleanupDeviceOnDomainDestroy(self, devid):
    47.8 -        """ Some devices may need special cleanup when the guest domain
    47.9 -            is destroyed.
   47.10 -        """
   47.11 -        return
   47.12 -
   47.13      def destroyDevice(self, devid, force):
   47.14          """Destroy the specified device.
   47.15  
   47.16 @@ -245,8 +239,6 @@ class DevController:
   47.17  
   47.18          dev = self.convertToDeviceNumber(devid)
   47.19  
   47.20 -        self.cleanupDeviceOnDomainDestroy(dev)
   47.21 -
   47.22          # Modify online status /before/ updating state (latter is watched by
   47.23          # drivers, so this ordering avoids a race).
   47.24          self.writeBackend(dev, 'online', "0")
    48.1 --- a/tools/python/xen/xend/server/pciif.py	Tue Sep 02 16:34:53 2008 -0700
    48.2 +++ b/tools/python/xen/xend/server/pciif.py	Tue Sep 02 16:55:55 2008 -0700
    48.3 @@ -286,7 +286,7 @@ class PciController(DevController):
    48.4                      )%(dev.name))
    48.5  
    48.6          if dev.has_non_page_aligned_bar and arch.type != "ia64":
    48.7 -            raise VmError("pci: %: non-page-aligned MMIO BAR found." % dev.name)
    48.8 +            raise VmError("pci: %s: non-page-aligned MMIO BAR found." % dev.name)
    48.9  
   48.10          self.CheckSiblingDevices(fe_domid, dev)
   48.11  
   48.12 @@ -383,10 +383,10 @@ class PciController(DevController):
   48.13              if (dev.dev_type == DEV_TYPE_PCIe_ENDPOINT) and not dev.pcie_flr:
   48.14                  if dev.bus == 0:
   48.15                      # We cope with this case by using the Dstate transition
   48.16 -                    # method for now.
    48.17 +                    # method or a vendor-specific method for now.
   48.18                      err_msg = 'pci: %s: it is on bus 0, but has no PCIe' +\
   48.19                          ' FLR Capability. Will try the Dstate transition'+\
   48.20 -                        ' method if available.'
    48.21 +                        ' method or a vendor-specific method if available.'
   48.22                      log.warn(err_msg % dev.name)
   48.23                  else:
   48.24                      funcs = dev.find_all_the_multi_functions()
   48.25 @@ -404,10 +404,11 @@ class PciController(DevController):
   48.26                  if dev.bus == 0 or arch.type == "ia64":
   48.27                      if not dev.pci_af_flr:
   48.28                          # We cope with this case by using the Dstate transition
   48.29 -                        # method for now.
    48.30 +                        # method or a vendor-specific method for now.
   48.31                          err_msg = 'pci: %s: it is on bus 0, but has no PCI' +\
   48.32                              ' Advanced Capabilities for FLR. Will try the'+\
   48.33 -                            ' Dstate transition method if available.'
    48.34 +                            ' Dstate transition method or a' +\
    48.35 +                            ' vendor-specific method if available.'
   48.36                          log.warn(err_msg % dev.name)
   48.37                  else:
   48.38                      # All devices behind the uppermost PCI/PCI-X bridge must be\
   48.39 @@ -543,22 +544,6 @@ class PciController(DevController):
   48.40  
   48.41          return new_num_devs
   48.42  
   48.43 -    def cleanupDeviceOnDomainDestroy(self, devid):
   48.44 -        num_devs = int(self.readBackend(devid, 'num_devs'))
   48.45 -        dev_str_list = []
   48.46 -        for i in range(num_devs):
   48.47 -            dev_str = self.readBackend(devid, 'dev-%i' % i)
   48.48 -            dev_str_list = dev_str_list + [dev_str]
   48.49 -
   48.50 -        for dev_str in dev_str_list:
   48.51 -            (dom, b, d, f) = parse_pci_name(dev_str)
   48.52 -            try:
   48.53 -                dev = PciDevice(dom, b, d, f)
   48.54 -            except Exception, e:
   48.55 -                raise VmError("pci: failed to locate device and "+
   48.56 -                        "parse it's resources - "+str(e))
   48.57 -            dev.do_FLR()
   48.58 -
   48.59      def waitForBackend(self,devid):
   48.60          return (0, "ok - no hotplug")
   48.61  
    49.1 --- a/tools/xenmon/Makefile	Tue Sep 02 16:34:53 2008 -0700
    49.2 +++ b/tools/xenmon/Makefile	Tue Sep 02 16:55:55 2008 -0700
    49.3 @@ -42,6 +42,6 @@ clean:
    49.4  
    49.5  
    49.6  %: %.c Makefile
    49.7 -	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $<
    49.8 +	$(CC) $(CFLAGS) $< $(LDFLAGS) -o $@
    49.9  xentrace_%: %.c Makefile
   49.10 -	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $<
   49.11 +	$(CC) $(CFLAGS) $< $(LDFLAGS) -o $@
    50.1 --- a/tools/xentrace/formats	Tue Sep 02 16:34:53 2008 -0700
    50.2 +++ b/tools/xentrace/formats	Tue Sep 02 16:55:55 2008 -0700
    50.3 @@ -23,8 +23,8 @@ 0x0002f00f  CPU%(cpu)d  %(tsc)d (+%(relt
    50.4  0x00081001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  VMENTRY     [ dom:vcpu = 0x%(1)08x ]
    50.5  0x00081002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  VMEXIT      [ dom:vcpu = 0x%(1)08x, exitcode = 0x%(2)08x, rIP  = 0x%(3)08x ]
    50.6  0x00081102  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  VMEXIT      [ dom:vcpu = 0x%(1)08x, exitcode = 0x%(2)08x, rIP  = 0x%(3)016x ]
    50.7 -0x00082001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_XEN      [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(3)02x, virt = 0x%(2)08x ]
    50.8 -0x00082101  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_XEN      [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(3)02x, virt = 0x%(2)016x ]
    50.9 +0x00082001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_XEN      [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(2)02x, virt = 0x%(3)08x ]
   50.10 +0x00082101  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_XEN      [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(2)02x, virt = 0x%(3)016x ]
   50.11  0x00082002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_INJECT   [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(2)02x, virt = 0x%(3)08x ]
   50.12  0x00082102  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_INJECT   [ dom:vcpu = 0x%(1)08x,  errorcode = 0x%(2)02x, virt = 0x%(3)016x ]
   50.13  0x00082003  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  INJ_EXC     [ dom:vcpu = 0x%(1)08x, vector = 0x%(2)02x, errorcode = 0x%(3)04x ]
    51.1 --- a/xen/Makefile	Tue Sep 02 16:34:53 2008 -0700
    51.2 +++ b/xen/Makefile	Tue Sep 02 16:55:55 2008 -0700
    51.3 @@ -1,8 +1,8 @@
    51.4  # This is the correct place to edit the build version.
    51.5  # All other places this is stored (eg. compile.h) should be autogenerated.
    51.6  export XEN_VERSION       = 3
    51.7 -export XEN_SUBVERSION    = 3
    51.8 -export XEN_EXTRAVERSION ?= .0-rc8-pre$(XEN_VENDORVERSION)
    51.9 +export XEN_SUBVERSION    = 4
   51.10 +export XEN_EXTRAVERSION ?= -unstable$(XEN_VENDORVERSION)
   51.11  export XEN_FULLVERSION   = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION)
   51.12  -include xen-version
   51.13  
   51.14 @@ -88,7 +88,7 @@ include/xen/compile.h: include/xen/compi
   51.15  	    -e 's/@@whoami@@/$(XEN_WHOAMI)/g' \
   51.16  	    -e 's/@@domain@@/$(XEN_DOMAIN)/g' \
   51.17  	    -e 's/@@hostname@@/$(shell hostname)/g' \
   51.18 -	    -e 's!@@compiler@@!$(shell $(CC) $(CFLAGS) -v 2>&1 | grep -i "gcc.*version")!g' \
   51.19 +	    -e 's!@@compiler@@!$(shell $(CC) $(CFLAGS) -v 2>&1 | tail -1)!g' \
   51.20  	    -e 's/@@version@@/$(XEN_VERSION)/g' \
   51.21  	    -e 's/@@subversion@@/$(XEN_SUBVERSION)/g' \
   51.22  	    -e 's/@@extraversion@@/$(XEN_EXTRAVERSION)/g' \
    52.1 --- a/xen/arch/ia64/xen/mm.c	Tue Sep 02 16:34:53 2008 -0700
    52.2 +++ b/xen/arch/ia64/xen/mm.c	Tue Sep 02 16:55:55 2008 -0700
    52.3 @@ -2698,6 +2698,20 @@ void put_page_type(struct page_info *pag
    52.4  }
    52.5  
    52.6  
    52.7 +static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
    52.8 +{
    52.9 +    struct page_info *page = mfn_to_page(page_nr);
   52.10 +
   52.11 +    if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) )
   52.12 +    {
   52.13 +        MEM_LOG("Could not get page ref for pfn %lx", page_nr);
   52.14 +        return 0;
   52.15 +    }
   52.16 +
   52.17 +    return 1;
   52.18 +}
   52.19 +
   52.20 +
   52.21  int get_page_type(struct page_info *page, u32 type)
   52.22  {
   52.23      u64 nx, x, y = page->u.inuse.type_info;
   52.24 @@ -2792,6 +2806,8 @@ int memory_is_conventional_ram(paddr_t p
   52.25  long
   52.26  arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
   52.27  {
   52.28 +    struct page_info *page = NULL;
   52.29 +
   52.30      switch (op) {
   52.31      case XENMEM_add_to_physmap:
   52.32      {
   52.33 @@ -2836,11 +2852,21 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
   52.34  
   52.35              spin_unlock(&d->grant_table->lock);
   52.36              break;
   52.37 +        case XENMAPSPACE_mfn:
   52.38 +        {
   52.39 +            if ( get_page_from_pagenr(xatp.idx, d) ) {
   52.40 +                mfn = xatp.idx;
   52.41 +                page = mfn_to_page(mfn);
   52.42 +            }
   52.43 +            break;
   52.44 +        }
   52.45          default:
   52.46              break;
   52.47          }
   52.48  
   52.49          if (mfn == 0) {
   52.50 +            if ( page )
   52.51 +                put_page(page);
   52.52              rcu_unlock_domain(d);
   52.53              return -EINVAL;
   52.54          }
   52.55 @@ -2872,12 +2898,54 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
   52.56  
   52.57      out:
   52.58          domain_unlock(d);
   52.59 -        
   52.60 +
   52.61 +        if ( page )
   52.62 +            put_page(page);
   52.63 +
   52.64          rcu_unlock_domain(d);
   52.65  
   52.66          break;
   52.67      }
   52.68  
   52.69 +    case XENMEM_remove_from_physmap:
   52.70 +    {
   52.71 +        struct xen_remove_from_physmap xrfp;
   52.72 +        unsigned long mfn;
   52.73 +        struct domain *d;
   52.74 +
   52.75 +        if ( copy_from_guest(&xrfp, arg, 1) )
   52.76 +            return -EFAULT;
   52.77 +
   52.78 +        if ( xrfp.domid == DOMID_SELF )
   52.79 +        {
   52.80 +            d = rcu_lock_current_domain();
   52.81 +        }
   52.82 +        else
   52.83 +        {
   52.84 +            if ( (d = rcu_lock_domain_by_id(xrfp.domid)) == NULL )
   52.85 +                return -ESRCH;
   52.86 +            if ( !IS_PRIV_FOR(current->domain, d) )
   52.87 +            {
   52.88 +                rcu_unlock_domain(d);
   52.89 +                return -EPERM;
   52.90 +            }
   52.91 +        }
   52.92 +
   52.93 +        domain_lock(d);
   52.94 +
   52.95 +        mfn = gmfn_to_mfn(d, xrfp.gpfn);
   52.96 +
   52.97 +        if ( mfn_valid(mfn) )
   52.98 +            guest_physmap_remove_page(d, xrfp.gpfn, mfn, 0);
   52.99 +
  52.100 +        domain_unlock(d);
  52.101 +
  52.102 +        rcu_unlock_domain(d);
  52.103 +
  52.104 +        break;
  52.105 +    }
  52.106 +
  52.107 +
  52.108      case XENMEM_machine_memory_map:
  52.109      {
  52.110          struct xen_memory_map memmap;
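
The XENMAPSPACE_mfn and XENMEM_remove_from_physmap additions above give ia64 the same physmap plumbing the x86 side already has. As a sketch of the guest-facing contract (field names match xen/include/public/memory.h; the HYPERVISOR_memory_op wrapper and header path are the usual guest-side ones, assumed here):

    #include <xen/interface/memory.h>  /* struct xen_remove_from_physmap */

    /* Drop the physmap entry backing one of our own guest pfns.
     * Returns 0 on success or a negative errno from the hypervisor. */
    static int unmap_own_gpfn(unsigned long gpfn)
    {
        struct xen_remove_from_physmap xrfp = {
            .domid = DOMID_SELF,  /* act on the calling domain */
            .gpfn  = gpfn,        /* guest pfn whose mapping to drop */
        };

        return HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrfp);
    }
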
    53.1 --- a/xen/arch/x86/acpi/power.c	Tue Sep 02 16:34:53 2008 -0700
    53.2 +++ b/xen/arch/x86/acpi/power.c	Tue Sep 02 16:55:55 2008 -0700
    53.3 @@ -24,6 +24,7 @@
    53.4  #include <xen/sched.h>
    53.5  #include <xen/domain.h>
    53.6  #include <xen/console.h>
    53.7 +#include <xen/iommu.h>
    53.8  #include <public/platform.h>
    53.9  #include <asm/tboot.h>
   53.10  
   53.11 @@ -41,6 +42,8 @@ void do_suspend_lowlevel(void);
   53.12  
   53.13  static int device_power_down(void)
   53.14  {
   53.15 +    iommu_suspend();
   53.16 +
   53.17      console_suspend();
   53.18  
   53.19      time_suspend();
   53.20 @@ -65,6 +68,8 @@ static void device_power_up(void)
   53.21      time_resume();
   53.22  
   53.23      console_resume();
   53.24 +
   53.25 +    iommu_resume();
   53.26  }
   53.27  
   53.28  static void freeze_domains(void)
    54.1 --- a/xen/arch/x86/cpu/amd.c	Tue Sep 02 16:34:53 2008 -0700
    54.2 +++ b/xen/arch/x86/cpu/amd.c	Tue Sep 02 16:55:55 2008 -0700
    54.3 @@ -10,10 +10,144 @@
    54.4  #include <asm/hvm/support.h>
    54.5  
    54.6  #include "cpu.h"
    54.7 +#include "amd.h"
    54.8  
    54.9  int start_svm(struct cpuinfo_x86 *c);
   54.10  
   54.11  /*
    54.12 + * Pre-canned values for overriding the CPUID feature
    54.13 + * and extended feature masks.
   54.14 + *
   54.15 + * Currently supported processors:
   54.16 + * 
   54.17 + * "fam_0f_rev_c"
   54.18 + * "fam_0f_rev_d"
   54.19 + * "fam_0f_rev_e"
   54.20 + * "fam_0f_rev_f"
   54.21 + * "fam_0f_rev_g"
   54.22 + * "fam_10_rev_b"
   54.23 + * "fam_10_rev_c"
   54.24 + * "fam_11_rev_b"
   54.25 + */
   54.26 +static char opt_famrev[14];
   54.27 +string_param("cpuid_mask_cpu", opt_famrev);
   54.28 +
   54.29 +/* Finer-grained CPUID feature control. */
   54.30 +static unsigned int opt_cpuid_mask_ecx, opt_cpuid_mask_edx;
   54.31 +integer_param("cpuid_mask_ecx", opt_cpuid_mask_ecx);
   54.32 +integer_param("cpuid_mask_edx", opt_cpuid_mask_edx);
   54.33 +static unsigned int opt_cpuid_mask_ext_ecx, opt_cpuid_mask_ext_edx;
    54.34 +integer_param("cpuid_mask_ext_ecx", opt_cpuid_mask_ext_ecx);
    54.35 +integer_param("cpuid_mask_ext_edx", opt_cpuid_mask_ext_edx);
   54.36 +
   54.37 +static inline void wrmsr_amd(unsigned int index, unsigned int lo, 
   54.38 +		unsigned int hi)
   54.39 +{
   54.40 +	asm volatile (
   54.41 +		"wrmsr"
   54.42 +		: /* No outputs */
   54.43 +		: "c" (index), "a" (lo), 
   54.44 +		"d" (hi), "D" (0x9c5a203a)
   54.45 +	);
   54.46 +}
   54.47 +
   54.48 +/*
   54.49 + * Mask the features and extended features returned by CPUID.  Parameters are
   54.50 + * set from the boot line via two methods:
   54.51 + *
   54.52 + *   1) Specific processor revision string
   54.53 + *   2) User-defined masks
   54.54 + *
    54.55 + * The processor revision string parameter has precedence.
   54.56 + */
   54.57 +static void __devinit set_cpuidmask(struct cpuinfo_x86 *c)
   54.58 +{
   54.59 +	static unsigned int feat_ecx, feat_edx;
   54.60 +	static unsigned int extfeat_ecx, extfeat_edx;
   54.61 +	static enum { not_parsed, no_mask, set_mask } status;
   54.62 +
   54.63 +	if (status == no_mask)
   54.64 +		return;
   54.65 +
   54.66 +	if (status == set_mask)
   54.67 +		goto setmask;
   54.68 +
   54.69 +	ASSERT((status == not_parsed) && (smp_processor_id() == 0));
   54.70 +	status = no_mask;
   54.71 +
   54.72 +	if (opt_cpuid_mask_ecx | opt_cpuid_mask_edx |
   54.73 +	    opt_cpuid_mask_ext_ecx | opt_cpuid_mask_ext_edx) {
   54.74 +		feat_ecx = opt_cpuid_mask_ecx ? : ~0U;
   54.75 +		feat_edx = opt_cpuid_mask_edx ? : ~0U;
   54.76 +		extfeat_ecx = opt_cpuid_mask_ext_ecx ? : ~0U;
   54.77 +		extfeat_edx = opt_cpuid_mask_ext_edx ? : ~0U;
   54.78 +	} else if (*opt_famrev == '\0') {
   54.79 +		return;
   54.80 +	} else if (!strcmp(opt_famrev, "fam_0f_rev_c")) {
   54.81 +		feat_ecx = AMD_FEATURES_K8_REV_C_ECX;
   54.82 +		feat_edx = AMD_FEATURES_K8_REV_C_EDX;
   54.83 +		extfeat_ecx = AMD_EXTFEATURES_K8_REV_C_ECX;
   54.84 +		extfeat_edx = AMD_EXTFEATURES_K8_REV_C_EDX;
   54.85 +	} else if (!strcmp(opt_famrev, "fam_0f_rev_d")) {
   54.86 +		feat_ecx = AMD_FEATURES_K8_REV_D_ECX;
   54.87 +		feat_edx = AMD_FEATURES_K8_REV_D_EDX;
   54.88 +		extfeat_ecx = AMD_EXTFEATURES_K8_REV_D_ECX;
   54.89 +		extfeat_edx = AMD_EXTFEATURES_K8_REV_D_EDX;
   54.90 +	} else if (!strcmp(opt_famrev, "fam_0f_rev_e")) {
   54.91 +		feat_ecx = AMD_FEATURES_K8_REV_E_ECX;
   54.92 +		feat_edx = AMD_FEATURES_K8_REV_E_EDX;
   54.93 +		extfeat_ecx = AMD_EXTFEATURES_K8_REV_E_ECX;
   54.94 +		extfeat_edx = AMD_EXTFEATURES_K8_REV_E_EDX;
   54.95 +	} else if (!strcmp(opt_famrev, "fam_0f_rev_f")) {
   54.96 +		feat_ecx = AMD_FEATURES_K8_REV_F_ECX;
   54.97 +		feat_edx = AMD_FEATURES_K8_REV_F_EDX;
   54.98 +		extfeat_ecx = AMD_EXTFEATURES_K8_REV_F_ECX;
   54.99 +		extfeat_edx = AMD_EXTFEATURES_K8_REV_F_EDX;
  54.100 +	} else if (!strcmp(opt_famrev, "fam_0f_rev_g")) {
  54.101 +		feat_ecx = AMD_FEATURES_K8_REV_G_ECX;
  54.102 +		feat_edx = AMD_FEATURES_K8_REV_G_EDX;
  54.103 +		extfeat_ecx = AMD_EXTFEATURES_K8_REV_G_ECX;
  54.104 +		extfeat_edx = AMD_EXTFEATURES_K8_REV_G_EDX;
  54.105 +	} else if (!strcmp(opt_famrev, "fam_10_rev_b")) {
  54.106 +		feat_ecx = AMD_FEATURES_FAM10h_REV_B_ECX;
  54.107 +		feat_edx = AMD_FEATURES_FAM10h_REV_B_EDX;
  54.108 +		extfeat_ecx = AMD_EXTFEATURES_FAM10h_REV_B_ECX;
  54.109 +		extfeat_edx = AMD_EXTFEATURES_FAM10h_REV_B_EDX;
  54.110 +	} else if (!strcmp(opt_famrev, "fam_10_rev_c")) {
  54.111 +		feat_ecx = AMD_FEATURES_FAM10h_REV_C_ECX;
  54.112 +		feat_edx = AMD_FEATURES_FAM10h_REV_C_EDX;
  54.113 +		extfeat_ecx = AMD_EXTFEATURES_FAM10h_REV_C_ECX;
  54.114 +		extfeat_edx = AMD_EXTFEATURES_FAM10h_REV_C_EDX;
  54.115 +	} else if (!strcmp(opt_famrev, "fam_11_rev_b")) {
  54.116 +		feat_ecx = AMD_FEATURES_FAM11h_REV_B_ECX;
  54.117 +		feat_edx = AMD_FEATURES_FAM11h_REV_B_EDX;
  54.118 +		extfeat_ecx = AMD_EXTFEATURES_FAM11h_REV_B_ECX;
  54.119 +		extfeat_edx = AMD_EXTFEATURES_FAM11h_REV_B_EDX;
  54.120 +	} else {
  54.121 +		printk("Invalid processor string: %s\n", opt_famrev);
  54.122 +		printk("CPUID will not be masked\n");
  54.123 +		return;
  54.124 +	}
  54.125 +
  54.126 +	status = set_mask;
  54.127 +	printk("Writing CPUID feature mask ECX:EDX -> %08Xh:%08Xh\n", 
  54.128 +	       feat_ecx, feat_edx);
  54.129 +	printk("Writing CPUID extended feature mask ECX:EDX -> %08Xh:%08Xh\n", 
  54.130 +	       extfeat_ecx, extfeat_edx);
  54.131 +
  54.132 + setmask:
  54.133 +	/* FIXME check if processor supports CPUID masking */
  54.134 +	/* AMD processors prior to family 10h required a 32-bit password */
  54.135 +	if (c->x86 >= 0x10) {
  54.136 +		wrmsr(MSR_K8_FEATURE_MASK, feat_edx, feat_ecx);
  54.137 +		wrmsr(MSR_K8_EXT_FEATURE_MASK, extfeat_edx, extfeat_ecx);
  54.138 +	} else if (c->x86 == 0x0f) {
  54.139 +		wrmsr_amd(MSR_K8_FEATURE_MASK, feat_edx, feat_ecx);
  54.140 +		wrmsr_amd(MSR_K8_EXT_FEATURE_MASK, extfeat_edx, extfeat_ecx);
  54.141 +	}
  54.142 +}
  54.143 +
  54.144 +/*
  54.145   * amd_flush_filter={on,off}. Forcibly Enable or disable the TLB flush
  54.146   * filter on AMD 64-bit processors.
  54.147   */
  54.148 @@ -115,7 +249,7 @@ static void check_disable_c1e(unsigned i
  54.149  		on_each_cpu(disable_c1e, NULL, 1, 1);
  54.150  }
  54.151  
  54.152 -static void __init init_amd(struct cpuinfo_x86 *c)
  54.153 +static void __devinit init_amd(struct cpuinfo_x86 *c)
  54.154  {
  54.155  	u32 l, h;
  54.156  	int mbytes = num_physpages >> (20-PAGE_SHIFT);
  54.157 @@ -368,6 +502,8 @@ static void __init init_amd(struct cpuin
  54.158  	if ((smp_processor_id() == 1) && c1_ramping_may_cause_clock_drift(c))
  54.159  		disable_c1_ramping();
  54.160  
  54.161 +	set_cpuidmask(c);
  54.162 +
  54.163  	start_svm(c);
  54.164  }
  54.165  
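
Taken together, the options registered above are driven from the Xen boot line. A usage sketch (the option names are exactly the ones registered; the mask values are illustrative only -- a clear bit hides the corresponding feature, and an unspecified mask defaults to ~0, i.e. no masking):

    # Make a newer host advertise only family-0Fh-rev-F features, for
    # AMD-V Extended Migration across a mixed pool:
    kernel /xen.gz cpuid_mask_cpu=fam_0f_rev_f

    # Or supply raw masks directly instead of a revision string:
    kernel /xen.gz cpuid_mask_ecx=0x00002001 cpuid_mask_edx=0xffffffff
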
    55.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    55.2 +++ b/xen/arch/x86/cpu/amd.h	Tue Sep 02 16:55:55 2008 -0700
    55.3 @@ -0,0 +1,103 @@
    55.4 +/*
    55.5 + * amd.h - AMD processor specific definitions
    55.6 + */
    55.7 +
    55.8 +#ifndef __AMD_H__
    55.9 +#define __AMD_H__
   55.10 +
   55.11 +#include <asm/cpufeature.h>
   55.12 +
   55.13 +/* CPUID masked for use by AMD-V Extended Migration */
   55.14 +
   55.15 +#define X86_FEATURE_BITPOS(_feature_) ((_feature_) % 32)
   55.16 +#define __bit(_x_) (1U << X86_FEATURE_BITPOS(_x_))
   55.17 +
   55.18 +/* Family 0Fh, Revision C */
   55.19 +#define AMD_FEATURES_K8_REV_C_ECX  0
   55.20 +#define AMD_FEATURES_K8_REV_C_EDX (					\
   55.21 +	__bit(X86_FEATURE_FPU)      | __bit(X86_FEATURE_VME)   |	\
   55.22 +	__bit(X86_FEATURE_DE)       | __bit(X86_FEATURE_PSE)   |	\
   55.23 +	__bit(X86_FEATURE_TSC)      | __bit(X86_FEATURE_MSR)   |	\
   55.24 +	__bit(X86_FEATURE_PAE)      | __bit(X86_FEATURE_MCE)   |	\
   55.25 +	__bit(X86_FEATURE_CX8)      | __bit(X86_FEATURE_APIC)  |	\
   55.26 +	__bit(X86_FEATURE_SEP)      | __bit(X86_FEATURE_MTRR)  |	\
   55.27 +	__bit(X86_FEATURE_PGE)      | __bit(X86_FEATURE_MCA)   | 	\
   55.28 +	__bit(X86_FEATURE_CMOV)     | __bit(X86_FEATURE_PAT)   |	\
   55.29 +	__bit(X86_FEATURE_PSE36)    | __bit(X86_FEATURE_CLFLSH)|	\
   55.30 +	__bit(X86_FEATURE_MMX)      | __bit(X86_FEATURE_FXSR)  | 	\
   55.31 +	__bit(X86_FEATURE_XMM)      | __bit(X86_FEATURE_XMM2))
   55.32 +#define AMD_EXTFEATURES_K8_REV_C_ECX  0 
   55.33 +#define AMD_EXTFEATURES_K8_REV_C_EDX  (					\
   55.34 +	__bit(X86_FEATURE_FPU)      | __bit(X86_FEATURE_VME)   |	\
   55.35 +	__bit(X86_FEATURE_DE)       | __bit(X86_FEATURE_PSE)   |	\
   55.36 +	__bit(X86_FEATURE_TSC)      | __bit(X86_FEATURE_MSR)   |	\
   55.37 +	__bit(X86_FEATURE_PAE)      | __bit(X86_FEATURE_MCE)   |	\
   55.38 +	__bit(X86_FEATURE_CX8)      | __bit(X86_FEATURE_APIC)  |	\
   55.39 +	__bit(X86_FEATURE_SYSCALL)  | __bit(X86_FEATURE_MTRR)  |	\
   55.40 +	__bit(X86_FEATURE_PGE)      | __bit(X86_FEATURE_MCA)   |	\
   55.41 +	__bit(X86_FEATURE_CMOV)     | __bit(X86_FEATURE_PAT)   |	\
   55.42 +	__bit(X86_FEATURE_PSE36)    | __bit(X86_FEATURE_NX)    |	\
   55.43 +	__bit(X86_FEATURE_MMXEXT)   | __bit(X86_FEATURE_MMX)   |	\
   55.44 +	__bit(X86_FEATURE_FXSR)     | __bit(X86_FEATURE_LM)    |	\
   55.45 +	__bit(X86_FEATURE_3DNOWEXT) | __bit(X86_FEATURE_3DNOW))
   55.46 +
   55.47 +/* Family 0Fh, Revision D */
   55.48 +#define AMD_FEATURES_K8_REV_D_ECX         AMD_FEATURES_K8_REV_C_ECX
   55.49 +#define AMD_FEATURES_K8_REV_D_EDX         AMD_FEATURES_K8_REV_C_EDX
   55.50 +#define AMD_EXTFEATURES_K8_REV_D_ECX     (AMD_EXTFEATURES_K8_REV_C_ECX |\
   55.51 +	__bit(X86_FEATURE_LAHF_LM))
   55.52 +#define AMD_EXTFEATURES_K8_REV_D_EDX     (AMD_EXTFEATURES_K8_REV_C_EDX |\
   55.53 +	__bit(X86_FEATURE_FFXSR))
   55.54 +
   55.55 +/* Family 0Fh, Revision E */
   55.56 +#define AMD_FEATURES_K8_REV_E_ECX        (AMD_FEATURES_K8_REV_D_ECX |	\
   55.57 +	__bit(X86_FEATURE_XMM3))
   55.58 +#define AMD_FEATURES_K8_REV_E_EDX        (AMD_FEATURES_K8_REV_D_EDX | 	\
   55.59 +	__bit(X86_FEATURE_HT))
   55.60 +#define AMD_EXTFEATURES_K8_REV_E_ECX     (AMD_EXTFEATURES_K8_REV_D_ECX |\
   55.61 +	__bit(X86_FEATURE_CMP_LEGACY)) 
   55.62 +#define AMD_EXTFEATURES_K8_REV_E_EDX      AMD_EXTFEATURES_K8_REV_D_EDX
   55.63 +
   55.64 +/* Family 0Fh, Revision F */
   55.65 +#define AMD_FEATURES_K8_REV_F_ECX        (AMD_FEATURES_K8_REV_E_ECX | 	\
   55.66 +	__bit(X86_FEATURE_CX16))
   55.67 +#define AMD_FEATURES_K8_REV_F_EDX         AMD_FEATURES_K8_REV_E_EDX
   55.68 +#define AMD_EXTFEATURES_K8_REV_F_ECX     (AMD_EXTFEATURES_K8_REV_E_ECX |\
   55.69 +	__bit(X86_FEATURE_SVME) | __bit(X86_FEATURE_EXTAPICSPACE) |	\
   55.70 +	__bit(X86_FEATURE_ALTMOVCR))
   55.71 +#define AMD_EXTFEATURES_K8_REV_F_EDX     (AMD_EXTFEATURES_K8_REV_E_EDX |\
   55.72 +	__bit(X86_FEATURE_RDTSCP))
   55.73 +
   55.74 +/* Family 0Fh, Revision G */
   55.75 +#define AMD_FEATURES_K8_REV_G_ECX         AMD_FEATURES_K8_REV_F_ECX
   55.76 +#define AMD_FEATURES_K8_REV_G_EDX         AMD_FEATURES_K8_REV_F_EDX
   55.77 +#define AMD_EXTFEATURES_K8_REV_G_ECX     (AMD_EXTFEATURES_K8_REV_F_ECX |\
   55.78 +	__bit(X86_FEATURE_3DNOWPF))
   55.79 +#define AMD_EXTFEATURES_K8_REV_G_EDX      AMD_EXTFEATURES_K8_REV_F_EDX
   55.80 +
   55.81 +/* Family 10h, Revision B */
   55.82 +#define AMD_FEATURES_FAM10h_REV_B_ECX    (AMD_FEATURES_K8_REV_F_ECX | 	\
   55.83 +	__bit(X86_FEATURE_POPCNT) | __bit(X86_FEATURE_MWAIT))
   55.84 +#define AMD_FEATURES_FAM10h_REV_B_EDX     AMD_FEATURES_K8_REV_F_EDX
   55.85 +#define AMD_EXTFEATURES_FAM10h_REV_B_ECX (AMD_EXTFEATURES_K8_REV_F_ECX |\
   55.86 +	__bit(X86_FEATURE_ABM) | __bit(X86_FEATURE_SSE4A) | 		\
   55.87 +	__bit(X86_FEATURE_MISALIGNSSE) | __bit(X86_FEATURE_OSVW) | 	\
   55.88 +	__bit(X86_FEATURE_IBS))
   55.89 +#define AMD_EXTFEATURES_FAM10h_REV_B_EDX (AMD_EXTFEATURES_K8_REV_F_EDX |\
   55.90 +	__bit(X86_FEATURE_PAGE1GB))
   55.91 +
   55.92 +/* Family 10h, Revision C */
   55.93 +#define AMD_FEATURES_FAM10h_REV_C_ECX     AMD_FEATURES_FAM10h_REV_B_ECX
   55.94 +#define AMD_FEATURES_FAM10h_REV_C_EDX     AMD_FEATURES_FAM10h_REV_B_EDX
   55.95 +#define AMD_EXTFEATURES_FAM10h_REV_C_ECX (AMD_EXTFEATURES_FAM10h_REV_B_ECX |\
   55.96 +	__bit(X86_FEATURE_SKINIT) | __bit(X86_FEATURE_WDT))
   55.97 +#define AMD_EXTFEATURES_FAM10h_REV_C_EDX  AMD_EXTFEATURES_FAM10h_REV_B_EDX
   55.98 +
   55.99 +/* Family 11h, Revision B */
  55.100 +#define AMD_FEATURES_FAM11h_REV_B_ECX     AMD_FEATURES_K8_REV_G_ECX
  55.101 +#define AMD_FEATURES_FAM11h_REV_B_EDX     AMD_FEATURES_K8_REV_G_EDX
  55.102 +#define AMD_EXTFEATURES_FAM11h_REV_B_ECX (AMD_EXTFEATURES_K8_REV_G_ECX |\
  55.103 +	__bit(X86_FEATURE_SKINIT))
  55.104 +#define AMD_EXTFEATURES_FAM11h_REV_B_EDX  AMD_EXTFEATURES_K8_REV_G_EDX
  55.105 +
  55.106 +#endif /* __AMD_H__ */
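
The per-revision masks are composed purely by OR-ing single feature bits, with __bit() folding a feature index down to its position within its own 32-bit CPUID word. A compile-time sanity check sketching both properties (BUILD_BUG_ON as used elsewhere in the tree; LAHF/LM is bit 0 of the extended-feature ECX word):

    /* __bit() is word-relative: only the index modulo 32 matters. */
    BUILD_BUG_ON(__bit(X86_FEATURE_LAHF_LM) != (1U << 0));

    /* Rev D's extended ECX mask is exactly Rev C's plus LAHF/LM. */
    BUILD_BUG_ON(AMD_EXTFEATURES_K8_REV_D_ECX !=
                 (AMD_EXTFEATURES_K8_REV_C_ECX | __bit(X86_FEATURE_LAHF_LM)));
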
    56.1 --- a/xen/arch/x86/domain.c	Tue Sep 02 16:34:53 2008 -0700
    56.2 +++ b/xen/arch/x86/domain.c	Tue Sep 02 16:55:55 2008 -0700
    56.3 @@ -1645,23 +1645,26 @@ static int relinquish_memory(
    56.4  
    56.5          /*
    56.6           * Forcibly invalidate top-most, still valid page tables at this point
    56.7 -         * to break circular 'linear page table' references. This is okay
    56.8 -         * because MMU structures are not shared across domains and this domain
    56.9 -         * is now dead. Thus top-most valid tables are not in use so a non-zero
   56.10 -         * count means circular reference.
   56.11 +         * to break circular 'linear page table' references as well as clean up
   56.12 +         * partially validated pages. This is okay because MMU structures are
    56.13 +         * not shared across domains and this domain is now dead. Thus top-most
    56.14 +         * valid tables are not in use, so a non-zero count means a circular
    56.15 +         * reference or a partially validated page.
   56.16           */
   56.17          y = page->u.inuse.type_info;
   56.18          for ( ; ; )
   56.19          {
   56.20              x = y;
   56.21 -            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
   56.22 -                        (type|PGT_validated)) )
   56.23 +            if ( likely((x & PGT_type_mask) != type) ||
   56.24 +                 likely(!(x & (PGT_validated|PGT_partial))) )
   56.25                  break;
   56.26  
   56.27 -            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
   56.28 +            y = cmpxchg(&page->u.inuse.type_info, x,
   56.29 +                        x & ~(PGT_validated|PGT_partial));
   56.30              if ( likely(y == x) )
   56.31              {
   56.32 -                free_page_type(page, type);
   56.33 +                if ( free_page_type(page, x, 0) != 0 )
   56.34 +                    BUG();
   56.35                  break;
   56.36              }
   56.37          }
    57.1 --- a/xen/arch/x86/hvm/emulate.c	Tue Sep 02 16:34:53 2008 -0700
    57.2 +++ b/xen/arch/x86/hvm/emulate.c	Tue Sep 02 16:55:55 2008 -0700
    57.3 @@ -571,11 +571,12 @@ static int hvmemul_rep_movs(
    57.4  {
    57.5      struct hvm_emulate_ctxt *hvmemul_ctxt =
    57.6          container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
    57.7 -    unsigned long saddr, daddr;
    57.8 +    unsigned long saddr, daddr, bytes;
    57.9      paddr_t sgpa, dgpa;
   57.10      uint32_t pfec = PFEC_page_present;
   57.11      p2m_type_t p2mt;
   57.12 -    int rc;
   57.13 +    int rc, df = !!(ctxt->regs->eflags & X86_EFLAGS_DF);
   57.14 +    char *buf;
   57.15  
   57.16      rc = hvmemul_virtual_to_linear(
   57.17          src_seg, src_offset, bytes_per_rep, reps, hvm_access_read,
   57.18 @@ -606,15 +607,56 @@ static int hvmemul_rep_movs(
   57.19      (void)gfn_to_mfn_current(sgpa >> PAGE_SHIFT, &p2mt);
   57.20      if ( !p2m_is_ram(p2mt) )
   57.21          return hvmemul_do_mmio(
   57.22 -            sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ,
   57.23 -            !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
   57.24 +            sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ, df, NULL);
   57.25  
   57.26      (void)gfn_to_mfn_current(dgpa >> PAGE_SHIFT, &p2mt);
   57.27 -    if ( p2m_is_ram(p2mt) )
   57.28 +    if ( !p2m_is_ram(p2mt) )
   57.29 +        return hvmemul_do_mmio(
   57.30 +            dgpa, reps, bytes_per_rep, sgpa, IOREQ_WRITE, df, NULL);
   57.31 +
   57.32 +    /* RAM-to-RAM copy: emulate as equivalent of memmove(dgpa, sgpa, bytes). */
   57.33 +    bytes = *reps * bytes_per_rep;
   57.34 +
   57.35 +    /* Adjust source address for reverse copy. */
   57.36 +    if ( df )
   57.37 +        sgpa -= bytes - bytes_per_rep;
   57.38 +
   57.39 +    /*
    57.40 +     * Does the first iteration's copy fall within the source range? If not,
    57.41 +     * the entire copy cannot corrupt itself. If it does, the operation is more
    57.42 +     * complex than a source-to-buffer-to-destination block copy can emulate.
   57.43 +     */
   57.44 +    if ( ((dgpa + bytes_per_rep) > sgpa) && (dgpa < (sgpa + bytes)) )
   57.45          return X86EMUL_UNHANDLEABLE;
   57.46 -    return hvmemul_do_mmio(
   57.47 -        dgpa, reps, bytes_per_rep, sgpa, IOREQ_WRITE,
   57.48 -        !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
   57.49 +
   57.50 +    /* Adjust destination address for reverse copy. */
   57.51 +    if ( df )
   57.52 +        dgpa -= bytes - bytes_per_rep;
   57.53 +
   57.54 +    /* Allocate temporary buffer. Fall back to slow emulation if this fails. */
   57.55 +    buf = xmalloc_bytes(bytes);
   57.56 +    if ( buf == NULL )
   57.57 +        return X86EMUL_UNHANDLEABLE;
   57.58 +
   57.59 +    /*
   57.60 +     * We do a modicum of checking here, just for paranoia's sake and to
    57.61 +     * avoid copying an uninitialised buffer into guest address space.
   57.62 +     */
   57.63 +    rc = hvm_copy_from_guest_phys(buf, sgpa, bytes);
   57.64 +    if ( rc == HVMCOPY_okay )
   57.65 +        rc = hvm_copy_to_guest_phys(dgpa, buf, bytes);
   57.66 +
   57.67 +    xfree(buf);
   57.68 +
   57.69 +    if ( rc != HVMCOPY_okay )
   57.70 +    {
   57.71 +        gdprintk(XENLOG_WARNING, "Failed memory-to-memory REP MOVS: sgpa=%"
   57.72 +                 PRIpaddr" dgpa=%"PRIpaddr" reps=%lu bytes_per_rep=%u\n",
   57.73 +                 sgpa, dgpa, *reps, bytes_per_rep);
   57.74 +        return X86EMUL_UNHANDLEABLE;
   57.75 +    }
   57.76 +
   57.77 +    return X86EMUL_OKAY;
   57.78  }
   57.79  
   57.80  static int hvmemul_read_segment(
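
The self-overlap test above is the crux of the new RAM-to-RAM path: the buffered copy is only unsafe when the very first element written at dgpa could land inside the live source window [sgpa, sgpa + bytes). Restated as a standalone predicate (a sketch using the same names; not part of the patch):

    #include <stdbool.h>
    #include <stdint.h>

    /* True if a REP MOVS from sgpa to dgpa could observe its own stores,
     * in which case a source-to-buffer-to-destination copy is not
     * equivalent to the architected element-by-element copy. */
    static bool rep_movs_self_overlaps(uint64_t sgpa, uint64_t dgpa,
                                       uint64_t bytes_per_rep, uint64_t bytes)
    {
        return (dgpa + bytes_per_rep > sgpa) && (dgpa < sgpa + bytes);
    }

For example, sgpa=0x1000 and dgpa=0x1004 with 4-byte reps over 0x100 bytes overlaps (and so falls back to slow emulation), while dgpa=0x2000 does not.
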
    58.1 --- a/xen/arch/x86/irq.c	Tue Sep 02 16:34:53 2008 -0700
    58.2 +++ b/xen/arch/x86/irq.c	Tue Sep 02 16:55:55 2008 -0700
    58.3 @@ -63,7 +63,9 @@ asmlinkage void do_IRQ(struct cpu_user_r
    58.4  
    58.5      if ( likely(desc->status & IRQ_GUEST) )
    58.6      {
    58.7 +        irq_enter();
    58.8          __do_IRQ_guest(vector);
    58.9 +        irq_exit();
   58.10          spin_unlock(&desc->lock);
   58.11          return;
   58.12      }
    59.1 --- a/xen/arch/x86/microcode.c	Tue Sep 02 16:34:53 2008 -0700
    59.2 +++ b/xen/arch/x86/microcode.c	Tue Sep 02 16:55:55 2008 -0700
    59.3 @@ -124,7 +124,7 @@ static DEFINE_SPINLOCK(microcode_update_
    59.4  /* no concurrent ->write()s are allowed on /dev/cpu/microcode */
    59.5  static DEFINE_MUTEX(microcode_mutex);
    59.6  
    59.7 -static void __user *user_buffer;	/* user area microcode data buffer */
    59.8 +static const void __user *user_buffer;	/* user area microcode data buffer */
     59.9  static unsigned int user_buffer_size;	/* its size */
   59.10  
   59.11  typedef enum mc_error_code {
   59.12 @@ -455,7 +455,7 @@ out:
   59.13  	return error;
   59.14  }
   59.15  
   59.16 -int microcode_update(XEN_GUEST_HANDLE(void) buf, unsigned long len)
   59.17 +int microcode_update(XEN_GUEST_HANDLE(const_void) buf, unsigned long len)
   59.18  {
   59.19  	int ret;
   59.20  
    60.1 --- a/xen/arch/x86/mm.c	Tue Sep 02 16:34:53 2008 -0700
    60.2 +++ b/xen/arch/x86/mm.c	Tue Sep 02 16:55:55 2008 -0700
    60.3 @@ -507,11 +507,11 @@ static int alloc_segdesc_page(struct pag
    60.4              goto fail;
    60.5  
    60.6      unmap_domain_page(descs);
    60.7 -    return 1;
    60.8 +    return 0;
    60.9  
   60.10   fail:
   60.11      unmap_domain_page(descs);
   60.12 -    return 0;
   60.13 +    return -EINVAL;
   60.14  }
   60.15  
   60.16  
   60.17 @@ -565,20 +565,23 @@ static int get_page_from_pagenr(unsigned
   60.18  
   60.19  static int get_page_and_type_from_pagenr(unsigned long page_nr, 
   60.20                                           unsigned long type,
   60.21 -                                         struct domain *d)
   60.22 +                                         struct domain *d,
   60.23 +                                         int preemptible)
   60.24  {
   60.25      struct page_info *page = mfn_to_page(page_nr);
   60.26 +    int rc;
   60.27  
   60.28      if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
   60.29 -        return 0;
   60.30 -
   60.31 -    if ( unlikely(!get_page_type(page, type)) )
   60.32 -    {
   60.33 +        return -EINVAL;
   60.34 +
   60.35 +    rc = (preemptible ?
   60.36 +          get_page_type_preemptible(page, type) :
   60.37 +          (get_page_type(page, type) ? 0 : -EINVAL));
   60.38 +
   60.39 +    if ( rc )
   60.40          put_page(page);
   60.41 -        return 0;
   60.42 -    }
   60.43 -
   60.44 -    return 1;
   60.45 +
   60.46 +    return rc;
   60.47  }
   60.48  
   60.49  /*
   60.50 @@ -754,22 +757,23 @@ get_page_from_l2e(
   60.51      if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
   60.52      {
   60.53          MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
   60.54 -        return 0;
   60.55 +        return -EINVAL;
   60.56      }
   60.57  
   60.58 -    rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
   60.59 -    if ( unlikely(!rc) )
   60.60 -        rc = get_l2_linear_pagetable(l2e, pfn, d);
   60.61 +    rc = get_page_and_type_from_pagenr(
   60.62 +        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
   60.63 +    if ( unlikely(rc) && rc != -EAGAIN &&
   60.64 +         get_l2_linear_pagetable(l2e, pfn, d) )
   60.65 +        rc = -EINVAL;
   60.66  
   60.67      return rc;
   60.68  }
   60.69  
   60.70  
   60.71 -#if CONFIG_PAGING_LEVELS >= 3
   60.72  define_get_linear_pagetable(l3);
   60.73  static int
   60.74  get_page_from_l3e(
   60.75 -    l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
   60.76 +    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible)
   60.77  {
   60.78      int rc;
   60.79  
   60.80 @@ -779,22 +783,23 @@ get_page_from_l3e(
   60.81      if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
   60.82      {
   60.83          MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
   60.84 -        return 0;
   60.85 +        return -EINVAL;
   60.86      }
   60.87  
   60.88 -    rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
   60.89 -    if ( unlikely(!rc) )
   60.90 -        rc = get_l3_linear_pagetable(l3e, pfn, d);
   60.91 +    rc = get_page_and_type_from_pagenr(
   60.92 +        l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible);
   60.93 +    if ( unlikely(rc) && rc != -EAGAIN && rc != -EINTR &&
   60.94 +         get_l3_linear_pagetable(l3e, pfn, d) )
   60.95 +        rc = -EINVAL;
   60.96  
   60.97      return rc;
   60.98  }
   60.99 -#endif /* 3 level */
  60.100  
  60.101  #if CONFIG_PAGING_LEVELS >= 4
  60.102  define_get_linear_pagetable(l4);
  60.103  static int
  60.104  get_page_from_l4e(
  60.105 -    l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
  60.106 +    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible)
  60.107  {
  60.108      int rc;
  60.109  
  60.110 @@ -804,12 +809,14 @@ get_page_from_l4e(
  60.111      if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
  60.112      {
  60.113          MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
  60.114 -        return 0;
  60.115 +        return -EINVAL;
  60.116      }
  60.117  
  60.118 -    rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
  60.119 -    if ( unlikely(!rc) )
  60.120 -        rc = get_l4_linear_pagetable(l4e, pfn, d);
  60.121 +    rc = get_page_and_type_from_pagenr(
  60.122 +        l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible);
  60.123 +    if ( unlikely(rc) && rc != -EAGAIN && rc != -EINTR &&
  60.124 +         get_l4_linear_pagetable(l4e, pfn, d) )
  60.125 +        rc = -EINVAL;
  60.126  
  60.127      return rc;
  60.128  }
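
With this hunk, all the get_page_from_l*e() helpers share one errno-style contract: 0 on success, -EINVAL on a hard failure, -EAGAIN when partial progress was checkpointed in the page (nr_validated_ptes/partial_pte), and -EINTR when interrupted before anything was committed. A caller-side sketch of that contract (hypothetical wrapper; the real callers turn -EAGAIN into a hypercall continuation rather than looping, so the guest can take interrupts in between):

    /* Acquire an L3 page-table type reference, tolerating preemption. */
    static int pin_l3_type(unsigned long mfn, struct domain *d)
    {
        int rc;

        do {
            rc = get_page_and_type_from_pagenr(mfn, PGT_l3_page_table,
                                               d, 1 /* preemptible */);
            /* -EAGAIN: validation checkpointed inside the page; resume it.
             * -EINTR:  nothing committed yet; simply retry. */
        } while ( rc == -EAGAIN || rc == -EINTR );

        return rc;
    }
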
  60.129 @@ -946,29 +953,35 @@ void put_page_from_l1e(l1_pgentry_t l1e,
  60.130   * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
  60.131   * Note also that this automatically deals correctly with linear p.t.'s.
  60.132   */
  60.133 -static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
  60.134 +static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
  60.135  {
  60.136      if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) && 
  60.137           (l2e_get_pfn(l2e) != pfn) )
  60.138 +    {
  60.139          put_page_and_type(l2e_get_page(l2e));
  60.140 +        return 0;
  60.141 +    }
  60.142 +    return 1;
  60.143  }
  60.144  
  60.145  
  60.146 -#if CONFIG_PAGING_LEVELS >= 3
  60.147 -static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
  60.148 +static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
  60.149 +                             int preemptible)
  60.150  {
  60.151      if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 
  60.152           (l3e_get_pfn(l3e) != pfn) )
  60.153 -        put_page_and_type(l3e_get_page(l3e));
  60.154 +        return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
  60.155 +    return 1;
  60.156  }
  60.157 -#endif
  60.158  
  60.159  #if CONFIG_PAGING_LEVELS >= 4
  60.160 -static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
  60.161 +static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
  60.162 +                             int preemptible)
  60.163  {
  60.164      if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
  60.165           (l4e_get_pfn(l4e) != pfn) )
  60.166 -        put_page_and_type(l4e_get_page(l4e));
  60.167 +        return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
  60.168 +    return 1;
  60.169  }
  60.170  #endif
  60.171  
  60.172 @@ -977,7 +990,7 @@ static int alloc_l1_table(struct page_in
  60.173      struct domain *d = page_get_owner(page);
  60.174      unsigned long  pfn = page_to_mfn(page);
  60.175      l1_pgentry_t  *pl1e;
  60.176 -    int            i;
  60.177 +    unsigned int   i;
  60.178  
  60.179      pl1e = map_domain_page(pfn);
  60.180  
  60.181 @@ -991,7 +1004,7 @@ static int alloc_l1_table(struct page_in
  60.182      }
  60.183  
  60.184      unmap_domain_page(pl1e);
  60.185 -    return 1;
  60.186 +    return 0;
  60.187  
  60.188   fail:
  60.189      MEM_LOG("Failure in alloc_l1_table: entry %d", i);
  60.190 @@ -1000,7 +1013,7 @@ static int alloc_l1_table(struct page_in
  60.191              put_page_from_l1e(pl1e[i], d);
  60.192  
  60.193      unmap_domain_page(pl1e);
  60.194 -    return 0;
  60.195 +    return -EINVAL;
  60.196  }
  60.197  
  60.198  static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
  60.199 @@ -1128,47 +1141,53 @@ static void pae_flush_pgd(
  60.200  # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
  60.201  #endif
  60.202  
  60.203 -static int alloc_l2_table(struct page_info *page, unsigned long type)
  60.204 +static int alloc_l2_table(struct page_info *page, unsigned long type,
  60.205 +                          int preemptible)
  60.206  {
  60.207      struct domain *d = page_get_owner(page);
  60.208      unsigned long  pfn = page_to_mfn(page);
  60.209      l2_pgentry_t  *pl2e;
  60.210 -    int            i;
  60.211 +    unsigned int   i;
  60.212 +    int            rc = 0;
  60.213  
  60.214      pl2e = map_domain_page(pfn);
  60.215  
  60.216 -    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
  60.217 +    for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
  60.218      {
  60.219 -        if ( !is_guest_l2_slot(d, type, i) )
  60.220 +        if ( preemptible && i && hypercall_preempt_check() )
  60.221 +        {
  60.222 +            page->nr_validated_ptes = i;
  60.223 +            rc = -EAGAIN;
  60.224 +            break;
  60.225 +        }
  60.226 +
  60.227 +        if ( !is_guest_l2_slot(d, type, i) ||
  60.228 +             (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
  60.229              continue;
  60.230  
  60.231 -        if ( unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
  60.232 -            goto fail;
  60.233 -        
  60.234 +        if ( rc < 0 )
  60.235 +        {
  60.236 +            MEM_LOG("Failure in alloc_l2_table: entry %d", i);
  60.237 +            while ( i-- > 0 )
  60.238 +                if ( is_guest_l2_slot(d, type, i) )
  60.239 +                    put_page_from_l2e(pl2e[i], pfn);
  60.240 +            break;
  60.241 +        }
  60.242 +
  60.243          adjust_guest_l2e(pl2e[i], d);
  60.244      }
  60.245  
  60.246      unmap_domain_page(pl2e);
  60.247 -    return 1;
  60.248 -
  60.249 - fail:
  60.250 -    MEM_LOG("Failure in alloc_l2_table: entry %d", i);
  60.251 -    while ( i-- > 0 )
  60.252 -        if ( is_guest_l2_slot(d, type, i) )
  60.253 -            put_page_from_l2e(pl2e[i], pfn);
  60.254 -
  60.255 -    unmap_domain_page(pl2e);
  60.256 -    return 0;
  60.257 +    return rc > 0 ? 0 : rc;
  60.258  }
  60.259  
  60.260 -
  60.261 -#if CONFIG_PAGING_LEVELS >= 3
  60.262 -static int alloc_l3_table(struct page_info *page)
  60.263 +static int alloc_l3_table(struct page_info *page, int preemptible)
  60.264  {
  60.265      struct domain *d = page_get_owner(page);
  60.266      unsigned long  pfn = page_to_mfn(page);
  60.267      l3_pgentry_t  *pl3e;
  60.268 -    int            i;
  60.269 +    unsigned int   i;
  60.270 +    int            rc = 0;
  60.271  
  60.272  #if CONFIG_PAGING_LEVELS == 3
  60.273      /*
  60.274 @@ -1181,7 +1200,7 @@ static int alloc_l3_table(struct page_in
  60.275           d->vcpu[0] && d->vcpu[0]->is_initialised )
  60.276      {
  60.277          MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
  60.278 -        return 0;
  60.279 +        return -EINVAL;
  60.280      }
  60.281  #endif
  60.282  
  60.283 @@ -1197,64 +1216,96 @@ static int alloc_l3_table(struct page_in
  60.284      if ( is_pv_32on64_domain(d) )
  60.285          memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
  60.286  
  60.287 -    for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
  60.288 +    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ )
  60.289      {
  60.290          if ( is_pv_32bit_domain(d) && (i == 3) )
  60.291          {
  60.292              if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
  60.293 -                 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
  60.294 -                 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
  60.295 -                                                PGT_l2_page_table |
  60.296 -                                                PGT_pae_xen_l2,
  60.297 -                                                d) )
  60.298 -                goto fail;
  60.299 +                 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
  60.300 +                rc = -EINVAL;
  60.301 +            else
  60.302 +                rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
  60.303 +                                                   PGT_l2_page_table |
  60.304 +                                                   PGT_pae_xen_l2,
  60.305 +                                                   d, preemptible);
  60.306          }
  60.307 -        else if ( !is_guest_l3_slot(i) )
  60.308 +        else if ( !is_guest_l3_slot(i) ||
  60.309 +                  (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 )
  60.310              continue;
  60.311 -        else if ( unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
  60.312 -            goto fail;
  60.313 +
  60.314 +        if ( rc == -EAGAIN )
  60.315 +        {
  60.316 +            page->nr_validated_ptes = i;
  60.317 +            page->partial_pte = 1;
  60.318 +        }
  60.319 +        else if ( rc == -EINTR && i )
  60.320 +        {
  60.321 +            page->nr_validated_ptes = i;
  60.322 +            page->partial_pte = 0;
  60.323 +            rc = -EAGAIN;
  60.324 +        }
  60.325 +        if ( rc < 0 )
  60.326 +            break;
  60.327  
  60.328          adjust_guest_l3e(pl3e[i], d);
  60.329      }
  60.330  
  60.331 -    if ( !create_pae_xen_mappings(d, pl3e) )
  60.332 -        goto fail;
  60.333 -
  60.334 -    unmap_domain_page(pl3e);
  60.335 -    return 1;
  60.336 -
  60.337 - fail:
  60.338 -    MEM_LOG("Failure in alloc_l3_table: entry %d", i);
  60.339 -    while ( i-- > 0 )
  60.340 +    if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
  60.341 +        rc = -EINVAL;
  60.342 +    if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
  60.343      {
  60.344 -        if ( !is_guest_l3_slot(i) )
  60.345 -            continue;
  60.346 -        unadjust_guest_l3e(pl3e[i], d);
  60.347 -        put_page_from_l3e(pl3e[i], pfn);
  60.348 +        MEM_LOG("Failure in alloc_l3_table: entry %d", i);
  60.349 +        while ( i-- > 0 )
  60.350 +        {
  60.351 +            if ( !is_guest_l3_slot(i) )
  60.352 +                continue;
  60.353 +            unadjust_guest_l3e(pl3e[i], d);
  60.354 +            put_page_from_l3e(pl3e[i], pfn, 0);
  60.355 +        }
  60.356      }
  60.357  
  60.358      unmap_domain_page(pl3e);
  60.359 -    return 0;
  60.360 +    return rc > 0 ? 0 : rc;
  60.361  }
  60.362 -#else
  60.363 -#define alloc_l3_table(page) (0)
  60.364 -#endif
  60.365  
  60.366  #if CONFIG_PAGING_LEVELS >= 4
  60.367 -static int alloc_l4_table(struct page_info *page)
  60.368 +static int alloc_l4_table(struct page_info *page, int preemptible)
  60.369  {
  60.370      struct domain *d = page_get_owner(page);
  60.371      unsigned long  pfn = page_to_mfn(page);
  60.372      l4_pgentry_t  *pl4e = page_to_virt(page);
  60.373 -    int            i;
  60.374 -
  60.375 -    for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
  60.376 +    unsigned int   i;
  60.377 +    int            rc = 0;
  60.378 +
  60.379 +    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ )
  60.380      {
  60.381 -        if ( !is_guest_l4_slot(d, i) )
  60.382 +        if ( !is_guest_l4_slot(d, i) ||
  60.383 +             (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 )
  60.384              continue;
  60.385  
  60.386 -        if ( unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
  60.387 -            goto fail;
  60.388 +        if ( rc == -EAGAIN )
  60.389 +        {
  60.390 +            page->nr_validated_ptes = i;
  60.391 +            page->partial_pte = 1;
  60.392 +        }
  60.393 +        else if ( rc == -EINTR )
  60.394 +        {
  60.395 +            if ( i )
  60.396 +            {
  60.397 +                page->nr_validated_ptes = i;
  60.398 +                page->partial_pte = 0;
  60.399 +                rc = -EAGAIN;
  60.400 +            }
  60.401 +        }
  60.402 +        else if ( rc < 0 )
  60.403 +        {
  60.404 +            MEM_LOG("Failure in alloc_l4_table: entry %d", i);
  60.405 +            while ( i-- > 0 )
  60.406 +                if ( is_guest_l4_slot(d, i) )
  60.407 +                    put_page_from_l4e(pl4e[i], pfn, 0);
  60.408 +        }
  60.409 +        if ( rc < 0 )
  60.410 +            return rc;
  60.411  
  60.412          adjust_guest_l4e(pl4e[i], d);
  60.413      }
  60.414 @@ -1269,18 +1320,10 @@ static int alloc_l4_table(struct page_in
  60.415          l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
  60.416                        __PAGE_HYPERVISOR);
  60.417  
  60.418 -    return 1;
  60.419 -
  60.420 - fail:
  60.421 -    MEM_LOG("Failure in alloc_l4_table: entry %d", i);
  60.422 -    while ( i-- > 0 )
  60.423 -        if ( is_guest_l4_slot(d, i) )
  60.424 -            put_page_from_l4e(pl4e[i], pfn);
  60.425 -
  60.426 -    return 0;
  60.427 +    return rc > 0 ? 0 : rc;
  60.428  }
  60.429  #else
  60.430 -#define alloc_l4_table(page) (0)
  60.431 +#define alloc_l4_table(page, preemptible) (-EINVAL)
  60.432  #endif
  60.433  
  60.434  
  60.435 @@ -1289,7 +1332,7 @@ static void free_l1_table(struct page_in
  60.436      struct domain *d = page_get_owner(page);
  60.437      unsigned long pfn = page_to_mfn(page);
  60.438      l1_pgentry_t *pl1e;
  60.439 -    int i;
  60.440 +    unsigned int  i;
  60.441  
  60.442      pl1e = map_domain_page(pfn);
  60.443  
  60.444 @@ -1301,74 +1344,114 @@ static void free_l1_table(struct page_in
  60.445  }
  60.446  
  60.447  
  60.448 -static void free_l2_table(struct page_info *page)
  60.449 +static int free_l2_table(struct page_info *page, int preemptible)
  60.450  {
  60.451  #ifdef CONFIG_COMPAT
  60.452      struct domain *d = page_get_owner(page);
  60.453  #endif
  60.454      unsigned long pfn = page_to_mfn(page);
  60.455      l2_pgentry_t *pl2e;
  60.456 -    int i;
  60.457 +    unsigned int  i = page->nr_validated_ptes - 1;
  60.458 +    int err = 0;
  60.459  
  60.460      pl2e = map_domain_page(pfn);
  60.461  
  60.462 -    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
  60.463 -        if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
  60.464 -            put_page_from_l2e(pl2e[i], pfn);
  60.465 +    ASSERT(page->nr_validated_ptes);
  60.466 +    do {
  60.467 +        if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
  60.468 +             put_page_from_l2e(pl2e[i], pfn) == 0 &&
  60.469 +             preemptible && i && hypercall_preempt_check() )
  60.470 +        {
  60.471 +           page->nr_validated_ptes = i;
  60.472 +           err = -EAGAIN;
  60.473 +        }
  60.474 +    } while ( !err && i-- );
  60.475  
  60.476      unmap_domain_page(pl2e);
  60.477  
  60.478 -    page->u.inuse.type_info &= ~PGT_pae_xen_l2;
  60.479 +    if ( !err )
  60.480 +        page->u.inuse.type_info &= ~PGT_pae_xen_l2;
  60.481 +
  60.482 +    return err;
  60.483  }
  60.484  
  60.485 -
  60.486 -#if CONFIG_PAGING_LEVELS >= 3
  60.487 -
  60.488 -static void free_l3_table(struct page_info *page)
  60.489 +static int free_l3_table(struct page_info *page, int preemptible)
  60.490  {
  60.491      struct domain *d = page_get_owner(page);
  60.492      unsigned long pfn = page_to_mfn(page);
  60.493      l3_pgentry_t *pl3e;
  60.494 -    int           i;
  60.495 +    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
  60.496 +    int rc = 0;
  60.497  
  60.498  #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
  60.499      if ( d->arch.relmem == RELMEM_l3 )
  60.500 -        return;
  60.501 +        return 0;
  60.502  #endif
  60.503  
  60.504      pl3e = map_domain_page(pfn);
  60.505  
  60.506 -    for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
  60.507 +    do {
  60.508          if ( is_guest_l3_slot(i) )
  60.509          {
  60.510 -            put_page_from_l3e(pl3e[i], pfn);
  60.511 +            rc = put_page_from_l3e(pl3e[i], pfn, preemptible);
  60.512 +            if ( rc > 0 )
  60.513 +                continue;
  60.514 +            if ( rc )
  60.515 +                break;
  60.516              unadjust_guest_l3e(pl3e[i], d);
  60.517          }
  60.518 +    } while ( i-- );
  60.519  
  60.520      unmap_domain_page(pl3e);
  60.521 +
  60.522 +    if ( rc == -EAGAIN )
  60.523 +    {
  60.524 +        page->nr_validated_ptes = i;
  60.525 +        page->partial_pte = 1;
  60.526 +    }
  60.527 +    else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
  60.528 +    {
  60.529 +        page->nr_validated_ptes = i + 1;
  60.530 +        page->partial_pte = 0;
  60.531 +        rc = -EAGAIN;
  60.532 +    }
  60.533 +    return rc > 0 ? 0 : rc;
  60.534  }
  60.535  
  60.536 -#endif
  60.537 -
  60.538  #if CONFIG_PAGING_LEVELS >= 4
  60.539 -
  60.540 -static void free_l4_table(struct page_info *page)
  60.541 +static int free_l4_table(struct page_info *page, int preemptible)
  60.542  {
  60.543      struct domain *d = page_get_owner(page);
  60.544      unsigned long pfn = page_to_mfn(page);
  60.545      l4_pgentry_t *pl4e = page_to_virt(page);
  60.546 -    int           i;
  60.547 +    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
  60.548 +    int rc = 0;
  60.549  
  60.550  #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
  60.551      if ( d->arch.relmem == RELMEM_l4 )
  60.552 -        return;
  60.553 +        return 0;
  60.554  #endif
  60.555  
  60.556 -    for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
  60.557 +    do {
  60.558          if ( is_guest_l4_slot(d, i) )
  60.559 -            put_page_from_l4e(pl4e[i], pfn);
  60.560 +            rc = put_page_from_l4e(pl4e[i], pfn, preemptible);
  60.561 +    } while ( rc >= 0 && i-- );
  60.562 +
  60.563 +    if ( rc == -EAGAIN )
  60.564 +    {
  60.565 +        page->nr_validated_ptes = i;
  60.566 +        page->partial_pte = 1;
  60.567 +    }
  60.568 +    else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
  60.569 +    {
  60.570 +        page->nr_validated_ptes = i + 1;
  60.571 +        page->partial_pte = 0;
  60.572 +        rc = -EAGAIN;
  60.573 +    }
  60.574 +    return rc > 0 ? 0 : rc;
  60.575  }
  60.576 -
  60.577 +#else
  60.578 +#define free_l4_table(page, preemptible) (-EINVAL)
  60.579  #endif
  60.580  
  60.581  static void page_lock(struct page_info *page)
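
The free paths above checkpoint teardown with the same two fields the alloc paths use. Both encodings make the resume expression land back on the interrupted entry; partial_pte only records whether that entry's own put was left half-done. A sketch of the protocol (hypothetical helper names):

    /* Save progress when put_page_from_l3e()/l4e() stops at entry i. */
    static void checkpoint_put(struct page_info *pg, unsigned int i, int rc)
    {
        if ( rc == -EAGAIN )          /* entry i's inner free left incomplete */
        {
            pg->nr_validated_ptes = i;
            pg->partial_pte = 1;
        }
        else if ( rc == -EINTR )      /* entry i's put not yet performed */
        {
            pg->nr_validated_ptes = i + 1;
            pg->partial_pte = 0;
        }
    }

    /* On re-entry, both encodings above resolve to the interrupted entry. */
    static unsigned int resume_index(const struct page_info *pg)
    {
        return pg->nr_validated_ptes - !pg->partial_pte;
    }
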
  60.582 @@ -1560,7 +1643,7 @@ static int mod_l2_entry(l2_pgentry_t *pl
  60.583              return rc;
  60.584          }
  60.585  
  60.586 -        if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
  60.587 +        if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
  60.588              return page_unlock(l2pg), 0;
  60.589  
  60.590          adjust_guest_l2e(nl2e, d);
  60.591 @@ -1583,24 +1666,23 @@ static int mod_l2_entry(l2_pgentry_t *pl
  60.592      return rc;
  60.593  }
  60.594  
  60.595 -#if CONFIG_PAGING_LEVELS >= 3
  60.596 -
  60.597  /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
  60.598  static int mod_l3_entry(l3_pgentry_t *pl3e, 
  60.599                          l3_pgentry_t nl3e, 
  60.600                          unsigned long pfn,
  60.601 -                        int preserve_ad)
  60.602 +                        int preserve_ad,
  60.603 +                        int preemptible)
  60.604  {
  60.605      l3_pgentry_t ol3e;
  60.606      struct vcpu *curr = current;
  60.607      struct domain *d = curr->domain;
  60.608      struct page_info *l3pg = mfn_to_page(pfn);
  60.609 -    int rc = 1;
  60.610 +    int rc = 0;
  60.611  
  60.612      if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
  60.613      {
  60.614          MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
  60.615 -        return 0;
  60.616 +        return -EINVAL;
  60.617      }
  60.618  
  60.619      /*
  60.620 @@ -1608,12 +1690,12 @@ static int mod_l3_entry(l3_pgentry_t *pl
  60.621       * would be a pain to ensure they remain continuously valid throughout.
  60.622       */
  60.623      if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
  60.624 -        return 0;
  60.625 +        return -EINVAL;
  60.626  
  60.627      page_lock(l3pg);
  60.628  
  60.629      if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
  60.630 -        return page_unlock(l3pg), 0;
  60.631 +        return page_unlock(l3pg), -EFAULT;
  60.632  
  60.633      if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
  60.634      {
  60.635 @@ -1622,7 +1704,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
  60.636              page_unlock(l3pg);
  60.637              MEM_LOG("Bad L3 flags %x",
  60.638                      l3e_get_flags(nl3e) & l3_disallow_mask(d));
  60.639 -            return 0;
  60.640 +            return -EINVAL;
  60.641          }
  60.642  
  60.643          /* Fast path for identical mapping and presence. */
  60.644 @@ -1631,28 +1713,30 @@ static int mod_l3_entry(l3_pgentry_t *pl
  60.645              adjust_guest_l3e(nl3e, d);
  60.646              rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
  60.647              page_unlock(l3pg);
  60.648 -            return rc;
  60.649 +            return rc ? 0 : -EFAULT;
  60.650          }
  60.651  
  60.652 -        if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
  60.653 -            return page_unlock(l3pg), 0;
  60.654 +        rc = get_page_from_l3e(nl3e, pfn, d, preemptible);
  60.655 +        if ( unlikely(rc < 0) )
  60.656 +            return page_unlock(l3pg), rc;
  60.657 +        rc = 0;
  60.658  
  60.659          adjust_guest_l3e(nl3e, d);
  60.660          if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
  60.661                                      preserve_ad)) )
  60.662          {
  60.663              ol3e = nl3e;
  60.664 -            rc = 0;
  60.665 +            rc = -EFAULT;
  60.666          }
  60.667      }
  60.668      else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
  60.669                                       preserve_ad)) )
  60.670      {
  60.671          page_unlock(l3pg);
  60.672 -        return 0;
  60.673 +        return -EFAULT;
  60.674      }
  60.675  
  60.676 -    if ( likely(rc) )
  60.677 +    if ( likely(rc == 0) )
  60.678      {
  60.679          if ( !create_pae_xen_mappings(d, pl3e) )
  60.680              BUG();
  60.681 @@ -1661,36 +1745,35 @@ static int mod_l3_entry(l3_pgentry_t *pl
  60.682      }
  60.683  
  60.684      page_unlock(l3pg);
  60.685 -    put_page_from_l3e(ol3e, pfn);
  60.686 +    put_page_from_l3e(ol3e, pfn, 0);
  60.687      return rc;
  60.688  }
  60.689  
  60.690 -#endif
  60.691 -
  60.692  #if CONFIG_PAGING_LEVELS >= 4
  60.693  
  60.694  /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
  60.695  static int mod_l4_entry(l4_pgentry_t *pl4e, 
  60.696                          l4_pgentry_t nl4e, 
  60.697                          unsigned long pfn,
  60.698 -                        int preserve_ad)
  60.699 +                        int preserve_ad,
  60.700 +                        int preemptible)
  60.701  {
  60.702      struct vcpu *curr = current;
  60.703      struct domain *d = curr->domain;
  60.704      l4_pgentry_t ol4e;
  60.705      struct page_info *l4pg = mfn_to_page(pfn);
  60.706 -    int rc = 1;
  60.707 +    int rc = 0;
  60.708  
  60.709      if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
  60.710      {
  60.711          MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
  60.712 -        return 0;
  60.713 +        return -EINVAL;
  60.714      }
  60.715  
  60.716      page_lock(l4pg);
  60.717  
  60.718      if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
  60.719 -        return page_unlock(l4pg), 0;
  60.720 +        return page_unlock(l4pg), -EFAULT;
  60.721  
  60.722      if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
  60.723      {
  60.724 @@ -1699,7 +1782,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
  60.725              page_unlock(l4pg);
  60.726              MEM_LOG("Bad L4 flags %x",
  60.727                      l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
  60.728 -            return 0;
  60.729 +            return -EINVAL;
  60.730          }
  60.731  
  60.732          /* Fast path for identical mapping and presence. */
  60.733 @@ -1708,29 +1791,31 @@ static int mod_l4_entry(l4_pgentry_t *pl
  60.734              adjust_guest_l4e(nl4e, d);
  60.735              rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
  60.736              page_unlock(l4pg);
  60.737 -            return rc;
  60.738 +            return rc ? 0 : -EFAULT;
  60.739          }
  60.740  
  60.741 -        if ( unlikely(!get_page_from_l4e(nl4e, pfn, d)) )
  60.742 -            return page_unlock(l4pg), 0;
  60.743 +        rc = get_page_from_l4e(nl4e, pfn, d, preemptible);
  60.744 +        if ( unlikely(rc < 0) )
  60.745 +            return page_unlock(l4pg), rc;
  60.746 +        rc = 0;
  60.747  
  60.748          adjust_guest_l4e(nl4e, d);
  60.749          if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
  60.750                                      preserve_ad)) )
  60.751          {
  60.752              ol4e = nl4e;
  60.753 -            rc = 0;
  60.754 +            rc = -EFAULT;
  60.755          }
  60.756      }
  60.757      else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
  60.758                                       preserve_ad)) )
  60.759      {
  60.760          page_unlock(l4pg);
  60.761 -        return 0;
  60.762 +        return -EFAULT;
  60.763      }
  60.764  
  60.765      page_unlock(l4pg);
  60.766 -    put_page_from_l4e(ol4e, pfn);
  60.767 +    put_page_from_l4e(ol4e, pfn, 0);
  60.768      return rc;
  60.769  }
  60.770  
  60.771 @@ -1788,9 +1873,11 @@ int get_page(struct page_info *page, str
  60.772  }
  60.773  
  60.774  
  60.775 -static int alloc_page_type(struct page_info *page, unsigned long type)
  60.776 +static int alloc_page_type(struct page_info *page, unsigned long type,
  60.777 +                           int preemptible)
  60.778  {
  60.779      struct domain *owner = page_get_owner(page);
  60.780 +    int rc;
  60.781  
  60.782      /* A page table is dirtied when its type count becomes non-zero. */
  60.783      if ( likely(owner != NULL) )
  60.784 @@ -1799,30 +1886,65 @@ static int alloc_page_type(struct page_i
  60.785      switch ( type & PGT_type_mask )
  60.786      {
  60.787      case PGT_l1_page_table:
  60.788 -        return alloc_l1_table(page);
   60.789 +        rc = alloc_l1_table(page);
  60.791 +        break;
  60.792      case PGT_l2_page_table:
  60.793 -        return alloc_l2_table(page, type);
  60.794 +        rc = alloc_l2_table(page, type, preemptible);
  60.795 +        break;
  60.796      case PGT_l3_page_table:
  60.797 -        return alloc_l3_table(page);
  60.798 +        rc = alloc_l3_table(page, preemptible);
  60.799 +        break;
  60.800      case PGT_l4_page_table:
  60.801 -        return alloc_l4_table(page);
  60.802 +        rc = alloc_l4_table(page, preemptible);
  60.803 +        break;
  60.804      case PGT_seg_desc_page:
  60.805 -        return alloc_segdesc_page(page);
  60.806 +        rc = alloc_segdesc_page(page);
  60.807 +        break;
  60.808      default:
  60.809          printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n", 
  60.810                 type, page->u.inuse.type_info,
  60.811                 page->count_info);
  60.812 +        rc = -EINVAL;
  60.813          BUG();
  60.814      }
  60.815  
  60.816 -    return 0;
   60.817 +    /* No need for atomic update of type_info here: no one else updates it. */
  60.818 +    wmb();
  60.819 +    if ( rc == -EAGAIN )
  60.820 +    {
  60.821 +        page->u.inuse.type_info |= PGT_partial;
  60.822 +    }
  60.823 +    else if ( rc == -EINTR )
  60.824 +    {
  60.825 +        ASSERT((page->u.inuse.type_info &
  60.826 +                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
  60.827 +        page->u.inuse.type_info &= ~PGT_count_mask;
  60.828 +    }
  60.829 +    else if ( rc )
  60.830 +    {
  60.831 +        ASSERT(rc < 0);
  60.832 +        MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
  60.833 +                PRtype_info ": caf=%08x taf=%" PRtype_info,
  60.834 +                page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
  60.835 +                type, page->count_info, page->u.inuse.type_info);
  60.836 +        page->u.inuse.type_info = 0;
  60.837 +    }
  60.838 +    else
  60.839 +    {
  60.840 +        page->u.inuse.type_info |= PGT_validated;
  60.841 +    }
  60.842 +
  60.843 +    return rc;
  60.844  }
  60.845  
  60.846  
  60.847 -void free_page_type(struct page_info *page, unsigned long type)
  60.848 +int free_page_type(struct page_info *page, unsigned long type,
  60.849 +                   int preemptible)
  60.850  {
  60.851      struct domain *owner = page_get_owner(page);
  60.852      unsigned long gmfn;
  60.853 +    int rc;
  60.854  
  60.855      if ( likely(owner != NULL) )
  60.856      {
  60.857 @@ -1842,7 +1964,7 @@ void free_page_type(struct page_info *pa
  60.858              paging_mark_dirty(owner, page_to_mfn(page));
  60.859  
  60.860              if ( shadow_mode_refcounts(owner) )
  60.861 -                return;
  60.862 +                return 0;
  60.863  
  60.864              gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
  60.865              ASSERT(VALID_M2P(gmfn));
  60.866 @@ -1850,42 +1972,80 @@ void free_page_type(struct page_info *pa
  60.867          }
  60.868      }
  60.869  
  60.870 +    if ( !(type & PGT_partial) )
  60.871 +    {
  60.872 +        page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
  60.873 +        page->partial_pte = 0;
  60.874 +    }
  60.875      switch ( type & PGT_type_mask )
  60.876      {
  60.877      case PGT_l1_page_table:
  60.878          free_l1_table(page);
  60.879 +        rc = 0;
  60.880          break;
  60.881 -
  60.882      case PGT_l2_page_table:
  60.883 -        free_l2_table(page);
  60.884 +        rc = free_l2_table(page, preemptible);
  60.885          break;
  60.886 -
  60.887 -#if CONFIG_PAGING_LEVELS >= 3
  60.888      case PGT_l3_page_table:
  60.889 -        free_l3_table(page);
  60.890 -        break;
  60.891 +#if CONFIG_PAGING_LEVELS == 3
  60.892 +        if ( !(type & PGT_partial) )
  60.893 +            page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
  60.894  #endif
  60.895 -
  60.896 -#if CONFIG_PAGING_LEVELS >= 4
  60.897 -    case PGT_l4_page_table:
  60.898 -        free_l4_table(page);
  60.899 +        rc = free_l3_table(page, preemptible);
  60.900          break;
  60.901 -#endif
  60.902 -
  60.903 +    case PGT_l4_page_table:
  60.904 +        rc = free_l4_table(page, preemptible);
  60.905 +        break;
  60.906      default:
  60.907 -        printk("%s: type %lx pfn %lx\n",__FUNCTION__,
  60.908 -               type, page_to_mfn(page));
   60.909 +        MEM_LOG("type %lx pfn %lx", type, page_to_mfn(page));
  60.910 +        rc = -EINVAL;
  60.911          BUG();
  60.912      }
  60.913 +
   60.914 +    /* No need for atomic update of type_info here: no one else updates it. */
  60.915 +    if ( rc == 0 )
  60.916 +    {
  60.917 +        /*
  60.918 +         * Record TLB information for flush later. We do not stamp page tables
  60.919 +         * when running in shadow mode:
  60.920 +         *  1. Pointless, since it's the shadow pt's which must be tracked.
  60.921 +         *  2. Shadow mode reuses this field for shadowed page tables to
  60.922 +         *     store flags info -- we don't want to conflict with that.
  60.923 +         */
  60.924 +        if ( !(shadow_mode_enabled(page_get_owner(page)) &&
  60.925 +               (page->count_info & PGC_page_table)) )
  60.926 +            page->tlbflush_timestamp = tlbflush_current_time();
  60.927 +        wmb();
  60.928 +        page->u.inuse.type_info--;
  60.929 +    }
  60.930 +    else if ( rc == -EINTR )
  60.931 +    {
  60.932 +        ASSERT(!(page->u.inuse.type_info &
  60.933 +                 (PGT_count_mask|PGT_validated|PGT_partial)));
  60.934 +        if ( !(shadow_mode_enabled(page_get_owner(page)) &&
  60.935 +               (page->count_info & PGC_page_table)) )
  60.936 +            page->tlbflush_timestamp = tlbflush_current_time();
  60.937 +        wmb();
  60.938 +        page->u.inuse.type_info |= PGT_validated;
  60.939 +    }
  60.940 +    else
  60.941 +    {
  60.942 +        BUG_ON(rc != -EAGAIN);
  60.943 +        wmb();
  60.944 +        page->u.inuse.type_info |= PGT_partial;
  60.945 +    }
  60.946 +
  60.947 +    return rc;
  60.948  }
  60.949  
  60.950  
  60.951 -void put_page_type(struct page_info *page)
  60.952 +static int __put_page_type(struct page_info *page,
  60.953 +                           int preemptible)
  60.954  {
  60.955      unsigned long nx, x, y = page->u.inuse.type_info;
  60.956  
  60.957 - again:
  60.958 -    do {
  60.959 +    for ( ; ; )
  60.960 +    {
  60.961          x  = y;
  60.962          nx = x - 1;
  60.963  
  60.964 @@ -1894,21 +2054,19 @@ void put_page_type(struct page_info *pag
  60.965          if ( unlikely((nx & PGT_count_mask) == 0) )
  60.966          {
  60.967              if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
  60.968 -                 likely(nx & PGT_validated) )
  60.969 +                 likely(nx & (PGT_validated|PGT_partial)) )
  60.970              {
  60.971                  /*
  60.972                   * Page-table pages must be unvalidated when count is zero. The
  60.973                   * 'free' is safe because the refcnt is non-zero and validated
  60.974                   * bit is clear => other ops will spin or fail.
  60.975                   */
  60.976 -                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
  60.977 -                                           x & ~PGT_validated)) != x) )
  60.978 -                    goto again;
  60.979 +                nx = x & ~(PGT_validated|PGT_partial);
  60.980 +                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
  60.981 +                                           x, nx)) != x) )
  60.982 +                    continue;
  60.983                  /* We cleared the 'valid bit' so we do the clean up. */
  60.984 -                free_page_type(page, x);
  60.985 -                /* Carry on, but with the 'valid bit' now clear. */
  60.986 -                x  &= ~PGT_validated;
  60.987 -                nx &= ~PGT_validated;
  60.988 +                return free_page_type(page, x, preemptible);
  60.989              }
  60.990  
  60.991              /*
  60.992 @@ -1922,25 +2080,33 @@ void put_page_type(struct page_info *pag
  60.993                     (page->count_info & PGC_page_table)) )
  60.994                  page->tlbflush_timestamp = tlbflush_current_time();
  60.995          }
  60.996 +
  60.997 +        if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
  60.998 +            break;
  60.999 +
 60.1000 +        if ( preemptible && hypercall_preempt_check() )
 60.1001 +            return -EINTR;
 60.1002      }
 60.1003 -    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
 60.1004 +
 60.1005 +    return 0;
 60.1006  }
 60.1007  
 60.1008  
 60.1009 -int get_page_type(struct page_info *page, unsigned long type)
 60.1010 +static int __get_page_type(struct page_info *page, unsigned long type,
 60.1011 +                           int preemptible)
 60.1012  {
 60.1013      unsigned long nx, x, y = page->u.inuse.type_info;
 60.1014  
 60.1015      ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
 60.1016  
 60.1017 - again:
 60.1018 -    do {
 60.1019 +    for ( ; ; )
 60.1020 +    {
 60.1021          x  = y;
 60.1022          nx = x + 1;
 60.1023          if ( unlikely((nx & PGT_count_mask) == 0) )
 60.1024          {
 60.1025              MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
 60.1026 -            return 0;
 60.1027 +            return -EINVAL;
 60.1028          }
 60.1029          else if ( unlikely((x & PGT_count_mask) == 0) )
 60.1030          {
 60.1031 @@ -1993,28 +2159,43 @@ int get_page_type(struct page_info *page
 60.1032              /* Don't log failure if it could be a recursive-mapping attempt. */
 60.1033              if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
 60.1034                   (type == PGT_l1_page_table) )
 60.1035 -                return 0;
 60.1036 +                return -EINVAL;
 60.1037              if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
 60.1038                   (type == PGT_l2_page_table) )
 60.1039 -                return 0;
 60.1040 +                return -EINVAL;
 60.1041              if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
 60.1042                   (type == PGT_l3_page_table) )
 60.1043 -                return 0;
 60.1044 +                return -EINVAL;
 60.1045              MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
 60.1046                      "for mfn %lx (pfn %lx)",
 60.1047                      x, type, page_to_mfn(page),
 60.1048                      get_gpfn_from_mfn(page_to_mfn(page)));
 60.1049 -            return 0;
 60.1050 +            return -EINVAL;
 60.1051          }
 60.1052          else if ( unlikely(!(x & PGT_validated)) )
 60.1053          {
 60.1054 -            /* Someone else is updating validation of this page. Wait... */
 60.1055 -            while ( (y = page->u.inuse.type_info) == x )
 60.1056 -                cpu_relax();
 60.1057 -            goto again;
 60.1058 +            if ( !(x & PGT_partial) )
 60.1059 +            {
 60.1060 +                /* Someone else is updating validation of this page. Wait... */
 60.1061 +                while ( (y = page->u.inuse.type_info) == x )
 60.1062 +                {
 60.1063 +                    if ( preemptible && hypercall_preempt_check() )
 60.1064 +                        return -EINTR;
 60.1065 +                    cpu_relax();
 60.1066 +                }
 60.1067 +                continue;
 60.1068 +            }
 60.1069 +            /* Type ref count was left at 1 when PGT_partial got set. */
 60.1070 +            ASSERT((x & PGT_count_mask) == 1);
 60.1071 +            nx = x & ~PGT_partial;
 60.1072          }
 60.1073 +
 60.1074 +        if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
 60.1075 +            break;
 60.1076 +
 60.1077 +        if ( preemptible && hypercall_preempt_check() )
 60.1078 +            return -EINTR;
 60.1079      }
 60.1080 -    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
 60.1081  
 60.1082      if ( unlikely((x & PGT_type_mask) != type) )
 60.1083      {
 60.1084 @@ -2032,25 +2213,42 @@ int get_page_type(struct page_info *page
 60.1085  
 60.1086      if ( unlikely(!(nx & PGT_validated)) )
 60.1087      {
 60.1088 -        /* Try to validate page type; drop the new reference on failure. */
 60.1089 -        if ( unlikely(!alloc_page_type(page, type)) )
 60.1090 +        if ( !(x & PGT_partial) )
 60.1091          {
 60.1092 -            MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
 60.1093 -                    PRtype_info ": caf=%08x taf=%" PRtype_info,
 60.1094 -                    page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
 60.1095 -                    type, page->count_info, page->u.inuse.type_info);
 60.1096 -            /* Noone else can get a reference. We hold the only ref. */
 60.1097 -            page->u.inuse.type_info = 0;
 60.1098 -            return 0;
 60.1099 +            page->nr_validated_ptes = 0;
 60.1100 +            page->partial_pte = 0;
 60.1101          }
 60.1102 -
 60.1103 -        /* Noone else is updating simultaneously. */
 60.1104 -        __set_bit(_PGT_validated, &page->u.inuse.type_info);
 60.1105 +        return alloc_page_type(page, type, preemptible);
 60.1106      }
 60.1107  
 60.1108 -    return 1;
 60.1109 +    return 0;
 60.1110 +}
 60.1111 +
 60.1112 +void put_page_type(struct page_info *page)
 60.1113 +{
 60.1114 +    int rc = __put_page_type(page, 0);
 60.1115 +    ASSERT(rc == 0);
 60.1116 +    (void)rc;
 60.1117  }
 60.1118  
 60.1119 +int get_page_type(struct page_info *page, unsigned long type)
 60.1120 +{
 60.1121 +    int rc = __get_page_type(page, type, 0);
 60.1122 +    if ( likely(rc == 0) )
 60.1123 +        return 1;
 60.1124 +    ASSERT(rc == -EINVAL);
 60.1125 +    return 0;
 60.1126 +}
 60.1127 +
 60.1128 +int put_page_type_preemptible(struct page_info *page)
 60.1129 +{
 60.1130 +    return __put_page_type(page, 1);
 60.1131 +}
 60.1132 +
 60.1133 +int get_page_type_preemptible(struct page_info *page, unsigned long type)
 60.1134 +{
 60.1135 +    return __get_page_type(page, type, 1);
 60.1136 +}
 60.1137  
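
The preemptible entry points return 0 on success, -EINVAL on a genuine failure, and -EINTR or -EAGAIN when interrupted part-way: -EINTR means the operation was preempted before any partial state was recorded, while -EAGAIN means validation was parked with PGT_partial set and will resume at page->nr_validated_ptes. A minimal caller sketch under that contract (the wrapper name is illustrative; the pattern matches the do_mmuext_op pin path further below):

    /* Illustrative only -- not part of this changeset. */
    static int pin_table(struct page_info *page, unsigned long type)
    {
        int rc = get_page_type_preemptible(page, type);

        if ( rc == -EINTR )
            rc = -EAGAIN;   /* nothing recorded yet: simply retry the op */
        /* rc == -EAGAIN: partial progress saved; the retry resumes where
         * validation stopped rather than starting over. */
        return rc;          /* 0 on success, -EINVAL on a real failure */
    }
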
 60.1138  void cleanup_page_cacheattr(struct page_info *page)
 60.1139  {
 60.1140 @@ -2087,7 +2285,7 @@ int new_guest_cr3(unsigned long mfn)
 60.1141                      l4e_from_pfn(
 60.1142                          mfn,
 60.1143                          (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
 60.1144 -                    pagetable_get_pfn(v->arch.guest_table), 0);
 60.1145 +                    pagetable_get_pfn(v->arch.guest_table), 0, 0) == 0;
 60.1146          if ( unlikely(!okay) )
 60.1147          {
 60.1148              MEM_LOG("Error while installing new compat baseptr %lx", mfn);
 60.1149 @@ -2102,7 +2300,7 @@ int new_guest_cr3(unsigned long mfn)
 60.1150  #endif
 60.1151      okay = paging_mode_refcounts(d)
 60.1152          ? get_page_from_pagenr(mfn, d)
 60.1153 -        : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
 60.1154 +        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0);
 60.1155      if ( unlikely(!okay) )
 60.1156      {
 60.1157          MEM_LOG("Error while installing new baseptr %lx", mfn);
 60.1158 @@ -2276,9 +2474,7 @@ int do_mmuext_op(
 60.1159      {
 60.1160          if ( hypercall_preempt_check() )
 60.1161          {
 60.1162 -            rc = hypercall_create_continuation(
 60.1163 -                __HYPERVISOR_mmuext_op, "hihi",
 60.1164 -                uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
 60.1165 +            rc = -EAGAIN;
 60.1166              break;
 60.1167          }
 60.1168  
 60.1169 @@ -2325,10 +2521,14 @@ int do_mmuext_op(
 60.1170              if ( paging_mode_refcounts(FOREIGNDOM) )
 60.1171                  break;
 60.1172  
 60.1173 -            okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
 60.1174 +            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
 60.1175 +            okay = !rc;
 60.1176              if ( unlikely(!okay) )
 60.1177              {
 60.1178 -                MEM_LOG("Error while pinning mfn %lx", mfn);
 60.1179 +                if ( rc == -EINTR )
 60.1180 +                    rc = -EAGAIN;
 60.1181 +                else if ( rc != -EAGAIN )
 60.1182 +                    MEM_LOG("Error while pinning mfn %lx", mfn);
 60.1183                  break;
 60.1184              }
 60.1185  
 60.1186 @@ -2373,8 +2573,11 @@ int do_mmuext_op(
 60.1187              {
 60.1188                  put_page_and_type(page);
 60.1189                  put_page(page);
 60.1190 -                /* A page is dirtied when its pin status is cleared. */
 60.1191 -                paging_mark_dirty(d, mfn);
 60.1192 +                if ( !rc )
 60.1193 +                {
 60.1194 +                    /* A page is dirtied when its pin status is cleared. */
 60.1195 +                    paging_mark_dirty(d, mfn);
 60.1196 +                }
 60.1197              }
 60.1198              else
 60.1199              {
 60.1200 @@ -2398,8 +2601,8 @@ int do_mmuext_op(
 60.1201                  if ( paging_mode_refcounts(d) )
 60.1202                      okay = get_page_from_pagenr(mfn, d);
 60.1203                  else
 60.1204 -                    okay = get_page_and_type_from_pagenr(
 60.1205 -                        mfn, PGT_root_page_table, d);
 60.1206 +                    okay = !get_page_and_type_from_pagenr(
 60.1207 +                        mfn, PGT_root_page_table, d, 0);
 60.1208                  if ( unlikely(!okay) )
 60.1209                  {
 60.1210                      MEM_LOG("Error while installing new mfn %lx", mfn);
 60.1211 @@ -2517,6 +2720,11 @@ int do_mmuext_op(
 60.1212          guest_handle_add_offset(uops, 1);
 60.1213      }
 60.1214  
 60.1215 +    if ( rc == -EAGAIN )
 60.1216 +        rc = hypercall_create_continuation(
 60.1217 +            __HYPERVISOR_mmuext_op, "hihi",
 60.1218 +            uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
 60.1219 +
 60.1220      process_deferred_ops();
 60.1221  
 60.1222      perfc_add(num_mmuext_ops, i);
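
Together with the earlier hunk that replaced the inline continuation with `rc = -EAGAIN`, preemption handling in do_mmuext_op() now funnels through a single continuation site after the loop; do_mmu_update() below gets exactly the same restructuring. In outline (process_one_op() is a hypothetical stand-in for the per-op switch):

    /* Illustrative outline of the restructured hypercall loop. */
    for ( ; i < count; i++ )
    {
        if ( hypercall_preempt_check() )
        {
            rc = -EAGAIN;               /* defer: no continuation built here */
            break;
        }
        rc = process_one_op(&uops, i);  /* preemptible ops yield -EAGAIN too */
        if ( rc )
            break;
    }

    if ( rc == -EAGAIN )                /* one continuation site for all causes */
        rc = hypercall_create_continuation(
            __HYPERVISOR_mmuext_op, "hihi",
            uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
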
 60.1223 @@ -2576,9 +2784,7 @@ int do_mmu_update(
 60.1224      {
 60.1225          if ( hypercall_preempt_check() )
 60.1226          {
 60.1227 -            rc = hypercall_create_continuation(
 60.1228 -                __HYPERVISOR_mmu_update, "hihi",
 60.1229 -                ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
 60.1230 +            rc = -EAGAIN;
 60.1231              break;
 60.1232          }
 60.1233  
 60.1234 @@ -2653,27 +2859,29 @@ int do_mmu_update(
 60.1235                                          cmd == MMU_PT_UPDATE_PRESERVE_AD);
 60.1236                  }
 60.1237                  break;
 60.1238 -#if CONFIG_PAGING_LEVELS >= 3
 60.1239                  case PGT_l3_page_table:
 60.1240                  {
 60.1241                      l3_pgentry_t l3e = l3e_from_intpte(req.val);
 60.1242 -                    okay = mod_l3_entry(va, l3e, mfn,
 60.1243 -                                        cmd == MMU_PT_UPDATE_PRESERVE_AD);
 60.1244 +                    rc = mod_l3_entry(va, l3e, mfn,
 60.1245 +                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
 60.1246 +                    okay = !rc;
 60.1247                  }
 60.1248                  break;
 60.1249 -#endif
 60.1250  #if CONFIG_PAGING_LEVELS >= 4
 60.1251                  case PGT_l4_page_table:
 60.1252                  {
 60.1253                      l4_pgentry_t l4e = l4e_from_intpte(req.val);
 60.1254 -                    okay = mod_l4_entry(va, l4e, mfn,
 60.1255 -                                        cmd == MMU_PT_UPDATE_PRESERVE_AD);
 60.1256 +                    rc = mod_l4_entry(va, l4e, mfn,
 60.1257 +                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
 60.1258 +                    okay = !rc;
 60.1259                  }
 60.1260                  break;
 60.1261  #endif
 60.1262                  }
 60.1263  
 60.1264                  put_page_type(page);
 60.1265 +                if ( rc == -EINTR )
 60.1266 +                    rc = -EAGAIN;
 60.1267              }
 60.1268              break;
 60.1269  
 60.1270 @@ -2742,6 +2950,11 @@ int do_mmu_update(
 60.1271          guest_handle_add_offset(ureqs, 1);
 60.1272      }
 60.1273  
 60.1274 +    if ( rc == -EAGAIN )
 60.1275 +        rc = hypercall_create_continuation(
 60.1276 +            __HYPERVISOR_mmu_update, "hihi",
 60.1277 +            ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
 60.1278 +
 60.1279      process_deferred_ops();
 60.1280  
 60.1281      domain_mmap_cache_destroy(&mapcache);
 60.1282 @@ -3339,6 +3552,7 @@ DEFINE_XEN_GUEST_HANDLE(e820entry_t);
 60.1283  
 60.1284  long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
 60.1285  {
 60.1286 +    struct page_info *page = NULL;
 60.1287      switch ( op )
 60.1288      {
 60.1289      case XENMEM_add_to_physmap:
 60.1290 @@ -3389,12 +3603,22 @@ long arch_memory_op(int op, XEN_GUEST_HA
 60.1291  
 60.1292              spin_unlock(&d->grant_table->lock);
 60.1293              break;
 60.1294 +        case XENMAPSPACE_mfn:
 60.1295 +        {
 60.1296 +            if ( get_page_from_pagenr(xatp.idx, d) ) {
 60.1297 +                mfn = xatp.idx;
 60.1298 +                page = mfn_to_page(mfn);
 60.1299 +            }
 60.1300 +            break;
 60.1301 +        }
 60.1302          default:
 60.1303              break;
 60.1304          }
 60.1305  
 60.1306          if ( !paging_mode_translate(d) || (mfn == 0) )
 60.1307          {
 60.1308 +            if ( page )
 60.1309 +                put_page(page);
 60.1310              rcu_unlock_domain(d);
 60.1311              return -EINVAL;
 60.1312          }
 60.1313 @@ -3423,6 +3647,53 @@ long arch_memory_op(int op, XEN_GUEST_HA
 60.1314  
 60.1315          domain_unlock(d);
 60.1316  
 60.1317 +        if ( page )
 60.1318 +            put_page(page);
 60.1319 +
 60.1320 +        rcu_unlock_domain(d);
 60.1321 +
 60.1322 +        break;
 60.1323 +    }
 60.1324 +
 60.1325 +    case XENMEM_remove_from_physmap:
 60.1326 +    {
 60.1327 +        struct xen_remove_from_physmap xrfp;
 60.1328 +        unsigned long mfn;
 60.1329 +        struct domain *d;
 60.1330 +
 60.1331 +        if ( copy_from_guest(&xrfp, arg, 1) )
 60.1332 +            return -EFAULT;
 60.1333 +
 60.1334 +        if ( xrfp.domid == DOMID_SELF )
 60.1335 +        {
 60.1336 +            d = rcu_lock_current_domain();
 60.1337 +        }
 60.1338 +        else
 60.1339 +        {
 60.1340 +            if ( (d = rcu_lock_domain_by_id(xrfp.domid)) == NULL )
 60.1341 +                return -ESRCH;
 60.1342 +            if ( !IS_PRIV_FOR(current->domain, d) )
 60.1343 +            {
 60.1344 +                rcu_unlock_domain(d);
 60.1345 +                return -EPERM;
 60.1346 +            }
 60.1347 +        }
 60.1348 +
 60.1349 +        if ( xsm_remove_from_physmap(current->domain, d) )
 60.1350 +        {
 60.1351 +            rcu_unlock_domain(d);
 60.1352 +            return -EPERM;
 60.1353 +        }
 60.1354 +
 60.1355 +        domain_lock(d);
 60.1356 +
 60.1357 +        mfn = gmfn_to_mfn(d, xrfp.gpfn);
 60.1358 +
 60.1359 +        if ( mfn_valid(mfn) )
 60.1360 +            guest_physmap_remove_page(d, xrfp.gpfn, mfn, 0);
 60.1361 +
 60.1362 +        domain_unlock(d);
 60.1363 +
 60.1364          rcu_unlock_domain(d);
 60.1365  
 60.1366          break;
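
XENMEM_remove_from_physmap is the inverse of the XENMAPSPACE_mfn addition above: it resolves the GPFN to an MFN under the domain lock and tears down the physmap entry. A hedged guest-side sketch of invoking it (the gpfn value is made up; HYPERVISOR_memory_op is the usual guest wrapper for this hypercall):

    /* Illustrative guest-side usage. */
    struct xen_remove_from_physmap xrfp = {
        .domid = DOMID_SELF,
        .gpfn  = 0x12345,   /* guest frame whose mapping to drop */
    };
    int rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrfp);
    /* rc: 0 on success; -ESRCH/-EPERM/-EFAULT as in the handler above. */
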
 60.1367 @@ -3637,9 +3908,8 @@ static int ptwr_emulated_update(
 60.1368      nl1e = l1e_from_intpte(val);
 60.1369      if ( unlikely(!get_page_from_l1e(nl1e, d)) )
 60.1370      {
 60.1371 -        if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
 60.1372 -             (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
 60.1373 -             (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
 60.1374 +        if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
 60.1375 +             !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
 60.1376          {
 60.1377              /*
 60.1378               * If this is an upper-half write to a PAE PTE then we assume that
    61.1 --- a/xen/arch/x86/platform_hypercall.c	Tue Sep 02 16:34:53 2008 -0700
    61.2 +++ b/xen/arch/x86/platform_hypercall.c	Tue Sep 02 16:55:55 2008 -0700
    61.3 @@ -147,8 +147,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
    61.4  
    61.5      case XENPF_microcode_update:
    61.6      {
    61.7 -        extern int microcode_update(XEN_GUEST_HANDLE(void), unsigned long len);
    61.8 -        XEN_GUEST_HANDLE(void) data;
    61.9 +        XEN_GUEST_HANDLE(const_void) data;
   61.10  
   61.11          ret = xsm_microcode();
   61.12          if ( ret )
    62.1 --- a/xen/arch/x86/time.c	Tue Sep 02 16:34:53 2008 -0700
    62.2 +++ b/xen/arch/x86/time.c	Tue Sep 02 16:55:55 2008 -0700
    62.3 @@ -840,12 +840,11 @@ struct cpu_calibration {
    62.4      u64 local_tsc_stamp;
    62.5      s_time_t stime_local_stamp;
    62.6      s_time_t stime_master_stamp;
    62.7 -    struct timer softirq_callback;
    62.8  };
    62.9  static DEFINE_PER_CPU(struct cpu_calibration, cpu_calibration);
   62.10  
   62.11  /* Softirq handler for per-CPU time calibration. */
   62.12 -static void local_time_calibration(void *unused)
   62.13 +static void local_time_calibration(void)
   62.14  {
   62.15      struct cpu_time *t = &this_cpu(cpu_time);
   62.16      struct cpu_calibration *c = &this_cpu(cpu_calibration);
   62.17 @@ -1004,13 +1003,12 @@ static void time_calibration_rendezvous(
   62.18      struct cpu_calibration *c = &this_cpu(cpu_calibration);
   62.19      struct calibration_rendezvous *r = _r;
   62.20  
   62.21 -    local_irq_disable();
   62.22 -
   62.23      if ( smp_processor_id() == 0 )
   62.24      {
   62.25          while ( atomic_read(&r->nr_cpus) != (total_cpus - 1) )
   62.26              cpu_relax();
   62.27          r->master_stime = read_platform_stime();
   62.28 +        mb(); /* write r->master_stime /then/ signal */
   62.29          atomic_inc(&r->nr_cpus);
   62.30      }
   62.31      else
   62.32 @@ -1018,16 +1016,14 @@ static void time_calibration_rendezvous(
   62.33          atomic_inc(&r->nr_cpus);
   62.34          while ( atomic_read(&r->nr_cpus) != total_cpus )
   62.35              cpu_relax();
   62.36 +        mb(); /* receive signal /then/ read r->master_stime */
   62.37      }
   62.38  
   62.39      rdtscll(c->local_tsc_stamp);
   62.40      c->stime_local_stamp = get_s_time();
   62.41      c->stime_master_stamp = r->master_stime;
   62.42  
   62.43 -    local_irq_enable();
   62.44 -
   62.45 -    /* Callback in softirq context as soon as possible. */
   62.46 -    set_timer(&c->softirq_callback, c->stime_local_stamp);
   62.47 +    raise_softirq(TIME_CALIBRATE_SOFTIRQ);
   62.48  }
   62.49  
   62.50  static void time_calibration(void *unused)
   62.51 @@ -1036,6 +1032,7 @@ static void time_calibration(void *unuse
   62.52          .nr_cpus = ATOMIC_INIT(0)
   62.53      };
   62.54  
   62.55 +    /* @wait=1 because we must wait for all cpus before freeing @r. */
   62.56      on_each_cpu(time_calibration_rendezvous, &r, 0, 1);
   62.57  }
   62.58  
   62.59 @@ -1053,9 +1050,6 @@ void init_percpu_time(void)
   62.60      t->stime_master_stamp = now;
   62.61      t->stime_local_stamp  = now;
   62.62  
   62.63 -    init_timer(&this_cpu(cpu_calibration).softirq_callback,
   62.64 -               local_time_calibration, NULL, smp_processor_id());
   62.65 -
   62.66      if ( smp_processor_id() == 0 )
   62.67      {
   62.68          init_timer(&calibration_timer, time_calibration, NULL, 0);
   62.69 @@ -1073,6 +1067,8 @@ int __init init_xen_time(void)
   62.70      if ( cpuid_edx(0x80000007) & (1u<<8) )
   62.71          tsc_invariant = 1;
   62.72  
   62.73 +    open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);
   62.74 +
   62.75      init_percpu_time();
   62.76  
   62.77      stime_platform_stamp = 0;
   62.78 @@ -1180,7 +1176,7 @@ int time_suspend(void)
   62.79      }
   62.80  
   62.81      /* Better to cancel calibration timer for accuracy. */
   62.82 -    kill_timer(&this_cpu(cpu_calibration).softirq_callback);
   62.83 +    clear_bit(TIME_CALIBRATE_SOFTIRQ, &softirq_pending(smp_processor_id()));
   62.84  
   62.85      return 0;
   62.86  }
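
The per-CPU calibration callback moves from a timer to a dedicated softirq, which takes the timer subsystem (and its heap manipulation) out of the rendezvous path entirely. The recipe, assembled from the hunks above plus the asm-x86/softirq.h change later in this changeset (all three identifiers are real ones from this patch):

    /* 1. Reserve an arch softirq number (xen/include/asm-x86/softirq.h). */
    #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)

    /* 2. Register the handler once, in init_xen_time(). */
    open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);

    /* 3. Raise it from the rendezvous IPI; the handler then runs in
     *    softirq context on the same CPU, with interrupts enabled. */
    raise_softirq(TIME_CALIBRATE_SOFTIRQ);
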
    63.1 --- a/xen/arch/x86/traps.c	Tue Sep 02 16:34:53 2008 -0700
    63.2 +++ b/xen/arch/x86/traps.c	Tue Sep 02 16:55:55 2008 -0700
    63.3 @@ -2124,6 +2124,36 @@ static int emulate_privileged_op(struct 
    63.4              if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
    63.5                  goto fail;
    63.6              break;
    63.7 +        case MSR_AMD64_NB_CFG:
    63.8 +            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
    63.9 +                 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
   63.10 +                goto fail;
   63.11 +            if ( !IS_PRIV(v->domain) )
   63.12 +                break;
   63.13 +            if ( (rdmsr_safe(MSR_AMD64_NB_CFG, l, h) != 0) ||
   63.14 +                 (eax != l) ||
   63.15 +                 ((edx ^ h) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
   63.16 +                goto invalid;
   63.17 +            if ( wrmsr_safe(MSR_AMD64_NB_CFG, eax, edx) != 0 )
   63.18 +                goto fail;
   63.19 +            break;
   63.20 +        case MSR_FAM10H_MMIO_CONF_BASE:
   63.21 +            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
   63.22 +                 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
   63.23 +                goto fail;
   63.24 +            if ( !IS_PRIV(v->domain) )
   63.25 +                break;
   63.26 +            if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, l, h) != 0) ||
   63.27 +                 (((((u64)h << 32) | l) ^ res) &
   63.28 +                  ~((1 << FAM10H_MMIO_CONF_ENABLE_BIT) |
   63.29 +                    (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
   63.30 +                     FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
   63.31 +                    ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
   63.32 +                     FAM10H_MMIO_CONF_BASE_SHIFT))) )
   63.33 +                goto invalid;
   63.34 +            if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, eax, edx) != 0 )
   63.35 +                goto fail;
   63.36 +            break;
   63.37          case MSR_IA32_PERF_CTL:
   63.38              if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
   63.39                  goto fail;
   63.40 @@ -2137,6 +2167,7 @@ static int emulate_privileged_op(struct 
   63.41                  break;
   63.42              if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
   63.43                   (eax != l) || (edx != h) )
   63.44 +        invalid:
   63.45                  gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
   63.46                          "%08x:%08x to %08x:%08x.\n",
   63.47                          _p(regs->ecx), h, l, edx, eax);
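
The MSR_AMD64_NB_CFG check above works on 32-bit halves, which obscures the policy: the low word must match exactly, and in the high word only the CF8 extended-config-space enable bit (bit 46 of the MSR) may change; anything else jumps to the `invalid' warning. The same test in 64-bit arithmetic (a restatement for clarity, not alternative code):

    uint64_t cur = ((uint64_t)h << 32) | l;      /* read back via rdmsr_safe() */
    uint64_t new = ((uint64_t)edx << 32) | eax;  /* value the guest wrote */
    uint64_t ok  = 1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT;  /* bit 46 */

    if ( (cur ^ new) & ~ok )
        goto invalid;                            /* touches a protected bit */
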
    64.1 --- a/xen/arch/x86/x86_64/compat/mm.c	Tue Sep 02 16:34:53 2008 -0700
    64.2 +++ b/xen/arch/x86/x86_64/compat/mm.c	Tue Sep 02 16:55:55 2008 -0700
    64.3 @@ -69,6 +69,20 @@ int compat_arch_memory_op(int op, XEN_GU
    64.4          break;
    64.5      }
    64.6  
    64.7 +    case XENMEM_remove_from_physmap:
    64.8 +    {
    64.9 +        struct compat_remove_from_physmap cmp;
   64.10 +        struct xen_remove_from_physmap *nat = (void *)COMPAT_ARG_XLAT_VIRT_BASE;
   64.11 +
   64.12 +        if ( copy_from_guest(&cmp, arg, 1) )
   64.13 +            return -EFAULT;
   64.14 +
   64.15 +        XLAT_remove_from_physmap(nat, &cmp);
   64.16 +        rc = arch_memory_op(op, guest_handle_from_ptr(nat, void));
   64.17 +
   64.18 +        break;
   64.19 +    }
   64.20 +
   64.21      case XENMEM_set_memory_map:
   64.22      {
   64.23          struct compat_foreign_memory_map cmp;
    65.1 --- a/xen/common/softirq.c	Tue Sep 02 16:34:53 2008 -0700
    65.2 +++ b/xen/common/softirq.c	Tue Sep 02 16:55:55 2008 -0700
    65.3 @@ -49,6 +49,7 @@ asmlinkage void do_softirq(void)
    65.4  
    65.5  void open_softirq(int nr, softirq_handler handler)
    65.6  {
    65.7 +    ASSERT(nr < NR_SOFTIRQS);
    65.8      softirq_handlers[nr] = handler;
    65.9  }
   65.10  
    66.1 --- a/xen/common/timer.c	Tue Sep 02 16:34:53 2008 -0700
    66.2 +++ b/xen/common/timer.c	Tue Sep 02 16:55:55 2008 -0700
    66.3 @@ -30,6 +30,7 @@
    66.4  struct timers {
    66.5      spinlock_t     lock;
    66.6      struct timer **heap;
    66.7 +    struct timer  *list;
    66.8      struct timer  *running;
    66.9  } __cacheline_aligned;
   66.10  
   66.11 @@ -86,13 +87,11 @@ static void up_heap(struct timer **heap,
   66.12  
   66.13  
   66.14  /* Delete @t from @heap. Return TRUE if new top of heap. */
   66.15 -static int remove_entry(struct timer **heap, struct timer *t)
   66.16 +static int remove_from_heap(struct timer **heap, struct timer *t)
   66.17  {
   66.18      int sz = GET_HEAP_SIZE(heap);
   66.19      int pos = t->heap_offset;
   66.20  
   66.21 -    t->heap_offset = 0;
   66.22 -
   66.23      if ( unlikely(pos == sz) )
   66.24      {
   66.25          SET_HEAP_SIZE(heap, sz-1);
   66.26 @@ -115,7 +114,7 @@ static int remove_entry(struct timer **h
   66.27  
   66.28  
   66.29  /* Add new entry @t to @heap. Return TRUE if new top of heap. */
   66.30 -static int add_entry(struct timer ***pheap, struct timer *t)
   66.31 +static int add_to_heap(struct timer ***pheap, struct timer *t)
   66.32  {
   66.33      struct timer **heap = *pheap;
   66.34      int sz = GET_HEAP_SIZE(heap);
   66.35 @@ -126,8 +125,11 @@ static int add_entry(struct timer ***phe
   66.36          /* old_limit == (2^n)-1; new_limit == (2^(n+4))-1 */
   66.37          int old_limit = GET_HEAP_LIMIT(heap);
   66.38          int new_limit = ((old_limit + 1) << 4) - 1;
   66.39 +        if ( in_irq() )
   66.40 +            goto out;
   66.41          heap = xmalloc_array(struct timer *, new_limit + 1);
   66.42 -        BUG_ON(heap == NULL);
   66.43 +        if ( heap == NULL )
   66.44 +            goto out;
   66.45          memcpy(heap, *pheap, (old_limit + 1) * sizeof(*heap));
   66.46          SET_HEAP_LIMIT(heap, new_limit);
   66.47          if ( old_limit != 0 )
   66.48 @@ -139,26 +141,95 @@ static int add_entry(struct timer ***phe
   66.49      heap[sz] = t;
   66.50      t->heap_offset = sz;
   66.51      up_heap(heap, sz);
   66.52 + out:
   66.53      return (t->heap_offset == 1);
   66.54  }
   66.55  
   66.56  
   66.57  /****************************************************************************
   66.58 + * LINKED LIST OPERATIONS.
   66.59 + */
   66.60 +
   66.61 +static int remove_from_list(struct timer **pprev, struct timer *t)
   66.62 +{
   66.63 +    struct timer *curr, **_pprev = pprev;
   66.64 +
   66.65 +    while ( (curr = *_pprev) != t )
   66.66 +        _pprev = &curr->list_next;
   66.67 +
   66.68 +    *_pprev = t->list_next;
   66.69 +
   66.70 +    return (_pprev == pprev);
   66.71 +}
   66.72 +
   66.73 +static int add_to_list(struct timer **pprev, struct timer *t)
   66.74 +{
   66.75 +    struct timer *curr, **_pprev = pprev;
   66.76 +
   66.77 +    while ( ((curr = *_pprev) != NULL) && (curr->expires <= t->expires) )
   66.78 +        _pprev = &curr->list_next;
   66.79 +
   66.80 +    t->list_next = curr;
   66.81 +    *_pprev = t;
   66.82 +
   66.83 +    return (_pprev == pprev);
   66.84 +}
   66.85 +
   66.86 +
   66.87 +/****************************************************************************
   66.88   * TIMER OPERATIONS.
   66.89   */
   66.90  
   66.91 +static int remove_entry(struct timers *timers, struct timer *t)
   66.92 +{
   66.93 +    int rc;
   66.94 +
   66.95 +    switch ( t->status )
   66.96 +    {
   66.97 +    case TIMER_STATUS_in_heap:
   66.98 +        rc = remove_from_heap(timers->heap, t);
   66.99 +        break;
  66.100 +    case TIMER_STATUS_in_list:
  66.101 +        rc = remove_from_list(&timers->list, t);
  66.102 +        break;
  66.103 +    default:
  66.104 +        rc = 0;
  66.105 +        BUG();
  66.106 +    }
  66.107 +
  66.108 +    t->status = TIMER_STATUS_inactive;
  66.109 +    return rc;
  66.110 +}
  66.111 +
  66.112 +static int add_entry(struct timers *timers, struct timer *t)
  66.113 +{
  66.114 +    int rc;
  66.115 +
  66.116 +    ASSERT(t->status == TIMER_STATUS_inactive);
  66.117 +
  66.118 +    /* Try to add to heap. t->heap_offset indicates whether we succeed. */
  66.119 +    t->heap_offset = 0;
  66.120 +    t->status = TIMER_STATUS_in_heap;
  66.121 +    rc = add_to_heap(&timers->heap, t);
  66.122 +    if ( t->heap_offset != 0 )
  66.123 +        return rc;
  66.124 +
  66.125 +    /* Fall back to adding to the slower linked list. */
  66.126 +    t->status = TIMER_STATUS_in_list;
  66.127 +    return add_to_list(&timers->list, t);
  66.128 +}
  66.129 +
  66.130  static inline void __add_timer(struct timer *timer)
  66.131  {
  66.132      int cpu = timer->cpu;
  66.133 -    if ( add_entry(&per_cpu(timers, cpu).heap, timer) )
  66.134 +    if ( add_entry(&per_cpu(timers, cpu), timer) )
  66.135          cpu_raise_softirq(cpu, TIMER_SOFTIRQ);
  66.136  }
  66.137  
  66.138 -
  66.139  static inline void __stop_timer(struct timer *timer)
  66.140  {
  66.141      int cpu = timer->cpu;
  66.142 -    if ( remove_entry(per_cpu(timers, cpu).heap, timer) )
  66.143 +    if ( remove_entry(&per_cpu(timers, cpu), timer) )
  66.144          cpu_raise_softirq(cpu, TIMER_SOFTIRQ);
  66.145  }
  66.146  
  66.147 @@ -203,7 +274,7 @@ void set_timer(struct timer *timer, s_ti
  66.148  
  66.149      timer->expires = expires;
  66.150  
  66.151 -    if ( likely(!timer->killed) )
  66.152 +    if ( likely(timer->status != TIMER_STATUS_killed) )
  66.153          __add_timer(timer);
  66.154  
  66.155      timer_unlock_irqrestore(timer, flags);
  66.156 @@ -278,7 +349,7 @@ void kill_timer(struct timer *timer)
  66.157  
  66.158      if ( active_timer(timer) )
  66.159          __stop_timer(timer);
  66.160 -    timer->killed = 1;
  66.161 +    timer->status = TIMER_STATUS_killed;
  66.162  
  66.163      timer_unlock_irqrestore(timer, flags);
  66.164  
  66.165 @@ -290,43 +361,76 @@ void kill_timer(struct timer *timer)
  66.166  
  66.167  static void timer_softirq_action(void)
  66.168  {
  66.169 -    struct timer  *t, **heap;
  66.170 +    struct timer  *t, **heap, *next;
  66.171      struct timers *ts;
  66.172 -    s_time_t       now;
  66.173 +    s_time_t       now, deadline;
  66.174      void         (*fn)(void *);
  66.175      void          *data;
  66.176  
  66.177      ts = &this_cpu(timers);
  66.178  
  66.179      spin_lock_irq(&ts->lock);
  66.180 -    
  66.181 -    do {
  66.182 -        heap = ts->heap;
  66.183 -        now  = NOW();
  66.184  
  66.185 -        while ( (GET_HEAP_SIZE(heap) != 0) &&
  66.186 -                ((t = heap[1])->expires < (now + TIMER_SLOP)) )
  66.187 -        {
  66.188 -            remove_entry(heap, t);
  66.189 +    /* Try to move timers from overflow linked list to more efficient heap. */
  66.190 +    next = ts->list;
  66.191 +    ts->list = NULL;
  66.192 +    while ( unlikely((t = next) != NULL) )
  66.193 +    {
  66.194 +        next = t->list_next;
  66.195 +        t->status = TIMER_STATUS_inactive;
  66.196 +        add_entry(ts, t);
  66.197 +    }
  66.198 +    
  66.199 +    heap = ts->heap;
  66.200 +    now  = NOW();
  66.201 +
  66.202 +    while ( (GET_HEAP_SIZE(heap) != 0) &&
  66.203 +            ((t = heap[1])->expires < (now + TIMER_SLOP)) )
  66.204 +    {
  66.205 +        remove_entry(ts, t);
  66.206 +
  66.207 +        ts->running = t;
  66.208  
  66.209 -            ts->running = t;
  66.210 +        fn   = t->function;
  66.211 +        data = t->data;
  66.212  
  66.213 -            fn   = t->function;
  66.214 -            data = t->data;
  66.215 +        spin_unlock_irq(&ts->lock);
  66.216 +        (*fn)(data);
  66.217 +        spin_lock_irq(&ts->lock);
  66.218  
  66.219 -            spin_unlock_irq(&ts->lock);
  66.220 -            (*fn)(data);
  66.221 -            spin_lock_irq(&ts->lock);
  66.222 +        /* Heap may have grown while the lock was released. */
  66.223 +        heap = ts->heap;
  66.224 +    }
  66.225 +
  66.226 +    deadline = GET_HEAP_SIZE(heap) ? heap[1]->expires : 0;
  66.227  
  66.228 -            /* Heap may have grown while the lock was released. */
  66.229 -            heap = ts->heap;
  66.230 +    while ( unlikely((t = ts->list) != NULL) )
  66.231 +    {
  66.232 +        if ( t->expires >= (now + TIMER_SLOP) )
  66.233 +        {
  66.234 +            if ( (deadline == 0) || (deadline > t->expires) )
  66.235 +                deadline = t->expires;
  66.236 +            break;
  66.237          }
  66.238  
  66.239 -        ts->running = NULL;
  66.240 +        ts->list = t->list_next;
  66.241 +        t->status = TIMER_STATUS_inactive;
  66.242 +
  66.243 +        ts->running = t;
  66.244 +
  66.245 +        fn   = t->function;
  66.246 +        data = t->data;
  66.247  
  66.248 -        this_cpu(timer_deadline) = GET_HEAP_SIZE(heap) ? heap[1]->expires : 0;
  66.249 +        spin_unlock_irq(&ts->lock);
  66.250 +        (*fn)(data);
  66.251 +        spin_lock_irq(&ts->lock);
  66.252      }
  66.253 -    while ( !reprogram_timer(this_cpu(timer_deadline)) );
  66.254 +
  66.255 +    ts->running = NULL;
  66.256 +
  66.257 +    this_cpu(timer_deadline) = deadline;
  66.258 +    if ( !reprogram_timer(deadline) )
  66.259 +        raise_softirq(TIMER_SOFTIRQ);
  66.260  
  66.261      spin_unlock_irq(&ts->lock);
  66.262  }
  66.263 @@ -364,6 +468,9 @@ static void dump_timerq(unsigned char ke
  66.264              printk ("  %d : %p ex=0x%08X%08X %p\n",
  66.265                      j, t, (u32)(t->expires>>32), (u32)t->expires, t->data);
  66.266          }
  66.267 +        for ( t = ts->list, j = 0; t != NULL; t = t->list_next, j++ )
  66.268 +            printk (" L%d : %p ex=0x%08X%08X %p\n",
  66.269 +                    j, t, (u32)(t->expires>>32), (u32)t->expires, t->data);
  66.270          spin_unlock_irqrestore(&ts->lock, flags);
  66.271          printk("\n");
  66.272      }
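
The shape of the timer change: the binary heap stays the fast path, but growing it requires xmalloc_array(), which can fail and must not be called from IRQ context; add_to_heap() can therefore decline, and add_entry() falls back to a sorted overflow list that timer_softirq_action() drains back into the heap on the next pass. The decision logic condensed (all names are from this file):

    /* Condensed from add_entry() above. */
    t->heap_offset = 0;
    t->status = TIMER_STATUS_in_heap;
    rc = add_to_heap(&timers->heap, t);   /* heap_offset stays 0 on failure */
    if ( t->heap_offset == 0 )
    {
        t->status = TIMER_STATUS_in_list; /* slower, but cannot fail */
        rc = add_to_list(&timers->list, t);
    }
    /* rc is TRUE if t became the new earliest deadline on this CPU. */
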
    67.1 --- a/xen/common/xmalloc.c	Tue Sep 02 16:34:53 2008 -0700
    67.2 +++ b/xen/common/xmalloc.c	Tue Sep 02 16:55:55 2008 -0700
    67.3 @@ -136,15 +136,14 @@ static void maybe_split(struct xmalloc_h
    67.4  static void *xmalloc_new_page(size_t size)
    67.5  {
    67.6      struct xmalloc_hdr *hdr;
    67.7 -    unsigned long flags;
    67.8  
    67.9      hdr = alloc_xenheap_page();
   67.10      if ( hdr == NULL )
   67.11          return NULL;
   67.12  
   67.13 -    spin_lock_irqsave(&freelist_lock, flags);
   67.14 +    spin_lock(&freelist_lock);
   67.15      maybe_split(hdr, size, PAGE_SIZE);
   67.16 -    spin_unlock_irqrestore(&freelist_lock, flags);
   67.17 +    spin_unlock(&freelist_lock);
   67.18  
   67.19      return data_from_header(hdr);
   67.20  }
   67.21 @@ -175,7 +174,6 @@ static inline size_t align_up(size_t siz
   67.22  void *_xmalloc(size_t size, size_t align)
   67.23  {
   67.24      struct xmalloc_hdr *i;
   67.25 -    unsigned long flags;
   67.26  
   67.27      ASSERT(!in_irq());
   67.28  
   67.29 @@ -196,17 +194,17 @@ void *_xmalloc(size_t size, size_t align
   67.30          return xmalloc_whole_pages(size);
   67.31  
   67.32      /* Search free list. */
   67.33 -    spin_lock_irqsave(&freelist_lock, flags);
   67.34 +    spin_lock(&freelist_lock);
   67.35      list_for_each_entry( i, &freelist, freelist )
   67.36      {
   67.37          if ( i->size < size )
   67.38              continue;
   67.39          del_from_freelist(i);
   67.40          maybe_split(i, size, i->size);
   67.41 -        spin_unlock_irqrestore(&freelist_lock, flags);
   67.42 +        spin_unlock(&freelist_lock);
   67.43          return data_from_header(i);
   67.44      }
   67.45 -    spin_unlock_irqrestore(&freelist_lock, flags);
   67.46 +    spin_unlock(&freelist_lock);
   67.47  
   67.48      /* Alloc a new page and return from that. */
   67.49      return xmalloc_new_page(size);
   67.50 @@ -214,7 +212,6 @@ void *_xmalloc(size_t size, size_t align
   67.51  
   67.52  void xfree(void *p)
   67.53  {
   67.54 -    unsigned long flags;
   67.55      struct xmalloc_hdr *i, *tmp, *hdr;
   67.56  
   67.57      ASSERT(!in_irq());
   67.58 @@ -238,7 +235,7 @@ void xfree(void *p)
   67.59      }
   67.60  
   67.61      /* Merge with other free block, or put in list. */
   67.62 -    spin_lock_irqsave(&freelist_lock, flags);
   67.63 +    spin_lock(&freelist_lock);
   67.64      list_for_each_entry_safe( i, tmp, &freelist, freelist )
   67.65      {
   67.66          unsigned long _i   = (unsigned long)i;
   67.67 @@ -275,7 +272,7 @@ void xfree(void *p)
   67.68          add_to_freelist(hdr);
   67.69      }
   67.70  
   67.71 -    spin_unlock_irqrestore(&freelist_lock, flags);
   67.72 +    spin_unlock(&freelist_lock);
   67.73  }
   67.74  
   67.75  /*
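
Dropping the _irqsave/_irqrestore variants is safe because _xmalloc() and xfree() both ASSERT(!in_irq()): the freelist lock is never taken from interrupt context, so there is no interrupt to mask against. This pairs with the timer change above, where add_to_heap() now refuses to call the allocator when in_irq(). The rule in miniature (the function is illustrative, not from this file):

    /* Illustrative: lock-variant choice when no taker runs in IRQ context. */
    static void freelist_example(void)
    {
        ASSERT(!in_irq());          /* holds for every caller ... */
        spin_lock(&freelist_lock);  /* ... so plain spin_lock() suffices */
        /* ... walk or modify the freelist ... */
        spin_unlock(&freelist_lock);
    }
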
    68.1 --- a/xen/drivers/passthrough/vtd/intremap.c	Tue Sep 02 16:34:53 2008 -0700
    68.2 +++ b/xen/drivers/passthrough/vtd/intremap.c	Tue Sep 02 16:55:55 2008 -0700
    68.3 @@ -43,7 +43,7 @@ u16 apicid_to_bdf(int apic_id)
    68.4      return 0;
    68.5  }
    68.6  
    68.7 -static void remap_entry_to_ioapic_rte(
    68.8 +static int remap_entry_to_ioapic_rte(
    68.9      struct iommu *iommu, struct IO_APIC_route_entry *old_rte)
   68.10  {
   68.11      struct iremap_entry *iremap_entry = NULL, *iremap_entries;
   68.12 @@ -56,15 +56,19 @@ static void remap_entry_to_ioapic_rte(
   68.13      {
   68.14          dprintk(XENLOG_ERR VTDPREFIX,
   68.15                  "remap_entry_to_ioapic_rte: ir_ctl is not ready\n");
   68.16 -        return;
   68.17 +        return -EFAULT;
   68.18      }
   68.19  
   68.20      remap_rte = (struct IO_APIC_route_remap_entry *) old_rte;
   68.21      index = (remap_rte->index_15 << 15) | remap_rte->index_0_14;
   68.22  
   68.23      if ( index > ir_ctrl->iremap_index )
   68.24 -        panic("%s: index (%d) is larger than remap table entry size (%d)!\n",
   68.25 -              __func__, index, ir_ctrl->iremap_index);
   68.26 +    {
   68.27 +        dprintk(XENLOG_ERR VTDPREFIX,
   68.28 +                "%s: index (%d) is larger than remap table entry size (%d)!\n",
   68.29 +                __func__, index, ir_ctrl->iremap_index);
   68.30 +        return -EFAULT;
   68.31 +    }
   68.32  
   68.33      spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);
   68.34  
   68.35 @@ -82,9 +86,10 @@ static void remap_entry_to_ioapic_rte(
   68.36  
   68.37      unmap_vtd_domain_page(iremap_entries);
   68.38      spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
   68.39 +    return 0;
   68.40  }
   68.41  
   68.42 -static void ioapic_rte_to_remap_entry(struct iommu *iommu,
   68.43 +static int ioapic_rte_to_remap_entry(struct iommu *iommu,
   68.44      int apic_id, struct IO_APIC_route_entry *old_rte,
   68.45      unsigned int rte_upper, unsigned int value)
   68.46  {
   68.47 @@ -108,7 +113,14 @@ static void ioapic_rte_to_remap_entry(st
   68.48          index = (remap_rte->index_15 << 15) | remap_rte->index_0_14;
   68.49  
   68.50      if ( index > IREMAP_ENTRY_NR - 1 )
   68.51 -        panic("ioapic_rte_to_remap_entry: intremap index is more than 256!\n");
   68.52 +    {
   68.53 +        dprintk(XENLOG_ERR VTDPREFIX,
   68.54 +                "%s: intremap index (%d) is larger than"
   68.55 +                " the maximum index (%ld)!\n",
   68.56 +                __func__, index, IREMAP_ENTRY_NR - 1);
   68.57 +        spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
   68.58 +        return -EFAULT;
   68.59 +    }
   68.60  
   68.61      iremap_entries =
   68.62          (struct iremap_entry *)map_vtd_domain_page(ir_ctrl->iremap_maddr);
   68.63 @@ -159,7 +171,7 @@ static void ioapic_rte_to_remap_entry(st
   68.64  
   68.65      unmap_vtd_domain_page(iremap_entries);
   68.66      spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
   68.67 -    return;
   68.68 +    return 0;
   68.69  }
   68.70  
   68.71  unsigned int io_apic_read_remap_rte(
   68.72 @@ -189,23 +201,22 @@ unsigned int io_apic_read_remap_rte(
   68.73  
   68.74      remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte;
   68.75  
   68.76 -    if ( remap_rte->mask || (remap_rte->format == 0) )
   68.77 +    if ( remap_rte->format == 0 )
   68.78      {
   68.79 -        *IO_APIC_BASE(apic) = reg;
   68.80 +        *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg;
   68.81          return *(IO_APIC_BASE(apic)+4);
   68.82      }
   68.83  
   68.84 -    remap_entry_to_ioapic_rte(iommu, &old_rte);
   68.85 -    if ( rte_upper )
   68.86 +    if ( remap_entry_to_ioapic_rte(iommu, &old_rte) )
   68.87      {
   68.88 -        *IO_APIC_BASE(apic) = reg + 1;
   68.89 -        return (*(((u32 *)&old_rte) + 1));
   68.90 +        *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg;
   68.91 +        return *(IO_APIC_BASE(apic)+4);
   68.92      }
   68.93 +
   68.94 +    if ( rte_upper )
   68.95 +        return (*(((u32 *)&old_rte) + 1));
   68.96      else
   68.97 -    {
   68.98 -        *IO_APIC_BASE(apic) = reg;
   68.99          return (*(((u32 *)&old_rte) + 0));
  68.100 -    }
  68.101  }
  68.102  
  68.103  void io_apic_write_remap_rte(
  68.104 @@ -243,8 +254,13 @@ void io_apic_write_remap_rte(
  68.105      *(IO_APIC_BASE(apic)+4) = *(((int *)&old_rte)+0);
  68.106      remap_rte->mask = saved_mask;
  68.107  
  68.108 -    ioapic_rte_to_remap_entry(iommu, mp_ioapics[apic].mpc_apicid,
  68.109 -                              &old_rte, rte_upper, value);
  68.110 +    if ( ioapic_rte_to_remap_entry(iommu, mp_ioapics[apic].mpc_apicid,
  68.111 +                                   &old_rte, rte_upper, value) )
  68.112 +    {
  68.113 +        *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg;
  68.114 +        *(IO_APIC_BASE(apic)+4) = value;
  68.115 +        return;
  68.116 +    }
  68.117  
  68.118      /* write new entry to ioapic */
  68.119      *IO_APIC_BASE(apic) = reg;
  68.120 @@ -253,7 +269,7 @@ void io_apic_write_remap_rte(
  68.121      *(IO_APIC_BASE(apic)+4) = *(((u32 *)&old_rte)+1);
  68.122  }
  68.123  
  68.124 -static void remap_entry_to_msi_msg(
  68.125 +static int remap_entry_to_msi_msg(
  68.126      struct iommu *iommu, struct msi_msg *msg)
  68.127  {
  68.128      struct iremap_entry *iremap_entry = NULL, *iremap_entries;
  68.129 @@ -266,7 +282,7 @@ static void remap_entry_to_msi_msg(
  68.130      {
  68.131          dprintk(XENLOG_ERR VTDPREFIX,
  68.132                  "remap_entry_to_msi_msg: ir_ctl == NULL");
  68.133 -        return;
  68.134 +        return -EFAULT;
  68.135      }
  68.136  
  68.137      remap_rte = (struct msi_msg_remap_entry *) msg;
  68.138 @@ -274,8 +290,12 @@ static void remap_entry_to_msi_msg(
  68.139               remap_rte->address_lo.index_0_14;
  68.140  
  68.141      if ( index > ir_ctrl->iremap_index )
  68.142 -        panic("%s: index (%d) is larger than remap table entry size (%d)\n",
  68.143 -              __func__, index, ir_ctrl->iremap_index);
  68.144 +    {
  68.145 +        dprintk(XENLOG_ERR VTDPREFIX,
  68.146 +                "%s: index (%d) is larger than remap table entry size (%d)\n",
  68.147 +                __func__, index, ir_ctrl->iremap_index);
  68.148 +        return -EFAULT;
  68.149 +    }
  68.150  
  68.151      spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);
  68.152  
  68.153 @@ -304,9 +324,10 @@ static void remap_entry_to_msi_msg(
  68.154  
  68.155      unmap_vtd_domain_page(iremap_entries);
  68.156      spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
  68.157 +    return 0;
  68.158  }
  68.159  
  68.160 -static void msi_msg_to_remap_entry(
  68.161 +static int msi_msg_to_remap_entry(
  68.162      struct iommu *iommu, struct pci_dev *pdev, struct msi_msg *msg)
  68.163  {
  68.164      struct iremap_entry *iremap_entry = NULL, *iremap_entries;
  68.165 @@ -343,7 +364,15 @@ static void msi_msg_to_remap_entry(
  68.166          index = i;
  68.167  
  68.168      if ( index > IREMAP_ENTRY_NR - 1 )
  68.169 -        panic("msi_msg_to_remap_entry: intremap index is more than 256!\n");
  68.170 +    {
  68.171 +        dprintk(XENLOG_ERR VTDPREFIX,
  68.172 +                "%s: intremap index (%d) is larger than"
  68.173 +                " the maximum index (%ld)!\n",
  68.174 +                __func__, index, IREMAP_ENTRY_NR - 1);
  68.175 +        unmap_vtd_domain_page(iremap_entries);
  68.176 +        spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
  68.177 +        return -EFAULT;
  68.178 +    }
  68.179  
  68.180      iremap_entry = &iremap_entries[index];
  68.181      memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry));
  68.182 @@ -385,7 +414,7 @@ static void msi_msg_to_remap_entry(
  68.183  
  68.184      unmap_vtd_domain_page(iremap_entries);
  68.185      spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
  68.186 -    return;
  68.187 +    return 0;
  68.188  }
  68.189  
  68.190  void msi_msg_read_remap_rte(
    69.1 --- a/xen/drivers/passthrough/vtd/iommu.c	Tue Sep 02 16:34:53 2008 -0700
    69.2 +++ b/xen/drivers/passthrough/vtd/iommu.c	Tue Sep 02 16:55:55 2008 -0700
    69.3 @@ -624,15 +624,10 @@ static int iommu_set_root_entry(struct i
    69.4      unsigned long flags;
    69.5      s_time_t start_time;
    69.6  
    69.7 -    if ( iommu->root_maddr != 0 )
    69.8 -    {
    69.9 -        free_pgtable_maddr(iommu->root_maddr);
   69.10 -        iommu->root_maddr = 0;
   69.11 -    }
   69.12 -
   69.13      spin_lock_irqsave(&iommu->register_lock, flags);
   69.14  
   69.15 -    iommu->root_maddr = alloc_pgtable_maddr();
   69.16 +    if ( iommu->root_maddr == 0 )
   69.17 +        iommu->root_maddr = alloc_pgtable_maddr();
   69.18      if ( iommu->root_maddr == 0 )
   69.19      {
   69.20          spin_unlock_irqrestore(&iommu->register_lock, flags);
   69.21 @@ -1864,37 +1859,31 @@ static int intel_iommu_group_id(u8 bus, 
   69.22          return -1;
   69.23  }
   69.24  
   69.25 -u8 iommu_state[MAX_IOMMU_REGS * MAX_IOMMUS];
   69.26 +static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
   69.27  int iommu_suspend(void)
   69.28  {
   69.29      struct acpi_drhd_unit *drhd;
   69.30      struct iommu *iommu;
   69.31 -    int i = 0;
   69.32 +    u32    i;
   69.33 +
   69.34 +    if ( !vtd_enabled )
   69.35 +        return 0;
   69.36  
   69.37      iommu_flush_all();
   69.38  
   69.39      for_each_drhd_unit ( drhd )
   69.40      {
   69.41          iommu = drhd->iommu;
   69.42 -        iommu_state[DMAR_RTADDR_REG * i] =
   69.43 -            (u64) dmar_readq(iommu->reg, DMAR_RTADDR_REG);
   69.44 -        iommu_state[DMAR_FECTL_REG * i] =
   69.45 +        i = iommu->index;
   69.46 +
   69.47 +        iommu_state[i][DMAR_FECTL_REG] =
   69.48              (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
   69.49 -        iommu_state[DMAR_FEDATA_REG * i] =
   69.50 +        iommu_state[i][DMAR_FEDATA_REG] =
   69.51              (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
   69.52 -        iommu_state[DMAR_FEADDR_REG * i] =
   69.53 +        iommu_state[i][DMAR_FEADDR_REG] =
   69.54              (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
   69.55 -        iommu_state[DMAR_FEUADDR_REG * i] =
   69.56 +        iommu_state[i][DMAR_FEUADDR_REG] =
   69.57              (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
   69.58 -        iommu_state[DMAR_PLMBASE_REG * i] =
   69.59 -            (u32) dmar_readl(iommu->reg, DMAR_PLMBASE_REG);
   69.60 -        iommu_state[DMAR_PLMLIMIT_REG * i] =
   69.61 -            (u32) dmar_readl(iommu->reg, DMAR_PLMLIMIT_REG);
   69.62 -        iommu_state[DMAR_PHMBASE_REG * i] =
   69.63 -            (u64) dmar_readq(iommu->reg, DMAR_PHMBASE_REG);
   69.64 -        iommu_state[DMAR_PHMLIMIT_REG * i] =
   69.65 -            (u64) dmar_readq(iommu->reg, DMAR_PHMLIMIT_REG);
   69.66 -        i++;
   69.67      }
   69.68  
   69.69      return 0;
   69.70 @@ -1904,37 +1893,34 @@ int iommu_resume(void)
   69.71  {
   69.72      struct acpi_drhd_unit *drhd;
   69.73      struct iommu *iommu;
   69.74 -    int i = 0;
   69.75 +    u32 i;
   69.76 +
   69.77 +    if ( !vtd_enabled )
   69.78 +        return 0;
   69.79  
   69.80      iommu_flush_all();
   69.81  
   69.82 -    init_vtd_hw();
   69.83 +    if ( init_vtd_hw() != 0  && force_iommu )
    69.84 +         panic("IOMMU setup failed, crash Xen for security purposes!\n");
   69.85 +
   69.86      for_each_drhd_unit ( drhd )
   69.87      {
   69.88          iommu = drhd->iommu;
   69.89 -        dmar_writeq( iommu->reg, DMAR_RTADDR_REG,
   69.90 -                     (u64) iommu_state[DMAR_RTADDR_REG * i]);
   69.91 +        i = iommu->index;
   69.92 +
   69.93          dmar_writel(iommu->reg, DMAR_FECTL_REG,
   69.94 -                    (u32) iommu_state[DMAR_FECTL_REG * i]);
   69.95 +                    (u32) iommu_state[i][DMAR_FECTL_REG]);
   69.96          dmar_writel(iommu->reg, DMAR_FEDATA_REG,
   69.97 -                    (u32) iommu_state[DMAR_FEDATA_REG * i]);
   69.98 +                    (u32) iommu_state[i][DMAR_FEDATA_REG]);
   69.99          dmar_writel(iommu->reg, DMAR_FEADDR_REG,
  69.100 -                    (u32) iommu_state[DMAR_FEADDR_REG * i]);
  69.101 +                    (u32) iommu_state[i][DMAR_FEADDR_REG]);
  69.102          dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
  69.103 -                    (u32) iommu_state[DMAR_FEUADDR_REG * i]);
  69.104 -        dmar_writel(iommu->reg, DMAR_PLMBASE_REG,
  69.105 -                    (u32) iommu_state[DMAR_PLMBASE_REG * i]);
  69.106 -        dmar_writel(iommu->reg, DMAR_PLMLIMIT_REG,
  69.107 -                    (u32) iommu_state[DMAR_PLMLIMIT_REG * i]);
  69.108 -        dmar_writeq(iommu->reg, DMAR_PHMBASE_REG,
  69.109 -                    (u64) iommu_state[DMAR_PHMBASE_REG * i]);
  69.110 -        dmar_writeq(iommu->reg, DMAR_PHMLIMIT_REG,
  69.111 -                    (u64) iommu_state[DMAR_PHMLIMIT_REG * i]);
  69.112 +                    (u32) iommu_state[i][DMAR_FEUADDR_REG]);
  69.113  
  69.114          if ( iommu_enable_translation(iommu) )
  69.115              return -EIO;
  69.116 -        i++;
  69.117      }
  69.118 +
  69.119      return 0;
  69.120  }
  69.121  
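
Two fixes in one here: the root-table and protected-memory registers are no longer saved or restored (the root table address is reprogrammed through iommu_set_root_entry(), which per the first hunk of this file now reuses the existing allocation), and the save area becomes a real two-dimensional array. The old flat indexing iommu_state[REG * i] was broken: for the first unit (i == 0) every register landed in slot 0, and distinct (register, unit) pairs could alias. Sketch of the fixed save step (save_one_reg() is an illustrative wrapper):

    static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];

    /* Illustrative: one unique slot per (unit, register) pair. */
    static void save_one_reg(struct iommu *iommu, unsigned int reg)
    {
        iommu_state[iommu->index][reg] = dmar_readl(iommu->reg, reg);
    }
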
    70.1 --- a/xen/include/asm-x86/io_apic.h	Tue Sep 02 16:34:53 2008 -0700
    70.2 +++ b/xen/include/asm-x86/io_apic.h	Tue Sep 02 16:55:55 2008 -0700
    70.3 @@ -125,7 +125,7 @@ extern int mpc_default_type;
    70.4  
    70.5  static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
    70.6  {
    70.7 -	if (vtd_enabled)
    70.8 +	if (iommu_enabled)
    70.9  		return io_apic_read_remap_rte(apic, reg);
   70.10  	*IO_APIC_BASE(apic) = reg;
   70.11  	return *(IO_APIC_BASE(apic)+4);
   70.12 @@ -152,6 +152,8 @@ extern int sis_apic_bug;
   70.13  #endif
   70.14  static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
   70.15  {
   70.16 +	if (iommu_enabled)
   70.17 +		return iommu_update_ire_from_apic(apic, reg, value);
   70.18  	if (sis_apic_bug)
   70.19  		*IO_APIC_BASE(apic) = reg;
   70.20  	*(IO_APIC_BASE(apic)+4) = value;
    71.1 --- a/xen/include/asm-x86/mm.h	Tue Sep 02 16:34:53 2008 -0700
    71.2 +++ b/xen/include/asm-x86/mm.h	Tue Sep 02 16:55:55 2008 -0700
    71.3 @@ -59,6 +59,17 @@ struct page_info
    71.4          u32 tlbflush_timestamp;
    71.5  
    71.6          /*
     71.7 +         * When PGT_partial is true then these fields are valid and
     71.8 +         * indicate that PTEs in the range [0, @nr_validated_ptes) have
     71.9 +         * been validated. If @partial_pte is true then the PTE at
    71.10 +         * @nr_validated_ptes has been partially validated.
   71.11 +         */
   71.12 +        struct {
   71.13 +            u16 nr_validated_ptes;
   71.14 +            bool_t partial_pte;
   71.15 +        };
   71.16 +
   71.17 +        /*
   71.18           * Guest pages with a shadow.  This does not conflict with
   71.19           * tlbflush_timestamp since page table pages are explicitly not
   71.20           * tracked for TLB-flush avoidance when a guest runs in shadow mode.
   71.21 @@ -86,9 +97,12 @@ struct page_info
   71.22   /* PAE only: is this an L2 page directory containing Xen-private mappings? */
   71.23  #define _PGT_pae_xen_l2     26
   71.24  #define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)
   71.25 +/* Has this page been *partially* validated for use as its current type? */
   71.26 +#define _PGT_partial        25
   71.27 +#define PGT_partial         (1U<<_PGT_partial)
   71.28  
   71.29 - /* 26-bit count of uses of this frame as its current type. */
   71.30 -#define PGT_count_mask      ((1U<<26)-1)
   71.31 + /* 25-bit count of uses of this frame as its current type. */
   71.32 +#define PGT_count_mask      ((1U<<25)-1)
   71.33  
   71.34   /* Cleared when the owning guest 'frees' this page. */
   71.35  #define _PGC_allocated      31
   71.36 @@ -154,7 +168,8 @@ extern unsigned long max_page;
   71.37  extern unsigned long total_pages;
   71.38  void init_frametable(void);
   71.39  
   71.40 -void free_page_type(struct page_info *page, unsigned long type);
   71.41 +int free_page_type(struct page_info *page, unsigned long type,
   71.42 +                   int preemptible);
   71.43  int _shadow_mode_refcounts(struct domain *d);
   71.44  
   71.45  void cleanup_page_cacheattr(struct page_info *page);
   71.46 @@ -165,6 +180,8 @@ void put_page(struct page_info *page);
   71.47  int  get_page(struct page_info *page, struct domain *domain);
   71.48  void put_page_type(struct page_info *page);
   71.49  int  get_page_type(struct page_info *page, unsigned long type);
   71.50 +int  put_page_type_preemptible(struct page_info *page);
   71.51 +int  get_page_type_preemptible(struct page_info *page, unsigned long type);
   71.52  int  get_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
   71.53  void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
   71.54  
   71.55 @@ -174,6 +191,19 @@ static inline void put_page_and_type(str
   71.56      put_page(page);
   71.57  }
   71.58  
   71.59 +static inline int put_page_and_type_preemptible(struct page_info *page,
   71.60 +                                                int preemptible)
   71.61 +{
   71.62 +    int rc = 0;
   71.63 +
   71.64 +    if ( preemptible )
   71.65 +        rc = put_page_type_preemptible(page);
   71.66 +    else
   71.67 +        put_page_type(page);
   71.68 +    if ( likely(rc == 0) )
   71.69 +        put_page(page);
   71.70 +    return rc;
   71.71 +}
   71.72  
   71.73  static inline int get_page_and_type(struct page_info *page,
   71.74                                      struct domain *domain,
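
The new preemptible variants let page-table validation and teardown stop part-way: PGT_partial plus the nr_validated_ptes/partial_pte fields record how far a table got, and the *_preemptible calls return a negative code instead of running to completion. For illustration, a caller might look like the sketch below; the exact error code and the continuation plumbing are assumptions here, the real logic lives in xen/arch/x86/mm.c.

    /* Illustrative sketch only -- not Xen code.  Assumes the preemptible
     * put returns 0 on completion and a negative value (the implementation
     * uses codes such as -EAGAIN/-EINTR) when it stopped early with
     * progress parked in page->nr_validated_ptes / page->partial_pte. */
    static int relinquish_page(struct page_info *page)
    {
        int rc = put_page_and_type_preemptible(page, 1 /* preemptible */);

        if ( rc )
            return rc;  /* arrange a hypercall continuation; retry later */

        return 0;       /* type and general references both dropped */
    }
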
    72.1 --- a/xen/include/asm-x86/msr-index.h	Tue Sep 02 16:34:53 2008 -0700
    72.2 +++ b/xen/include/asm-x86/msr-index.h	Tue Sep 02 16:55:55 2008 -0700
    72.3 @@ -187,15 +187,30 @@
    72.4  #define MSR_K8_VM_CR			0xc0010114
    72.5  #define MSR_K8_VM_HSAVE_PA		0xc0010117
    72.6  
    72.7 +#define MSR_K8_FEATURE_MASK		0xc0011004
    72.8 +#define MSR_K8_EXT_FEATURE_MASK		0xc0011005
    72.9 +
   72.10  /* MSR_K8_VM_CR bits: */
   72.11  #define _K8_VMCR_SVME_DISABLE		4
   72.12  #define K8_VMCR_SVME_DISABLE		(1 << _K8_VMCR_SVME_DISABLE)
   72.13  
   72.14 +/* AMD64 MSRs */
   72.15 +#define MSR_AMD64_NB_CFG		0xc001001f
   72.16 +#define AMD64_NB_CFG_CF8_EXT_ENABLE_BIT	46
   72.17 +
   72.18  /* AMD Family10h machine check MSRs */
   72.19  #define MSR_F10_MC4_MISC1		0xc0000408
   72.20  #define MSR_F10_MC4_MISC2		0xc0000409
   72.21  #define MSR_F10_MC4_MISC3		0xc000040A
   72.22  
   72.23 +/* Other AMD Fam10h MSRs */
   72.24 +#define MSR_FAM10H_MMIO_CONF_BASE	0xc0010058
   72.25 +#define FAM10H_MMIO_CONF_ENABLE_BIT	0
   72.26 +#define FAM10H_MMIO_CONF_BUSRANGE_MASK	0xf
   72.27 +#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
   72.28 +#define FAM10H_MMIO_CONF_BASE_MASK	0xfffffff
   72.29 +#define FAM10H_MMIO_CONF_BASE_SHIFT	20
   72.30 +
   72.31  /* K6 MSRs */
   72.32  #define MSR_K6_EFER			0xc0000080
   72.33  #define MSR_K6_STAR			0xc0000081
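
The FAM10H_MMIO_CONF_* constants describe the layout of MSR_FAM10H_MMIO_CONF_BASE (0xc0010058): bit 0 is the enable, bits 5:2 hold a bus-range exponent, and bits 47:20 hold the base address of the MMIO config window. A small decode sketch using only the constants above ('msr' stands in for the result of reading the MSR):

    #include <stdint.h>

    /* Illustrative decode of MSR_FAM10H_MMIO_CONF_BASE -- not Xen code. */
    static void decode_mmio_conf(uint64_t msr)
    {
        int enabled = (msr >> FAM10H_MMIO_CONF_ENABLE_BIT) & 1;
        unsigned int busrange = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
                                FAM10H_MMIO_CONF_BUSRANGE_MASK;
        uint64_t base = msr & ((uint64_t)FAM10H_MMIO_CONF_BASE_MASK <<
                               FAM10H_MMIO_CONF_BASE_SHIFT);

        /* The window maps config space for (1 << busrange) buses,
         * starting at 'base'. */
        (void)enabled; (void)base;
    }
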
    73.1 --- a/xen/include/asm-x86/processor.h	Tue Sep 02 16:34:53 2008 -0700
    73.2 +++ b/xen/include/asm-x86/processor.h	Tue Sep 02 16:55:55 2008 -0700
    73.3 @@ -583,6 +583,8 @@ int rdmsr_hypervisor_regs(
    73.4  int wrmsr_hypervisor_regs(
    73.5      uint32_t idx, uint32_t eax, uint32_t edx);
    73.6  
    73.7 +int microcode_update(XEN_GUEST_HANDLE(const_void), unsigned long len);
    73.8 +
    73.9  #endif /* !__ASSEMBLY__ */
   73.10  
   73.11  #endif /* __ASM_X86_PROCESSOR_H */
    74.1 --- a/xen/include/asm-x86/softirq.h	Tue Sep 02 16:34:53 2008 -0700
    74.2 +++ b/xen/include/asm-x86/softirq.h	Tue Sep 02 16:55:55 2008 -0700
    74.3 @@ -1,8 +1,9 @@
    74.4  #ifndef __ASM_SOFTIRQ_H__
    74.5  #define __ASM_SOFTIRQ_H__
    74.6  
    74.7 -#define NMI_MCE_SOFTIRQ     (NR_COMMON_SOFTIRQS + 0)
    74.8 +#define NMI_MCE_SOFTIRQ        (NR_COMMON_SOFTIRQS + 0)
    74.9 +#define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
   74.10  
   74.11 -#define NR_ARCH_SOFTIRQS    1
   74.12 +#define NR_ARCH_SOFTIRQS       2
   74.13  
   74.14  #endif /* __ASM_SOFTIRQ_H__ */
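
Time calibration now gets its own arch-specific softirq slot, hence NR_ARCH_SOFTIRQS growing to 2. The wiring follows the usual pattern, sketched below against Xen's existing open_softirq()/raise_softirq() interfaces; the handler body and the init/caller hooks are placeholders, the real handler lives in xen/arch/x86/time.c.

    /* Sketch of the usual softirq wiring -- hypothetical names throughout. */
    static void time_calibration_softirq(void)       /* placeholder body */
    {
        /* ... rendezvous the CPUs and refresh their calibration ... */
    }

    static void init_time_calibration(void)          /* hypothetical init hook */
    {
        open_softirq(TIME_CALIBRATE_SOFTIRQ, time_calibration_softirq);
    }

    static void on_local_timer_tick(void)            /* hypothetical caller */
    {
        raise_softirq(TIME_CALIBRATE_SOFTIRQ);       /* defer work to softirq */
    }
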
    75.1 --- a/xen/include/public/memory.h	Tue Sep 02 16:34:53 2008 -0700
    75.2 +++ b/xen/include/public/memory.h	Tue Sep 02 16:55:55 2008 -0700
    75.3 @@ -204,6 +204,7 @@ struct xen_add_to_physmap {
    75.4      /* Source mapping space. */
    75.5  #define XENMAPSPACE_shared_info 0 /* shared info page */
    75.6  #define XENMAPSPACE_grant_table 1 /* grant table page */
    75.7 +#define XENMAPSPACE_mfn         2 /* usual MFN */
    75.8      unsigned int space;
    75.9  
   75.10      /* Index into source mapping space. */
   75.11 @@ -216,6 +217,22 @@ typedef struct xen_add_to_physmap xen_ad
   75.12  DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
   75.13  
   75.14  /*
   75.15 + * Unmaps the page appearing at a particular GPFN from the specified guest's
   75.16 + * pseudophysical address space.
   75.17 + * arg == addr of xen_remove_from_physmap_t.
   75.18 + */
   75.19 +#define XENMEM_remove_from_physmap      15
   75.20 +struct xen_remove_from_physmap {
   75.21 +    /* Which domain to change the mapping for. */
   75.22 +    domid_t domid;
   75.23 +
   75.24 +    /* GPFN of the current mapping of the page. */
   75.25 +    xen_pfn_t     gpfn;
   75.26 +};
   75.27 +typedef struct xen_remove_from_physmap xen_remove_from_physmap_t;
   75.28 +DEFINE_XEN_GUEST_HANDLE(xen_remove_from_physmap_t);
   75.29 +
   75.30 +/*
   75.31   * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
   75.32   * code on failure. This call only works for auto-translated guests.
   75.33   */
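
A hedged guest-side sketch of the new call: HYPERVISOR_memory_op() is the hypercall wrapper a guest OS provides, and everything else comes straight from the structure above.

    /* Illustrative caller -- assumes the guest's HYPERVISOR_memory_op()
     * hypercall wrapper. */
    static int unmap_gpfn(xen_pfn_t gpfn)
    {
        struct xen_remove_from_physmap xrfp = {
            .domid = DOMID_SELF,  /* change own pseudophysical space  */
            .gpfn  = gpfn,        /* GPFN currently mapping the page  */
        };

        return HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrfp);
    }
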
    76.1 --- a/xen/include/public/platform.h	Tue Sep 02 16:34:53 2008 -0700
    76.2 +++ b/xen/include/public/platform.h	Tue Sep 02 16:55:55 2008 -0700
    76.3 @@ -97,7 +97,7 @@ DEFINE_XEN_GUEST_HANDLE(xenpf_read_memty
    76.4  #define XENPF_microcode_update    35
    76.5  struct xenpf_microcode_update {
    76.6      /* IN variables. */
    76.7 -    XEN_GUEST_HANDLE(void) data;      /* Pointer to microcode data */
    76.8 +    XEN_GUEST_HANDLE(const_void) data;/* Pointer to microcode data */
    76.9      uint32_t length;                  /* Length of microcode data. */
   76.10  };
   76.11  typedef struct xenpf_microcode_update xenpf_microcode_update_t;
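
Constifying the data handle documents that the hypervisor only reads the blob; it pairs with the microcode_update(XEN_GUEST_HANDLE(const_void), ...) prototype added to asm-x86/processor.h above. A dom0-side sketch of issuing the op (the hypercall wrapper is the caller's, assumed here):

    /* Illustrative dom0 caller -- HYPERVISOR_platform_op() is assumed to
     * be the guest-side wrapper for the platform hypercall. */
    static int load_microcode(const void *blob, uint32_t len)
    {
        struct xen_platform_op op = {
            .cmd = XENPF_microcode_update,
            .interface_version = XENPF_INTERFACE_VERSION,
        };

        set_xen_guest_handle(op.u.microcode.data, blob);
        op.u.microcode.length = len;
        return HYPERVISOR_platform_op(&op);
    }
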
    77.1 --- a/xen/include/xen/compat.h	Tue Sep 02 16:34:53 2008 -0700
    77.2 +++ b/xen/include/xen/compat.h	Tue Sep 02 16:55:55 2008 -0700
    77.3 @@ -19,7 +19,9 @@
    77.4          type *_[0] __attribute__((__packed__)); \
    77.5      } __compat_handle_ ## name
    77.6  
    77.7 -#define DEFINE_COMPAT_HANDLE(name)   __DEFINE_COMPAT_HANDLE(name, name)
    77.8 +#define DEFINE_COMPAT_HANDLE(name) \
    77.9 +    __DEFINE_COMPAT_HANDLE(name, name); \
   77.10 +    __DEFINE_COMPAT_HANDLE(const_ ## name, const name)
   77.11  #define COMPAT_HANDLE(name)          __compat_handle_ ## name
   77.12  
   77.13  /* Is the compat handle a NULL reference? */
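
Each DEFINE_COMPAT_HANDLE(name) now emits a read-only flavour alongside the writable one. Hand-expanded below for a hypothetical type foo; the compat_ptr_t member is recalled from the full __DEFINE_COMPAT_HANDLE macro, which the hunk only shows the tail of.

    /* Rough hand expansion of DEFINE_COMPAT_HANDLE(foo) after this change: */
    typedef struct {
        compat_ptr_t c;
        foo *_[0] __attribute__((__packed__));
    } __compat_handle_foo;

    typedef struct {
        compat_ptr_t c;
        const foo *_[0] __attribute__((__packed__));
    } __compat_handle_const_foo;

    /* ...so COMPAT_HANDLE(const_foo) can describe read-only guest data,
     * mirroring the const_void guest handle used for microcode above. */
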
    78.1 --- a/xen/include/xen/iommu.h	Tue Sep 02 16:34:53 2008 -0700
    78.2 +++ b/xen/include/xen/iommu.h	Tue Sep 02 16:55:55 2008 -0700
    78.3 @@ -109,4 +109,8 @@ struct iommu_ops {
    78.4  
    78.5  void iommu_update_ire_from_apic(unsigned int apic, unsigned int reg, unsigned int value);
    78.6  void iommu_update_ire_from_msi(struct msi_desc *msi_desc, struct msi_msg *msg);
    78.7 +
    78.8 +int iommu_suspend(void);
    78.9 +int iommu_resume(void);
   78.10 +
   78.11  #endif /* _IOMMU_H_ */
    79.1 --- a/xen/include/xen/timer.h	Tue Sep 02 16:34:53 2008 -0700
    79.2 +++ b/xen/include/xen/timer.h	Tue Sep 02 16:55:55 2008 -0700
    79.3 @@ -14,16 +14,29 @@
    79.4  
    79.5  struct timer {
    79.6      /* System time expiry value (nanoseconds since boot). */
    79.7 -    s_time_t      expires;
    79.8 -    /* CPU on which this timer will be installed and executed. */
    79.9 -    unsigned int  cpu;
   79.10 +    s_time_t expires;
   79.11 +
   79.12 +    /* Position in active-timer data structure. */
   79.13 +    union {
   79.14 +        /* Timer-heap offset. */
   79.15 +        unsigned int heap_offset;
   79.16 +        /* Overflow linked list. */
   79.17 +        struct timer *list_next;
   79.18 +    };
   79.19 +
   79.20      /* On expiry, '(*function)(data)' will be executed in softirq context. */
   79.21 -    void        (*function)(void *);
   79.22 -    void         *data;
   79.23 -    /* Timer-heap offset. */
   79.24 -    unsigned int  heap_offset;
   79.25 -    /* Has this timer been killed (cannot be activated)? */
   79.26 -    int           killed;
   79.27 +    void (*function)(void *);
   79.28 +    void *data;
   79.29 +
   79.30 +    /* CPU on which this timer will be installed and executed. */
   79.31 +    uint16_t cpu;
   79.32 +
   79.33 +    /* Timer status. */
   79.34 +#define TIMER_STATUS_inactive 0 /* Not in use; can be activated.    */
    79.35 +#define TIMER_STATUS_killed   1 /* Not in use; cannot be activated. */

   79.36 +#define TIMER_STATUS_in_heap  2 /* In use; on timer heap.           */
   79.37 +#define TIMER_STATUS_in_list  3 /* In use; on overflow linked list. */
   79.38 +    uint8_t status;
   79.39  };
   79.40  
   79.41  /*
   79.42 @@ -37,7 +50,7 @@ struct timer {
   79.43   */
   79.44  static inline int active_timer(struct timer *timer)
   79.45  {
   79.46 -    return (timer->heap_offset != 0);
   79.47 +    return (timer->status >= TIMER_STATUS_in_heap);
   79.48  }
   79.49  
   79.50  /*
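
The explicit status field replaces the old heap_offset!=0 / killed pair: inactive and killed are the two not-in-use states, and in_heap/in_list say which arm of the new union is live (heap_offset vs. list_next). Ordering the values so the in-use states are the largest lets active_timer() be a single comparison, as the rewritten body above shows; two illustrative helpers in the same spirit:

    /* Illustrative helpers mirroring the new encoding (not in the patch): */
    static inline int timer_in_use(const struct timer *t)
    {
        return t->status >= TIMER_STATUS_in_heap;   /* in_heap or in_list */
    }

    static inline int timer_can_activate(const struct timer *t)
    {
        return t->status == TIMER_STATUS_inactive;  /* killed stays dead */
    }
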
    80.1 --- a/xen/include/xlat.lst	Tue Sep 02 16:34:53 2008 -0700
    80.2 +++ b/xen/include/xlat.lst	Tue Sep 02 16:55:55 2008 -0700
    80.3 @@ -33,6 +33,7 @@
    80.4  !	kexec_image			kexec.h
    80.5  !	kexec_range			kexec.h
    80.6  !	add_to_physmap			memory.h
    80.7 +!	remove_from_physmap		memory.h
    80.8  !	foreign_memory_map		memory.h
    80.9  !	memory_exchange			memory.h
   80.10  !	memory_map			memory.h
    81.1 --- a/xen/include/xsm/xsm.h	Tue Sep 02 16:34:53 2008 -0700
    81.2 +++ b/xen/include/xsm/xsm.h	Tue Sep 02 16:55:55 2008 -0700
    81.3 @@ -136,6 +136,7 @@ struct xsm_operations {
    81.4      int (*mmu_machphys_update) (struct domain *d, unsigned long mfn);
    81.5      int (*update_va_mapping) (struct domain *d, l1_pgentry_t pte);
    81.6      int (*add_to_physmap) (struct domain *d1, struct domain *d2);
    81.7 +    int (*remove_from_physmap) (struct domain *d1, struct domain *d2);
    81.8  #endif
    81.9  };
   81.10  
   81.11 @@ -532,6 +533,11 @@ static inline int xsm_add_to_physmap(str
   81.12  {
   81.13      return xsm_call(add_to_physmap(d1, d2));
   81.14  }
   81.15 +
   81.16 +static inline int xsm_remove_from_physmap(struct domain *d1, struct domain *d2)
   81.17 +{
   81.18 +    return xsm_call(remove_from_physmap(d1, d2));
   81.19 +}
   81.20  #endif /* CONFIG_X86 */
   81.21  
   81.22  #endif /* __XSM_H */
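
The hook mirrors add_to_physmap, so the XENMEM_remove_from_physmap handler can consult the security module before altering the target physmap. A sketch of the expected call-site shape (hypothetical; the real handler sits in the arch memory_op code):

    /* Hypothetical call-site sketch -- not the actual arch_memory_op code. */
    static long do_remove_from_physmap(struct domain *caller,
                                       struct domain *target, xen_pfn_t gpfn)
    {
        long rc = xsm_remove_from_physmap(caller, target);

        if ( rc )
            return rc;          /* security module vetoed the operation */

        /* ... look up the page at gpfn and drop the p2m mapping ... */
        return 0;
    }
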
    82.1 --- a/xen/xsm/dummy.c	Tue Sep 02 16:34:53 2008 -0700
    82.2 +++ b/xen/xsm/dummy.c	Tue Sep 02 16:55:55 2008 -0700
    82.3 @@ -385,6 +385,11 @@ static int dummy_add_to_physmap (struct 
    82.4  {
    82.5      return 0;
    82.6  }
    82.7 +
    82.8 +static int dummy_remove_from_physmap (struct domain *d1, struct domain *d2)
    82.9 +{
   82.10 +    return 0;
   82.11 +}
   82.12  #endif
   82.13  
   82.14  struct xsm_operations dummy_xsm_ops;
   82.15 @@ -484,5 +489,6 @@ void xsm_fixup_ops (struct xsm_operation
   82.16      set_to_dummy_if_null(ops, mmu_machphys_update);
   82.17      set_to_dummy_if_null(ops, update_va_mapping);
   82.18      set_to_dummy_if_null(ops, add_to_physmap);
   82.19 +    set_to_dummy_if_null(ops, remove_from_physmap);
   82.20  #endif
   82.21  }
    83.1 --- a/xen/xsm/flask/hooks.c	Tue Sep 02 16:34:53 2008 -0700
    83.2 +++ b/xen/xsm/flask/hooks.c	Tue Sep 02 16:55:55 2008 -0700
    83.3 @@ -1028,6 +1028,11 @@ static int flask_add_to_physmap(struct d
    83.4  {
    83.5      return domain_has_perm(d1, d2, SECCLASS_MMU, MMU__PHYSMAP);
    83.6  }
    83.7 +
    83.8 +static int flask_remove_from_physmap(struct domain *d1, struct domain *d2)
    83.9 +{
   83.10 +    return domain_has_perm(d1, d2, SECCLASS_MMU, MMU__PHYSMAP);
   83.11 +}
   83.12  #endif
   83.13  
   83.14  long do_flask_op(XEN_GUEST_HANDLE(xsm_op_t) u_flask_op);
   83.15 @@ -1115,6 +1120,7 @@ static struct xsm_operations flask_ops =
   83.16      .mmu_machphys_update = flask_mmu_machphys_update,
   83.17      .update_va_mapping = flask_update_va_mapping,
   83.18      .add_to_physmap = flask_add_to_physmap,
   83.19 +    .remove_from_physmap = flask_remove_from_physmap,
   83.20  #endif
   83.21  };
   83.22