xenbits.xen.org Git - xenclient/xen.git/commitdiff
Import xen-unstable, changeset 19433: d5ddc782bc49
author: Jean Guyader <jean.guyader@eu.citrix.com>
Thu, 2 Apr 2009 13:23:44 +0000 (14:23 +0100)
committer: Jean Guyader <jean.guyader@eu.citrix.com>
Thu, 2 Apr 2009 13:23:44 +0000 (14:23 +0100)
657 files changed:
Config.mk
Makefile
buildconfigs/enable-xen-config
buildconfigs/mk.linux-2.6-common
buildconfigs/mk.linux-2.6-pvops [new file with mode: 0644]
buildconfigs/src.tarball
config/NetBSD.mk
config/StdGNU.mk
docs/check_pkgs
docs/man/xm.pod.1
docs/misc/dump-core-format.txt
docs/misc/vtd.txt
docs/misc/xen-error-handling.txt [new file with mode: 0644]
docs/misc/xsm-flask.txt [new file with mode: 0644]
docs/src/user.tex
docs/xen-api/coversheet.tex
docs/xen-api/revision-history.tex
docs/xen-api/xenapi-coversheet.tex
docs/xen-api/xenapi-datamodel-graph.dot
docs/xen-api/xenapi-datamodel.tex
extras/mini-os/Makefile
extras/mini-os/arch/ia64/arch.mk
extras/mini-os/arch/ia64/common.c
extras/mini-os/arch/ia64/debug.c
extras/mini-os/arch/ia64/efi.c
extras/mini-os/arch/ia64/fw.S
extras/mini-os/arch/ia64/ia64.S
extras/mini-os/arch/ia64/ivt.S
extras/mini-os/arch/ia64/minios-ia64.lds
extras/mini-os/arch/ia64/mm.c
extras/mini-os/arch/ia64/sal.c
extras/mini-os/arch/ia64/time.c
extras/mini-os/arch/ia64/xencomm.c
extras/mini-os/arch/x86/ioremap.c [new file with mode: 0644]
extras/mini-os/arch/x86/mm.c
extras/mini-os/arch/x86/setup.c
extras/mini-os/events.c
extras/mini-os/fs-front.c
extras/mini-os/include/ia64/atomic.h
extras/mini-os/include/ia64/hypercall-ia64.h
extras/mini-os/include/ia64/ia64_cpu.h
extras/mini-os/include/ia64/os.h
extras/mini-os/include/ioremap.h [new file with mode: 0644]
extras/mini-os/include/mm.h
extras/mini-os/include/posix/net/if.h [new file with mode: 0644]
extras/mini-os/include/sched.h
extras/mini-os/include/wait.h
extras/mini-os/include/x86/arch_mm.h
extras/mini-os/include/xenbus.h
extras/mini-os/kernel.c
extras/mini-os/lib/sys.c
extras/mini-os/minios.mk
extras/mini-os/xenbus/xenbus.c
stubdom/Makefile
stubdom/README
stubdom/grub/mini-os.c
stubdom/stubdom-dm
tools/Makefile
tools/Rules.mk
tools/blktap/drivers/Makefile
tools/blktap/drivers/blktapctrl.c
tools/blktap/drivers/block-qcow.c
tools/blktap/drivers/block-qcow2.c
tools/blktap/drivers/qcow-create.c
tools/blktap/drivers/tapdisk.h
tools/blktap/lib/blktaplib.h
tools/blktap/lib/xenbus.c
tools/console/Makefile
tools/console/client/main.c
tools/console/daemon/io.c
tools/console/daemon/main.c
tools/debugger/gdb/gdb-6.2.1-xen-sparse/gdb/gdbserver/Makefile.in
tools/examples/Makefile
tools/examples/xend-config.sxp
tools/examples/xmexample.hvm
tools/examples/xmexample.hvm-dm
tools/examples/xmexample.pv-grub
tools/examples/xmexample.vti
tools/examples/xmexample1
tools/examples/xmexample2
tools/examples/xmexample3
tools/firmware/Makefile
tools/firmware/Rules.mk
tools/firmware/hvmloader/32bitbios_support.c
tools/firmware/hvmloader/Makefile
tools/firmware/hvmloader/acpi/Makefile
tools/firmware/hvmloader/acpi/build.c
tools/firmware/hvmloader/acpi/dsdt.asl
tools/firmware/hvmloader/acpi/dsdt.c
tools/firmware/hvmloader/acpi/static_tables.c
tools/firmware/hvmloader/cacheattr.c
tools/firmware/hvmloader/config.h
tools/firmware/hvmloader/e820.h
tools/firmware/hvmloader/hvmloader.c
tools/firmware/hvmloader/mp_tables.c
tools/firmware/hvmloader/smbios.c
tools/firmware/hvmloader/smp.c
tools/firmware/hvmloader/util.c
tools/firmware/hvmloader/util.h
tools/firmware/rombios/32bit/32bitbios.c
tools/firmware/rombios/32bit/Makefile
tools/firmware/rombios/32bit/pmm.c [new file with mode: 0644]
tools/firmware/rombios/32bit/rombios_compat.h
tools/firmware/rombios/32bit/tcgbios/Makefile
tools/firmware/rombios/32bit/tcgbios/tcgbios.c
tools/firmware/rombios/32bitgateway.c
tools/firmware/rombios/32bitprotos.h
tools/firmware/rombios/Makefile
tools/firmware/rombios/rombios.c
tools/firmware/rombios/rombios.h [new file with mode: 0644]
tools/firmware/rombios/tcgbios.c
tools/firmware/vgabios/vbe.c
tools/firmware/vgabios/vbe.h
tools/firmware/vgabios/vbetables-gen.c
tools/firmware/vgabios/vgabios.c
tools/flask/libflask/Makefile
tools/flask/loadpolicy/Makefile
tools/flask/policy/policy/modules/xen/xen.te
tools/fs-back/Makefile
tools/fs-back/fs-backend.c
tools/fs-back/fs-backend.h
tools/fs-back/fs-debug.h [new file with mode: 0644]
tools/fs-back/fs-ops.c
tools/fs-back/fs-xenbus.c
tools/fs-back/sys-queue.h [new file with mode: 0644]
tools/hotplug/Linux/Makefile [new file with mode: 0644]
tools/hotplug/Linux/block [new file with mode: 0644]
tools/hotplug/Linux/block-common.sh [new file with mode: 0644]
tools/hotplug/Linux/block-enbd [new file with mode: 0644]
tools/hotplug/Linux/block-nbd [new file with mode: 0644]
tools/hotplug/Linux/external-device-migrate [new file with mode: 0644]
tools/hotplug/Linux/init.d/sysconfig.xendomains [new file with mode: 0644]
tools/hotplug/Linux/init.d/xend [new file with mode: 0755]
tools/hotplug/Linux/init.d/xendomains [new file with mode: 0644]
tools/hotplug/Linux/locking.sh [new file with mode: 0644]
tools/hotplug/Linux/logging.sh [new file with mode: 0644]
tools/hotplug/Linux/network-bridge [new file with mode: 0644]
tools/hotplug/Linux/network-nat [new file with mode: 0644]
tools/hotplug/Linux/network-route [new file with mode: 0644]
tools/hotplug/Linux/vif-bridge [new file with mode: 0644]
tools/hotplug/Linux/vif-common.sh [new file with mode: 0644]
tools/hotplug/Linux/vif-nat [new file with mode: 0644]
tools/hotplug/Linux/vif-route [new file with mode: 0644]
tools/hotplug/Linux/vscsi [new file with mode: 0644]
tools/hotplug/Linux/vtpm [new file with mode: 0644]
tools/hotplug/Linux/vtpm-common.sh [new file with mode: 0644]
tools/hotplug/Linux/vtpm-delete [new file with mode: 0644]
tools/hotplug/Linux/vtpm-hotplug-common.sh [new file with mode: 0644]
tools/hotplug/Linux/vtpm-impl [new file with mode: 0644]
tools/hotplug/Linux/vtpm-migration.sh [new file with mode: 0644]
tools/hotplug/Linux/xen-backend.agent [new file with mode: 0644]
tools/hotplug/Linux/xen-backend.rules [new file with mode: 0644]
tools/hotplug/Linux/xen-hotplug-cleanup [new file with mode: 0644]
tools/hotplug/Linux/xen-hotplug-common.sh [new file with mode: 0644]
tools/hotplug/Linux/xen-network-common.sh [new file with mode: 0644]
tools/hotplug/Linux/xen-script-common.sh [new file with mode: 0644]
tools/hotplug/Linux/xend.rules [new file with mode: 0644]
tools/hotplug/Makefile [new file with mode: 0644]
tools/hotplug/NetBSD/Makefile [new file with mode: 0644]
tools/hotplug/NetBSD/block-nbsd [new file with mode: 0644]
tools/hotplug/NetBSD/qemu-ifup-nbsd [new file with mode: 0644]
tools/hotplug/NetBSD/vif-bridge-nbsd [new file with mode: 0644]
tools/hotplug/NetBSD/vif-ip-nbsd [new file with mode: 0644]
tools/hotplug/common/Makefile [new file with mode: 0644]
tools/include/Makefile
tools/include/xen-foreign/reference.size
tools/libaio/src/Makefile
tools/libfsimage/Rules.mk
tools/libfsimage/common/Makefile
tools/libfsimage/zfs/fsys_zfs.c
tools/libfsimage/zfs/zfs-include/zfs.h
tools/libxc/Makefile
tools/libxc/ia64/xc_ia64_hvm_build.c
tools/libxc/ia64/xc_ia64_linux_restore.c
tools/libxc/xc_core.c
tools/libxc/xc_core.h
tools/libxc/xc_core_ia64.c
tools/libxc/xc_core_x86.c
tools/libxc/xc_core_x86.h
tools/libxc/xc_cpufeature.h
tools/libxc/xc_cpuid_x86.c
tools/libxc/xc_dom.h
tools/libxc/xc_dom_core.c
tools/libxc/xc_dom_x86.c
tools/libxc/xc_domain.c
tools/libxc/xc_domain_restore.c
tools/libxc/xc_domain_save.c
tools/libxc/xc_elf.h
tools/libxc/xc_hvm_build.c
tools/libxc/xc_pagetab.c
tools/libxc/xc_pm.c
tools/libxc/xc_private.c
tools/libxc/xc_private.h
tools/libxc/xc_ptrace.c
tools/libxc/xc_ptrace_core.c
tools/libxc/xc_solaris.c
tools/libxc/xc_suspend.c [new file with mode: 0644]
tools/libxc/xenctrl.h
tools/libxc/xenguest.h
tools/libxc/xg_private.c
tools/libxen/src/xen_common.c
tools/misc/Makefile
tools/misc/xen-detect.c
tools/misc/xenpm.c
tools/pygrub/Makefile
tools/pygrub/src/pygrub
tools/python/Makefile
tools/python/setup.py
tools/python/xen/lowlevel/flask/flask.c
tools/python/xen/lowlevel/process/process.c [new file with mode: 0644]
tools/python/xen/lowlevel/xc/xc.c
tools/python/xen/lowlevel/xs/xs.c
tools/python/xen/util/blkif.py
tools/python/xen/util/diagnose.py
tools/python/xen/util/oshelp.py
tools/python/xen/util/pci.py
tools/python/xen/util/rwlock.py [new file with mode: 0644]
tools/python/xen/util/vscsi_util.py
tools/python/xen/web/SrvDir.py
tools/python/xen/web/connection.py
tools/python/xen/web/unix.py
tools/python/xen/xend/XendAPI.py
tools/python/xen/xend/XendAPIStore.py
tools/python/xen/xend/XendBootloader.py
tools/python/xen/xend/XendCheckpoint.py
tools/python/xen/xend/XendConfig.py
tools/python/xen/xend/XendConstants.py
tools/python/xen/xend/XendDPCI.py
tools/python/xen/xend/XendDomain.py
tools/python/xen/xend/XendDomainInfo.py
tools/python/xen/xend/XendNode.py
tools/python/xen/xend/XendOptions.py
tools/python/xen/xend/XendPPCI.py
tools/python/xen/xend/balloon.py
tools/python/xen/xend/image.py
tools/python/xen/xend/osdep.py
tools/python/xen/xend/server/BlktapController.py
tools/python/xen/xend/server/DevConstants.py [new file with mode: 0644]
tools/python/xen/xend/server/DevController.py
tools/python/xen/xend/server/SrvDaemon.py
tools/python/xen/xend/server/SrvDomain.py
tools/python/xen/xend/server/blkif.py
tools/python/xen/xend/server/iopif.py
tools/python/xen/xend/server/irqif.py
tools/python/xen/xend/server/netif.py
tools/python/xen/xend/server/pciif.py
tools/python/xen/xend/server/pciquirk.py
tools/python/xen/xend/server/relocate.py
tools/python/xen/xend/server/udevevent.py [new file with mode: 0644]
tools/python/xen/xend/server/vscsiif.py
tools/python/xen/xm/addlabel.py
tools/python/xen/xm/create.dtd
tools/python/xen/xm/create.py
tools/python/xen/xm/main.py
tools/python/xen/xm/xenapi_create.py
tools/tests/blowfish.mk
tools/vnet/Makefile
tools/vnet/libxutil/Makefile
tools/vtpm/Makefile
tools/vtpm/Rules.mk
tools/vtpm_manager/Rules.mk
tools/xcutils/Makefile
tools/xcutils/readnotes.c
tools/xcutils/xc_save.c
tools/xenmon/Makefile
tools/xenpmd/Makefile [new file with mode: 0644]
tools/xenpmd/xenpmd.c [new file with mode: 0644]
tools/xenstat/libxenstat/Makefile
tools/xenstat/libxenstat/src/xenstat_linux.c
tools/xenstat/xentop/Makefile
tools/xenstat/xentop/xentop.c
tools/xenstore/Makefile
tools/xenstore/xenstored_core.c
tools/xenstore/xs.c
tools/xenstore/xs.h
tools/xentrace/Makefile
tools/xentrace/formats
tools/xentrace/xenctx.c
tools/xentrace/xentrace_format
tools/xm-test/lib/XmTestLib/NetConfig.py
unmodified_drivers/linux-2.6/Makefile
unmodified_drivers/linux-2.6/balloon/Kbuild
unmodified_drivers/linux-2.6/compat-include/linux/scatterlist.h [new file with mode: 0644]
unmodified_drivers/linux-2.6/compat-include/xen/platform-compat.h
unmodified_drivers/linux-2.6/mkbuildtree
unmodified_drivers/linux-2.6/overrides.mk
unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c
unmodified_drivers/linux-2.6/platform-pci/platform-compat.c
unmodified_drivers/linux-2.6/scsifront/Kbuild [new file with mode: 0644]
unmodified_drivers/linux-2.6/scsifront/Makefile [new file with mode: 0644]
xen/Makefile
xen/Rules.mk
xen/arch/ia64/Makefile
xen/arch/ia64/Rules.mk
xen/arch/ia64/linux-xen/acpi.c
xen/arch/ia64/linux-xen/head.S
xen/arch/ia64/linux-xen/iosapic.c
xen/arch/ia64/linux-xen/irq_ia64.c
xen/arch/ia64/linux-xen/mca.c
xen/arch/ia64/linux-xen/mm_contig.c
xen/arch/ia64/linux-xen/smp.c
xen/arch/ia64/linux-xen/smpboot.c
xen/arch/ia64/linux-xen/sn/kernel/irq.c
xen/arch/ia64/vmx/sioemu.c
xen/arch/ia64/vmx/viosapic.c
xen/arch/ia64/vmx/vmmu.c
xen/arch/ia64/vmx/vmx_fault.c
xen/arch/ia64/vmx/vmx_init.c
xen/arch/ia64/vmx/vmx_interrupt.c
xen/arch/ia64/vmx/vmx_ivt.S
xen/arch/ia64/vmx/vtlb.c
xen/arch/ia64/xen/cpufreq/cpufreq.c
xen/arch/ia64/xen/dom0_ops.c
xen/arch/ia64/xen/dom_fw_asm.S
xen/arch/ia64/xen/dom_fw_common.c
xen/arch/ia64/xen/domain.c
xen/arch/ia64/xen/faults.c
xen/arch/ia64/xen/fw_emul.c
xen/arch/ia64/xen/hypercall.c
xen/arch/ia64/xen/irq.c
xen/arch/ia64/xen/ivt.S
xen/arch/ia64/xen/machine_kexec.c
xen/arch/ia64/xen/mm.c
xen/arch/ia64/xen/platform_hypercall.c
xen/arch/ia64/xen/regionreg.c
xen/arch/ia64/xen/tlb_track.c
xen/arch/ia64/xen/vcpu.c
xen/arch/ia64/xen/xenmem.c
xen/arch/ia64/xen/xensetup.c
xen/arch/x86/Makefile
xen/arch/x86/Rules.mk
xen/arch/x86/acpi/boot.c
xen/arch/x86/acpi/cpu_idle.c
xen/arch/x86/acpi/cpufreq/cpufreq.c
xen/arch/x86/acpi/cpufreq/powernow.c
xen/arch/x86/acpi/cpuidle_menu.c
xen/arch/x86/acpi/power.c
xen/arch/x86/acpi/suspend.c
xen/arch/x86/apic.c
xen/arch/x86/boot/Makefile
xen/arch/x86/boot/build32.mk [new file with mode: 0644]
xen/arch/x86/boot/head.S
xen/arch/x86/boot/mkelf32.c
xen/arch/x86/boot/reloc.c [new file with mode: 0644]
xen/arch/x86/boot/wakeup.S
xen/arch/x86/bzimage.c [new file with mode: 0644]
xen/arch/x86/copy_page.S [new file with mode: 0644]
xen/arch/x86/cpu/amd.c
xen/arch/x86/cpu/common.c
xen/arch/x86/cpu/intel.c
xen/arch/x86/cpu/mcheck/Makefile
xen/arch/x86/cpu/mcheck/amd_f10.c
xen/arch/x86/cpu/mcheck/amd_k8.c
xen/arch/x86/cpu/mcheck/amd_nonfatal.c
xen/arch/x86/cpu/mcheck/k7.c
xen/arch/x86/cpu/mcheck/mce.c
xen/arch/x86/cpu/mcheck/mce.h
xen/arch/x86/cpu/mcheck/mce_intel.c [new file with mode: 0644]
xen/arch/x86/cpu/mcheck/mctelem.c [new file with mode: 0644]
xen/arch/x86/cpu/mcheck/mctelem.h [new file with mode: 0644]
xen/arch/x86/cpu/mcheck/non-fatal.c
xen/arch/x86/cpu/mcheck/p5.c
xen/arch/x86/cpu/mcheck/winchip.c
xen/arch/x86/cpu/mcheck/x86_mca.h
xen/arch/x86/domain.c
xen/arch/x86/domain_build.c
xen/arch/x86/domctl.c
xen/arch/x86/e820.c
xen/arch/x86/hpet.c
xen/arch/x86/hvm/emulate.c
xen/arch/x86/hvm/hpet.c
xen/arch/x86/hvm/hvm.c
xen/arch/x86/hvm/i8254.c
xen/arch/x86/hvm/intercept.c
xen/arch/x86/hvm/mtrr.c
xen/arch/x86/hvm/rtc.c
xen/arch/x86/hvm/svm/emulate.c
xen/arch/x86/hvm/svm/entry.S
xen/arch/x86/hvm/svm/intr.c
xen/arch/x86/hvm/svm/svm.c
xen/arch/x86/hvm/svm/vmcb.c
xen/arch/x86/hvm/vioapic.c
xen/arch/x86/hvm/viridian.c
xen/arch/x86/hvm/vlapic.c
xen/arch/x86/hvm/vmsi.c
xen/arch/x86/hvm/vmx/entry.S
xen/arch/x86/hvm/vmx/intr.c
xen/arch/x86/hvm/vmx/realmode.c
xen/arch/x86/hvm/vmx/vmcs.c
xen/arch/x86/hvm/vmx/vmx.c
xen/arch/x86/hvm/vmx/vpmu_core2.c
xen/arch/x86/hvm/vpic.c
xen/arch/x86/hvm/vpt.c
xen/arch/x86/i8259.c
xen/arch/x86/io_apic.c
xen/arch/x86/irq.c
xen/arch/x86/machine_kexec.c
xen/arch/x86/microcode.c
xen/arch/x86/microcode_amd.c
xen/arch/x86/microcode_intel.c
xen/arch/x86/mm.c
xen/arch/x86/mm/Makefile
xen/arch/x86/mm/guest_walk.c [new file with mode: 0644]
xen/arch/x86/mm/hap/Makefile
xen/arch/x86/mm/hap/guest_walk.c
xen/arch/x86/mm/hap/hap.c
xen/arch/x86/mm/hap/p2m-ept.c
xen/arch/x86/mm/hap/private.h
xen/arch/x86/mm/p2m.c
xen/arch/x86/mm/paging.c
xen/arch/x86/mm/shadow/Makefile
xen/arch/x86/mm/shadow/common.c
xen/arch/x86/mm/shadow/multi.c
xen/arch/x86/mm/shadow/private.h
xen/arch/x86/mm/shadow/types.h
xen/arch/x86/msi.c
xen/arch/x86/nmi.c
xen/arch/x86/numa.c
xen/arch/x86/oprofile/nmi_int.c
xen/arch/x86/oprofile/op_model_p4.c
xen/arch/x86/oprofile/op_model_ppro.c
xen/arch/x86/oprofile/op_x86_model.h
xen/arch/x86/physdev.c
xen/arch/x86/platform_hypercall.c
xen/arch/x86/setup.c
xen/arch/x86/shutdown.c
xen/arch/x86/smpboot.c
xen/arch/x86/sysctl.c
xen/arch/x86/tboot.c
xen/arch/x86/time.c
xen/arch/x86/traps.c
xen/arch/x86/x86_32/asm-offsets.c
xen/arch/x86/x86_32/domain_page.c
xen/arch/x86/x86_32/entry.S
xen/arch/x86/x86_32/machine_kexec.c
xen/arch/x86/x86_32/mm.c
xen/arch/x86/x86_32/xen.lds.S
xen/arch/x86/x86_64/Makefile
xen/arch/x86/x86_64/asm-offsets.c
xen/arch/x86/x86_64/compat/entry.S
xen/arch/x86/x86_64/compat/mm.c
xen/arch/x86/x86_64/cpu_idle.c
xen/arch/x86/x86_64/cpufreq.c [new file with mode: 0644]
xen/arch/x86/x86_64/entry.S
xen/arch/x86/x86_64/machine_kexec.c
xen/arch/x86/x86_64/mm.c
xen/arch/x86/x86_64/physdev.c
xen/arch/x86/x86_64/platform_hypercall.c
xen/arch/x86/x86_64/traps.c
xen/arch/x86/x86_64/xen.lds.S
xen/arch/x86/x86_emulate/x86_emulate.c
xen/arch/x86/x86_emulate/x86_emulate.h
xen/common/Makefile
xen/common/compat/Makefile
xen/common/compat/memory.c
xen/common/domain.c
xen/common/domctl.c
xen/common/event_channel.c
xen/common/grant_table.c
xen/common/hvm/save.c
xen/common/inflate.c [new file with mode: 0644]
xen/common/kernel.c
xen/common/kexec.c
xen/common/keyhandler.c
xen/common/libelf/libelf-dominfo.c
xen/common/libelf/libelf-private.h
xen/common/memory.c
xen/common/page_alloc.c
xen/common/sched_credit.c
xen/common/schedule.c
xen/common/spinlock.c [new file with mode: 0644]
xen/common/sysctl.c
xen/common/timer.c
xen/common/trace.c
xen/common/xencomm.c
xen/common/xenoprof.c
xen/common/xmalloc_tlsf.c
xen/crypto/Makefile [new file with mode: 0644]
xen/crypto/rijndael.c [new file with mode: 0644]
xen/crypto/vmac.c [new file with mode: 0644]
xen/drivers/Makefile
xen/drivers/acpi/pmstat.c
xen/drivers/acpi/reboot.c
xen/drivers/char/console.c
xen/drivers/char/ns16550.c
xen/drivers/char/serial.c
xen/drivers/cpufreq/Makefile
xen/drivers/cpufreq/cpufreq.c
xen/drivers/cpufreq/cpufreq_misc_governors.c [new file with mode: 0644]
xen/drivers/cpufreq/cpufreq_ondemand.c
xen/drivers/cpufreq/utility.c
xen/drivers/passthrough/Makefile
xen/drivers/passthrough/amd/iommu_init.c
xen/drivers/passthrough/amd/iommu_intr.c
xen/drivers/passthrough/amd/iommu_map.c
xen/drivers/passthrough/amd/pci_amd_iommu.c
xen/drivers/passthrough/io.c
xen/drivers/passthrough/iommu.c
xen/drivers/passthrough/pci.c
xen/drivers/passthrough/vtd/Makefile
xen/drivers/passthrough/vtd/dmar.c
xen/drivers/passthrough/vtd/dmar.h
xen/drivers/passthrough/vtd/extern.h
xen/drivers/passthrough/vtd/ia64/Makefile [new file with mode: 0644]
xen/drivers/passthrough/vtd/ia64/vtd.c [new file with mode: 0644]
xen/drivers/passthrough/vtd/intremap.c
xen/drivers/passthrough/vtd/iommu.c
xen/drivers/passthrough/vtd/iommu.h
xen/drivers/passthrough/vtd/qinval.c
xen/drivers/passthrough/vtd/utils.c
xen/drivers/passthrough/vtd/vtd.h
xen/drivers/passthrough/vtd/x86/vtd.c
xen/drivers/video/vesa.c
xen/drivers/video/vga.c
xen/include/Makefile
xen/include/acpi/cpufreq/cpufreq.h
xen/include/acpi/cpufreq/processor_perf.h
xen/include/asm-ia64/bug.h
xen/include/asm-ia64/bundle.h
xen/include/asm-ia64/config.h
xen/include/asm-ia64/dom_fw.h
xen/include/asm-ia64/dom_fw_common.h
xen/include/asm-ia64/domain.h
xen/include/asm-ia64/hardirq.h
xen/include/asm-ia64/hvm/iommu.h [new file with mode: 0644]
xen/include/asm-ia64/hvm/irq.h [new file with mode: 0644]
xen/include/asm-ia64/iocap.h
xen/include/asm-ia64/linux-xen/asm/README.origin
xen/include/asm-ia64/linux-xen/asm/acpi.h
xen/include/asm-ia64/linux-xen/asm/hw_irq.h [new file with mode: 0644]
xen/include/asm-ia64/linux-xen/asm/iosapic.h
xen/include/asm-ia64/linux-xen/asm/page.h
xen/include/asm-ia64/linux-xen/asm/pgtable.h
xen/include/asm-ia64/linux-xen/asm/processor.h
xen/include/asm-ia64/linux-xen/asm/smp.h
xen/include/asm-ia64/linux-xen/asm/spinlock.h
xen/include/asm-ia64/linux-xen/linux/efi.h
xen/include/asm-ia64/linux-xen/linux/interrupt.h
xen/include/asm-ia64/linux-xen/linux/linux-pci.h
xen/include/asm-ia64/linux/asm/README.origin
xen/include/asm-ia64/linux/asm/irq.h
xen/include/asm-ia64/linux/pci_regs.h
xen/include/asm-ia64/mm.h
xen/include/asm-ia64/msi.h [new file with mode: 0644]
xen/include/asm-ia64/tlb_track.h
xen/include/asm-ia64/tlbflush.h
xen/include/asm-ia64/viosapic.h
xen/include/asm-ia64/vmx.h
xen/include/asm-ia64/vmx_platform.h
xen/include/asm-ia64/xenpage.h
xen/include/asm-x86/acpi.h
xen/include/asm-x86/apicdef.h
xen/include/asm-x86/config.h
xen/include/asm-x86/cpufeature.h
xen/include/asm-x86/desc.h
xen/include/asm-x86/domain.h
xen/include/asm-x86/e820.h
xen/include/asm-x86/event.h
xen/include/asm-x86/fixmap.h
xen/include/asm-x86/guest_pt.h [new file with mode: 0644]
xen/include/asm-x86/hpet.h
xen/include/asm-x86/hvm/domain.h
xen/include/asm-x86/hvm/hvm.h
xen/include/asm-x86/hvm/irq.h
xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
xen/include/asm-x86/hvm/trace.h
xen/include/asm-x86/hvm/vcpu.h
xen/include/asm-x86/hvm/vlapic.h
xen/include/asm-x86/hvm/vmx/vmcs.h
xen/include/asm-x86/hvm/vmx/vmx.h
xen/include/asm-x86/hvm/vmx/vpmu.h
xen/include/asm-x86/hvm/vmx/vpmu_core2.h
xen/include/asm-x86/hvm/vpt.h
xen/include/asm-x86/io_apic.h
xen/include/asm-x86/iocap.h
xen/include/asm-x86/irq.h
xen/include/asm-x86/mach-default/irq_vectors.h
xen/include/asm-x86/microcode.h
xen/include/asm-x86/mm.h
xen/include/asm-x86/msi.h
xen/include/asm-x86/msr-index.h
xen/include/asm-x86/mtrr.h
xen/include/asm-x86/p2m.h
xen/include/asm-x86/page.h
xen/include/asm-x86/paging.h
xen/include/asm-x86/perfc.h
xen/include/asm-x86/perfc_defn.h
xen/include/asm-x86/processor.h
xen/include/asm-x86/smp.h
xen/include/asm-x86/softirq.h
xen/include/asm-x86/spinlock.h
xen/include/asm-x86/system.h
xen/include/asm-x86/tboot.h
xen/include/asm-x86/time.h
xen/include/asm-x86/traps.h
xen/include/asm-x86/x86_32/page.h
xen/include/asm-x86/x86_32/system.h
xen/include/asm-x86/x86_64/page.h
xen/include/asm-x86/x86_64/system.h
xen/include/asm-x86/xenoprof.h
xen/include/crypto/rijndael.h [new file with mode: 0644]
xen/include/crypto/vmac.h [new file with mode: 0644]
xen/include/public/arch-ia64.h
xen/include/public/arch-ia64/hvm/save.h
xen/include/public/arch-x86/hvm/save.h
xen/include/public/arch-x86/xen-mca.h
xen/include/public/domctl.h
xen/include/public/elfnote.h
xen/include/public/features.h
xen/include/public/grant_table.h
xen/include/public/hvm/hvm_info_table.h
xen/include/public/hvm/params.h
xen/include/public/io/fsif.h
xen/include/public/io/pciif.h
xen/include/public/io/usbif.h [new file with mode: 0644]
xen/include/public/io/vscsiif.h [new file with mode: 0644]
xen/include/public/kexec.h
xen/include/public/memory.h
xen/include/public/physdev.h
xen/include/public/sysctl.h
xen/include/public/trace.h
xen/include/public/xen.h
xen/include/xen/acpi.h
xen/include/xen/compat.h
xen/include/xen/cpuidle.h
xen/include/xen/cpumask.h
xen/include/xen/domain.h
xen/include/xen/domain_page.h
xen/include/xen/elf.h
xen/include/xen/elfstructs.h [new file with mode: 0644]
xen/include/xen/event.h
xen/include/xen/grant_table.h
xen/include/xen/hvm/iommu.h
xen/include/xen/hvm/irq.h [new file with mode: 0644]
xen/include/xen/hvm/save.h
xen/include/xen/hypercall.h
xen/include/xen/iocap.h
xen/include/xen/iommu.h
xen/include/xen/irq.h
xen/include/xen/kexec.h
xen/include/xen/lib.h
xen/include/xen/libelf.h [new file with mode: 0644]
xen/include/xen/mm.h
xen/include/xen/pci.h
xen/include/xen/pci_regs.h
xen/include/xen/perfc_defn.h
xen/include/xen/sched.h
xen/include/xen/spinlock.h
xen/include/xen/time.h
xen/include/xen/timer.h
xen/include/xen/xenoprof.h
xen/include/xlat.lst
xen/include/xsm/xsm.h
xen/tools/symbols.c
xen/xsm/dummy.c
xen/xsm/flask/hooks.c
xen/xsm/flask/ss/policydb.c

index 92802280609c912729cb6246808c08ecac58a898..abdb1abe4a16f01125c64f4465b1cfc1d2c5e885 100644 (file)
--- a/Config.mk
+++ b/Config.mk
@@ -1,7 +1,7 @@
 # -*- mode: Makefile; -*-
 
-# A debug build of Xen and tools?
-debug ?= n
+# A debug build of Xen and tools? TEMPORARILY ENABLED
+debug ?= y
 
 XEN_COMPILE_ARCH    ?= $(shell uname -m | sed -e s/i.86/x86_32/ \
                          -e s/i86pc/x86_32/ -e s/amd64/x86_64/)
@@ -38,6 +38,15 @@ endif
 cc-option = $(shell if test -z "`$(1) $(2) -S -o /dev/null -xc \
               /dev/null 2>&1`"; then echo "$(2)"; else echo "$(3)"; fi ;)
 
+# cc-option-add: Add an option to compilation flags, but only if supported.
+# Usage: $(call cc-option-add CFLAGS,CC,-march=winchip-c6)
+cc-option-add = $(eval $(call cc-option-add-closure,$(1),$(2),$(3)))
+define cc-option-add-closure
+    ifneq ($$(call cc-option,$$($(2)),$(3),n),n)
+        $(1) += $(3)
+    endif
+endef
+
 # cc-ver: Check compiler is at least specified version. Return boolean 'y'/'n'.
 # Usage: ifeq ($(call cc-ver,$(CC),0x030400),y)
 cc-ver = $(shell if [ $$((`$(1) -dumpversion | awk -F. \
@@ -84,8 +93,8 @@ CFLAGS += -Wall -Wstrict-prototypes
 # result of any casted expression causes a warning.
 CFLAGS += -Wno-unused-value
 
-HOSTCFLAGS += $(call cc-option,$(HOSTCC),-Wdeclaration-after-statement,)
-CFLAGS     += $(call cc-option,$(CC),-Wdeclaration-after-statement,)
+$(call cc-option-add,HOSTCFLAGS,HOSTCC,-Wdeclaration-after-statement)
+$(call cc-option-add,CFLAGS,CC,-Wdeclaration-after-statement)
 
 LDFLAGS += $(foreach i, $(EXTRA_LIB), -L$(i)) 
 CFLAGS += $(foreach i, $(EXTRA_INCLUDES), -I$(i))
@@ -96,6 +105,11 @@ XSM_ENABLE ?= n
 FLASK_ENABLE ?= n
 ACM_SECURITY ?= n
 
+XEN_EXTFILES_URL=http://xenbits.xensource.com/xen-extfiles
+# All the files at that location were downloaded from elsewhere on
+# the internet.  The original download URL is preserved as a comment
+# near the place in the Xen Makefiles where the file is used.
+
 QEMU_REMOTE=http://xenbits.xensource.com/git-http/qemu-xen-unstable.git
 
 # Specify which qemu-dm to use. This may be `ioemu' to use the old
index 2b724bb0659ff6f1b6d197199dc8a5cab04b74c1..131ab3d1a36c03cbdc40a6d81c6583390a3cfaa7 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -203,6 +203,8 @@ uninstall:
        rm -rf $(D)/etc/hotplug/xen-backend.agent
        rm -f  $(D)/etc/udev/rules.d/xen-backend.rules
        rm -f  $(D)/etc/udev/xen-backend.rules
+       rm -f  $(D)/etc/udev/rules.d/xend.rules
+       rm -f  $(D)/etc/udev/xend.rules
        rm -f  $(D)/etc/sysconfig/xendomains
        rm -rf $(D)/var/run/xen* $(D)/var/lib/xen*
        rm -rf $(D)/boot/*xen*
@@ -240,7 +242,8 @@ linux26:
 #
 
 TBOOT_TARFILE = tboot-20080613.tar.gz
-TBOOT_BASE_URL = http://downloads.sourceforge.net/tboot
+#TBOOT_BASE_URL = http://downloads.sourceforge.net/tboot
+TBOOT_BASE_URL = $(XEN_EXTFILES_URL)
 
 .PHONY: build-tboot
 build-tboot: download_tboot
index 75f648eab6cfec1af80d3516ea93e570a5f92b30..751b582c866d3fb66ed1cdc483de940dad781ed8 100644 (file)
@@ -19,29 +19,70 @@ setopt()
 
        # Then append the new value
        case ${VALUE} in
-           y|m) echo "${OPTION}=${VALUE}" >> "${CONFIG}" ;;
-           n)   echo "# ${OPTION} is not set" >> "${CONFIG}" ;;
-           *)   echo "Invalid value ${VALUE} for ${OPTION}" 1>&2 ; exit 1 ;;
+           n)     echo "# ${OPTION} is not set" >> "${CONFIG}" ;;
+           y|m|*) echo "${OPTION}=${VALUE}" >> "${CONFIG}" ;;
        esac
 }
 
 setopt CONFIG_PARAVIRT y
+setopt CONFIG_PARAVIRT_DEBUG y
 setopt CONFIG_PARAVIRT_GUEST y
+
 setopt CONFIG_XEN y
+setopt CONFIG_XEN_BLKDEV_FRONTEND y
+setopt CONFIG_XEN_NETDEV_FRONTEND y
+setopt CONFIG_XEN_KBDDEV_FRONTEND y
+setopt CONFIG_XEN_FBDEV_FRONTEND y
+setopt CONFIG_XEN_BALLOON y
+setopt CONFIG_XEN_SCRUB_PAGES y
+setopt CONFIG_XEN_DEV_EVTCHN y
+setopt CONFIG_XEN_BACKEND y
+setopt CONFIG_XEN_BLKDEV_BACKEND y
+setopt CONFIG_XEN_NETDEV_BACKEND y
+setopt CONFIG_XENFS y
+setopt CONFIG_XEN_COMPAT_XENFS y
+setopt CONFIG_HVC_XEN y
+setopt CONFIG_XEN_MAX_DOMAIN_MEMORY 32
+setopt CONFIG_XEN_DEBUG_FS y
+setopt CONFIG_XEN_DOM0 y
+
 setopt CONFIG_VMI y
+
 setopt CONFIG_KVM y
 setopt CONFIG_KVM_INTEL y
 setopt CONFIG_KVM_AMD y
+setopt CONFIG_KVM_CLOCK y
+setopt CONFIG_KVM_GUEST n
+setopt CONFIG_KVM_TRACE n
+
 setopt CONFIG_LGUEST n
-setopt CONFIG_XEN_BLKDEV_FRONTEND y
-setopt CONFIG_XEN_NETDEV_FRONTEND y
-setopt CONFIG_HVC_XEN y
-setopt CONFIG_NUMA n
+
 setopt CONFIG_LOCALVERSION_AUTO n
 
+# Should all be set one way or another in defconfig but aren't
+setopt CONFIG_NUMA n
+setopt CONFIG_X86_VSMP n
+setopt CONFIG_X86_UV n
+setopt CONFIG_CALGARY_IOMMU n
+setopt CONFIG_AMD_IOMMU n
+setopt CONFIG_MAXSMP n
+setopt CONFIG_SPARSEMEM_VMEMMAP n
+setopt CONFIG_I7300_IDLE n
+setopt CONFIG_DMAR n
+setopt CONFIG_INTR_REMAP n
+setopt CONFIG_GFS2_FS n
+setopt CONFIG_IOMMU_DEBUG n
+
 case ${XEN_TARGET_ARCH} in
     x86_32) setopt CONFIG_64BIT n ;;
-    x86_64) setopt CONFIG_64BIT y ;;
+    x86_64)
+       setopt CONFIG_64BIT y
+       setopt CONFIG_IA32_EMULATION y
+       setopt CONFIG_IA32_AOUT n
+       setopt CONFIG_CRYPTO_AES_X86_64 n
+       setopt CONFIG_CRYPTO_SALSA20_X86_64 n
+       setopt CONFIG_CRYPTO_TWOFISH_X86_64 n
+       ;;
     *) ;;
 esac
 
index 6561e4e43987d3c6096c8b8c3e4c26700e289d40..6f203eff3a0ed9bf08d618441fe713e89154e98d 100644 (file)
@@ -100,10 +100,10 @@ ifneq ($(EXTRAVERSION),)
 endif
        $(__NONINT_CONFIG) $(MAKE) -C $(LINUX_SRCDIR) ARCH=$(LINUX_ARCH) oldconfig O=$$(/bin/pwd)/$(LINUX_DIR)
        @set -e ; if [ ! -f $(LINUX_DIR)/Makefile ] ; then \
-           echo "***********************************"; \
+           echo "==================================="; \
            echo "oldconfig did not create a Makefile"; \
            echo "Generating $(LINUX_DIR)/Makefile   "; \
-           echo "***********************************"; \
+           echo "==================================="; \
            ( echo "# Automatically generated: don't edit"; \
              echo ""; \
              echo "VERSION = 2"; \
diff --git a/buildconfigs/mk.linux-2.6-pvops b/buildconfigs/mk.linux-2.6-pvops
new file mode 100644 (file)
index 0000000..d3258f5
--- /dev/null
@@ -0,0 +1,14 @@
+XEN_LINUX_SOURCE ?= git-clone
+LINUX_VER ?= 2.6-pvops
+
+IMAGE_TARGET ?= bzImage
+
+XEN_LINUX_CONFIG_UPDATE := buildconfigs/enable-xen-config
+
+XEN_LINUX_GIT_URL ?= git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen.git
+XEN_LINUX_GIT_REMOTENAME ?= xen
+XEN_LINUX_GIT_REMOTEBRANCH ?= xen/dom0/hackery
+
+EXTRAVERSION ?=
+
+include buildconfigs/mk.linux-2.6-common
index c356e4ccf5af42d144b71c7697e7c64f49baa754..2101e1a1308311afcd847b7f2d49adc3f8d9e8d7 100644 (file)
@@ -10,7 +10,7 @@ vpath linux-%.tar.bz2 $(LINUX_SRC_PATH)
 # download a pristine Linux kernel tarball if there isn't one in LINUX_SRC_PATH
 linux-%.tar.bz2:
        @echo "Cannot find $@ in path $(LINUX_SRC_PATH)"
-       wget $(XEN_LINUX_MIRROR)/$@ -O./$@
+       false wget $(XEN_LINUX_MIRROR)/$@ -O./$@
 
 # XXX create a pristine tree for diff -Nurp convenience
 
index 7e645414213199fad88ab2e196d794aeb86a7b88..19bc25d56f0383a581db2abb189eaf685306c80e 100644 (file)
@@ -2,3 +2,7 @@ include $(XEN_ROOT)/config/StdGNU.mk
 
 # Override settings for this OS
 CURSES_LIBS = -lcurses
+
+LIBLEAFDIR_x86_64 = lib
+LIBEXEC = $(PREFIX)/libexec
+PRIVATE_BINDIR = $(BINDIR)
index aaa89a26643d0c1f758409d406c5c648daaafb34..d3d69bf944b6f501c8c8c81a21d4b6e4ecb176b8 100644 (file)
@@ -25,9 +25,12 @@ PREFIX ?= /usr
 BINDIR = $(PREFIX)/bin
 INCLUDEDIR = $(PREFIX)/include
 LIBLEAFDIR = lib
+LIBLEAFDIR_x86_32 = lib
 LIBLEAFDIR_x86_64 = lib64
 LIBDIR = $(PREFIX)/$(LIBLEAFDIR)
+LIBDIR_x86_32 = $(PREFIX)/$(LIBLEAFDIR_x86_32)
 LIBDIR_x86_64 = $(PREFIX)/$(LIBLEAFDIR_x86_64)
+LIBEXEC = $(LIBDIR_x86_32)/xen/bin
 MANDIR = $(PREFIX)/share/man
 MAN1DIR = $(MANDIR)/man1
 MAN8DIR = $(MANDIR)/man8
index a4835a4b9179d4f4ceb1c774745477c599c06804..68fb76db12d7b1212dc71cf33f670765eacb301f 100644 (file)
@@ -2,12 +2,12 @@
 silent_which ()
 {
         which $1 1>/dev/null 2>/dev/null || {
-                echo "*************************************************"
-                echo "*************************************************"
-                echo "* WARNING: Package '$1' is required"
-                echo "*          to build Xen documentation"
-                echo "*************************************************"
-                echo "*************************************************"
+                echo "================================================="
+                echo "================================================="
+                echo "= WARNING: Package '$1' is required"
+                echo "=          to build Xen documentation"
+                echo "================================================="
+                echo "================================================="
         }
         which $1 1>/dev/null 2>/dev/null
 }
index ad36fce6abf38b7c7d0ccbf82665c2dadce2df6d..bcb86f48ab306b4d9d08358f7eaf0988cabf7650 100644 (file)
@@ -67,6 +67,8 @@ The attached console will perform much like a standard serial console,
 so running curses based interfaces over the console B<is not
 advised>.  Vi tends to get very odd when using it over this interface.
 
+Use the key combination Ctrl+] to detach the domain console.
+
 =item B<create> I<configfile> [I<OPTIONS>] [I<vars>]..
 
 The create subcommand requires a config file and can optionally take a
index bc1842c7472104684d411939f5c7a87b0e48500f..49c51afbeb0718fab0be07f1abc13fa08438082e 100644 (file)
@@ -30,8 +30,13 @@ The elf header members are set as follows
         e_ident[EI_OSABI] = ELFOSABI_SYSV = 0
         e_type = ET_CORE = 4
 ELFCLASS64 is always used independent of architecture.
-e_ident[EI_DATA] and e_flags are set according to the dumping system's
-architecture. Other members are set as usual.
+e_ident[EI_DATA] is set as follows
+  For x86 PV domain case, it is set according to the guest configuration
+  (i.e. if guest is 32bit it is set to EM_386 even when the dom0 is 64 bit.)
+  For other domain case (x86 HVM domain case and ia64 domain case),
+  it is set according to the dumping system's architecture.
+e_flags is set according to the dumping system's architecture.
+Other members are set as usual.
 
 Sections
 --------
@@ -241,3 +246,7 @@ Currently only (major, minor) = (0, 1) is used.
   The format version isn't bumped because analysis tools can distinguish it.
 - .xen_ia64_mapped_regs section was made only for ia64 PV domain.
   In case of IA64 HVM domain, this section doesn't exist.
+- elf header e_ident[EI_DATA]
+  On x86 PV domain case, it is set according to the guest configuration.
+  I.e. 32-on-64 case, the file will be set EM_386 instead of EM_X86_64.
+  This is the same as 32-on-32 case, so there is no impact on analysis tools.
index bd6fd104ccdd79a85adf12d41c282bed7964ffcb..5f90b640c3109db634b990c3ac2d780de643843e 100644 (file)
@@ -26,7 +26,18 @@ title Xen-Fedora Core (2.6.18-xen)
         module /boot/vmlinuz-2.6.18.8-xen root=LABEL=/ ro xencons=ttyS console=tty0 console=ttyS0, pciback.hide=(01:00.0)(03:00.0)
         module /boot/initrd-2.6.18-xen.img
 
-12) reboot system
+    or use dynamic hiding via PCI backend sysfs interface:
+        a) check if the driver is bound to the device
+            ls -l /sys/bus/pci/devices/0000:01:00.0/driver
+            ... /sys/bus/pci/devices/0000:01:00.0/driver -> ../../../../bus/pci/drivers/igb
+        b) if yes, then unload the driver first
+            echo -n 0000:01:00.0 >/sys/bus/pci/drivers/igb/unbind
+        c) add the device to the PCI backend
+            echo -n 0000:01:00.0 >/sys/bus/pci/drivers/pciback/new_slot
+        d) let the PCI backend bind to the device
+            echo -n 0000:01:00.0 >/sys/bus/pci/drivers/pciback/bind
+
+12) reboot system (not required if you use the dynamic hiding method)
 13) add "pci" line in /etc/xen/hvm.conf for the assigned devices
         pci = [ '01:00.0', '03:00.0' ]
 15) start hvm guest and use "lspci" to see the passthru device and
@@ -38,6 +49,30 @@ Enable MSI/MSI-x for assigned devices
 Add "msi=1" option in kernel line of host grub.
 
 
+MSI-INTx translation for passthrough devices in HVM
+---------------------------------------------------
+
+If the assigned device uses a physical IRQ that is shared by more than
+one device among multiple domains, there may be significant impact on
+device performance. Unfortunately, this is quite a common case if the
+IO-APIC (INTx) IRQ is used. MSI can avoid this issue, but is only
+available if the guest enables it.
+
+With MSI-INTx translation turned on, Xen enables device MSI if it's
+available, regardless of whether the guest uses INTx or MSI. If the
+guest uses INTx IRQ, Xen will inject a translated INTx IRQ to guest's
+virtual ioapic whenever an MSI message is received. This reduces the
+interrupt sharing of the system. If the guest OS enables MSI or MSI-X,
+the translation is automatically turned off.
+
+To enable or disable MSI-INTx translation globally, add "pci_msitranslate"
+in the config file:
+       pci_msitranslate = 1         (default is 1)
+
+To override for a specific device:
+       pci = [ '01:00.0,msitranslate=0', '03:00.0' ]
+
+
 Caveat on Conventional PCI Device Passthrough
 ---------------------------------------------
 
@@ -80,6 +115,11 @@ VTd device hotplug:
 
        [root@vt-vtd ~]# xm pci-attach HVMDomainVtd 0:2:0.0 7
 
+    To specify options for the device, use -o or --options=. Following command would disable MSI-INTx translation for the device
+
+       [root@vt-vtd ~]# xm pci-attach -o msitranslate=0 0:2:0.0 7
+
+
 VTd hotplug usage model:
 ------------------------
 
@@ -131,3 +171,82 @@ driver's view are different. As a result, device can't access to the
 buffer specified by driver.
 
 Such devices assigned to HVM domain currently do not work.
+
+
+Using SR-IOV with VT-d
+--------------------------------
+
+The Single Root I/O Virtualization is a PCI Express feature supported by
+some devices such as Intel 82576 which allows you to create virtual PCI
+devices (Virtual Function) and assign them to the HVM guest.
+
+You can use latest lspci (v3.1 and above) to check if your PCIe device
+supports the SR-IOV capability or not.
+
+  $ lspci -s 01:00.0 -vvv
+
+  01:00.0 Ethernet controller: Intel Corporation 82576 Gigabit Network Connection (rev 01)
+        Subsystem: Intel Corporation Gigabit ET Dual Port Server Adapter
+
+        ...
+
+        Capabilities: [160] Single Root I/O Virtualization (SR-IOV)
+                IOVCap: Migration-, Interrupt Message Number: 000
+                IOVCtl: Enable+ Migration- Interrupt- MSE+ ARIHierarchy+
+                IOVSta: Migration-
+                Initial VFs: 8, Total VFs: 8, Number of VFs: 7, Function Dependency Link: 00
+                VF offset: 128, stride: 2, Device ID: 10ca
+                Supported Page Size: 00000553, System Page Size: 00000001
+                VF Migration: offset: 00000000, BIR: 0
+        Kernel driver in use: igb
+
+
+The function that has the SR-IOV capability is also known as Physical
+Function. You need the Physical Function driver (runs in the Dom0 and
+controls the physical resources allocation) to enable the Virtual Function.
+The following are the Virtual Functions associated with the above Physical Function.
+
+  $ lspci | grep -e 01:1[01].[0246]
+
+  01:10.0 Ethernet controller: Intel Corporation Device 10ca (rev 01)
+  01:10.2 Ethernet controller: Intel Corporation Device 10ca (rev 01)
+  01:10.4 Ethernet controller: Intel Corporation Device 10ca (rev 01)
+  01:10.6 Ethernet controller: Intel Corporation Device 10ca (rev 01)
+  01:11.0 Ethernet controller: Intel Corporation Device 10ca (rev 01)
+  01:11.2 Ethernet controller: Intel Corporation Device 10ca (rev 01)
+  01:11.4 Ethernet controller: Intel Corporation Device 10ca (rev 01)
+
+We can tell that Physical Function 01:00.0 has 7 Virtual Functions (01:10.0,
+01:10.2, 01:10.4, 01:10.6, 01:11.0, 01:11.2, 01:11.4). And the Virtual
+Function PCI Configuration Space looks just like normal PCI device.
+
+  $ lspci -s 01:10.0 -vvv
+
+  01:10.0 Ethernet controller: Intel Corporation 82576 Gigabit Virtual Function
+        Subsystem: Intel Corporation Gigabit Virtual Function
+        Control: I/O- Mem- BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx-
+        Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
+        Region 0: [virtual] Memory at d2840000 (64-bit, non-prefetchable) [size=16K]
+        Region 3: [virtual] Memory at d2860000 (64-bit, non-prefetchable) [size=16K]
+        Capabilities: [70] MSI-X: Enable+ Mask- TabSize=3
+                Vector table: BAR=3 offset=00000000
+                PBA: BAR=3 offset=00002000
+        Capabilities: [a0] Express (v2) Endpoint, MSI 00
+
+        ...
+
+
+The Virtual Function only appears after the Physical Function driver
+is loaded. Once the Physical Function driver is unloaded, all Virtual
+Functions associated with this Physical Function disappear.
+
+The Virtual Function is essentially the same as the normal PCI device when
+using it in VT-d environment. You need to hide the Virtual Function,
+use the Virtual Function bus, device and function number in the HVM
+guest configuration file and then boot the HVM guest. You also need the
+Virtual Function driver which is the normal PCI device driver in the
+HVM guest to drive the Virtual Function. The PCIe SR-IOV specification
+requires that the Virtual Function can only support MSI/MSI-x if it
+uses interrupts. This means you also need to enable Xen/MSI support.
+Since the Virtual Function is dynamically allocated by Physical Function
+driver, you might want to use the dynamic hiding method mentioned above.
diff --git a/docs/misc/xen-error-handling.txt b/docs/misc/xen-error-handling.txt
new file mode 100644 (file)
index 0000000..7728787
--- /dev/null
@@ -0,0 +1,88 @@
+Error handling in Xen
+---------------------
+
+1. domain_crash()
+-----------------
+Crash the specified domain due to buggy or unsupported behaviour of the
+guest. This should not be used where the hypervisor itself is in
+error, even if the scope of that error affects only a single
+domain. BUG() is a more appropriate failure method for hypervisor
+bugs. To repeat: domain_crash() is the correct response for erroneous
+or unsupported *guest* behaviour!
+
+Note that this should be used in most cases in preference to
+domain_crash_synchronous(): domain_crash() returns to the caller,
+allowing the crash to be deferred for the currently executing VCPU
+until certain resources (notably, spinlocks) have been released.
+
+Example usages:
+ * Unrecoverable guest kernel stack overflows
+ * Unsupported corners of HVM device models
+
+2. BUG()
+--------
+Crashes the host system with an informative file/line error message
+and a backtrace. Use this to check consistency assumptions within the
+hypervisor.
+
+Be careful not to use BUG() (or BUG_ON(), or ASSERT()) for failures
+*outside* the hypervisor software -- in particular, guest bugs (where
+domain_crash() is more appropriate) or non-critical BIOS or hardware
+errors (where retry or feature disable are more appropriate).
+
+Example usage: In arch/x86/hvm/i8254.c an I/O port handler includes
+the check BUG_ON(bytes != 1). We choose this extreme reaction to the
+unexpected error case because, although it could be handled by failing
+the I/O access or crashing the domain, it is indicative of an
+unexpected inconsistency in the hypervisor itself (since the I/O
+handler was only registered for single-byte accesses).
+
+
+3. BUG_ON()
+-----------
+BUG_ON(...) is merely a convenient short form for "if (...) BUG()". It
+is most commonly used as an 'always on' alternative to ASSERT().
+
+
+4. ASSERT()
+-----------
+Similar to BUG_ON(), except that it is only enabled for debug builds
+of the hypervisor. Typically ASSERT() is used only where the (usually
+small) overheads of an always-on debug check might be considered
+excessive. A good example might be within inner loops of time-critical
+functions, or where an assertion is extreme paranoia (considered
+*particularly* unlikely ever to fail).
+
+In general, if in doubt, use BUG_ON() in preference to ASSERT().
+
+
+5. panic()
+----------
+Like BUG() and ASSERT() this will crash and reboot the host
+system. However it does this after printing only an error message with
+no extra diagnostic information such as a backtrace. panic() is
+generally used where an unsupported system configuration is detected,
+particularly during boot, and where extra diagnostic information about
+CPU context would not be useful. It may also be used before exception
+handling is enabled during Xen bootstrap (on x86, BUG() and ASSERT()
+depend on Xen's exception-handling capabilities).
+
+Example usage: Most commonly for out-of-memory errors during
+bootstrap. The failure is unexpected since a host should always have
+enough memory to boot Xen, but if the failure does occur then the
+context of the failed memory allocation itself is not very
+interesting.
+
+
+6. Feature disable
+------------------
+A possible approach to dealing with boot-time errors, rather than
+crashing the hypervisor. It's particularly appropriate when parsing
+non-critical BIOS tables and detecting extended hardware features.
+
+
+7. BUILD_BUG_ON()
+-----------------
+Useful for assertions which can be evaluated at compile time. For
+example, making explicit assumptions about size and alignment of C
+structures.
diff --git a/docs/misc/xsm-flask.txt b/docs/misc/xsm-flask.txt
new file mode 100644 (file)
index 0000000..e27f651
--- /dev/null
@@ -0,0 +1,148 @@
+These notes are compiled from xen-devel questions and postings that have occurred
+since the inclusion of XSM.  These notes are not intended to be definitive
+documentation but should address many common problems that arise when
+experimenting with XSM:FLASK.
+
+Xen XSM:FLASK configuration
+---------------------------
+
+1) cd xen-unstable.hg
+2) edit Config.mk in the toplevel xen directory as follows:
+
+       XSM_ENABLE ?= y
+       FLASK_ENABLE ?= y
+       ACM_SECURITY ?= n
+       
+NB: Only one security module can be selected at a time.  If no module is
+selected, then the default DUMMY module will be enforced.  The DUMMY module
+only exercises the security framework and does not enforce any security
+policies.  Changing the security module selection will require recompiling xen.
+These settings will also configure the corresponding toolchain support.  
+
+3) make xen
+4) make tools
+
+
+Xen XSM:FLASK policy
+--------------------
+
+These instructions will enable the configuration and build of the sample policy.
+The sample policy provides the MINIMUM policy necessary to boot a
+paravirtualized dom0 and create a paravirtualized domU.  Many of the 
+default capabilities and usages supported by dom0/domU are disallowed by the
+sample policy.  Further, the policy is comprised of a limited number of types and 
+must be adjusted to meet the specific security goals of the installation. 
+Modification of the policy is straightforward and is covered in a later section.
+
+NB: The policy is not automatically built as part of the tool support because 
+of an external dependency on the checkpolicy compiler.  The FLASK policy uses 
+the same syntax and structure as SELinux and compiling the policy relies on 
+the SELinux policy toolchain.  This toolchain is available under many 
+distributions as well as the following URL,
+
+       http://userspace.selinuxproject.org/releases/20080909/stable/checkpolicy-1.34.7.tar.gz
+
+1) cd xen-unstable.hg/tools/flask/policy
+2) make policy
+3) cp policy.20 /boot/xenpolicy.20
+4) edit /etc/grub.conf, add a module line to the xen entry,
+
+       module /xenpolicy.20
+
+5) reboot, and select the updated xen entry
+
+NB: The module entry can be inserted on any line after the xen kernel line.  Typical
+configurations use the last module entry or the module entry that immediately 
+follows the xen kernel entry.
+
+Xen configuration of xend
+-------------------------
+
+1) cd /etc/xen
+2) edit xend-config.sxp
+3) uncomment the line containing the key:value pair entry, 
+
+       #(xsm_module_name dummy)
+
+4) change the value entry to 'flask'
+
+       (xsm_module_name flask)
+
+5) restart xend
+
+Creating policy controlled domains
+----------------------------------
+
+1) Edit the domain config file and add the following entry,
+
+       access_control = ["policy=,label=system_u:object_r:domU_t"]
+
+NB: The 'policy' field is not used by XSM:FLASK.  The 'label' must exist in the 
+loaded policy. 'system_u:object_r:domU_t' is one of the existing labels from 
+the sample policy and shown for example purposes.
+
+2) Create the domain using the 'xm create' command.
+3) Use the 'xm list -l' command to list the running domains and their labels.
+
+Updating the XSM:FLASK policy
+-----------------------------
+
+It is recommended that the XSM:FLASK policy be tailored to meet the specific
+security goals of the platform.  The policy is tailored by editing the xen.te 
+file in the 'policy' subdirectory.
+
+1) cd xen-unstable.hg/tools/flask/policy
+2) edit policy/modules/xen/xen.te - make changes to support platform security goals.
+3) make policy
+4) cp policy.20 /boot/xenpolicy.20
+5) reboot
+
+Alternatively, one may reload the policy using the 'flask_loadpolicy' tool
+installed by the xen tools.
+
+1) flask_loadpolicy policy.20
+
+NB: The sample policy permits policy reloads as well as general manipulation of
+the Flask security server only from dom0.  The policy can be tailored further to
+restrict policy reloads and other manipulations to boot-time only, by removing 
+the corresponding statements from the policy.
+
+Enforcing the XSM:FLASK policy
+------------------------------
+
+By default, XSM:FLASK is compiled and installed in permissive mode.  This
+configuration will allow an XSM:FLASK system to start in enforcing mode.
+
+1) edit /etc/grub.conf
+2) append the parameter 'flask_enforcing=1' to the xen kernel line.
+3) reboot, and select the updated xen entry
+
+
+Additional notes on XSM:FLASK
+-----------------------------
+
+1) xen command line parameters
+
+       a) flask_enforcing
+       
+       The default value for flask_enforcing is '0'.  This parameter causes the 
+       platform to boot in permissive mode which means that the policy is loaded 
+       but not enforced.  This mode is often helpful for developing new systems 
+       and policies as the policy violations are reported on the xen console and 
+       may be viewed in dom0 through 'xm dmesg'.
+       
+       To boot the platform into enforcing mode, which means that the policy is
+       loaded and enforced, append 'flask_enforcing=1' on the grub line.
+       
	This parameter may also be changed through the flask hypercall.
+       
+       b) flask_enabled
+       
+       The default value for flask_enabled is '1'.  This parameter causes the
+       platform to enable the FLASK security module under the XSM framework.
+       The parameter may be enabled/disabled only once per boot.  If the parameter
+       is set to '0', only a reboot can re-enable flask.  When flask_enabled is '0'
+       the DUMMY module is enforced.
+
+       This parameter may also be changed through the flask hypercall.  But may
+       only be performed once per boot.
index c4a8171970f4923706c18cd9870c44ce9e500919..69971cdd66fa43d1e8c150f43394f7618beccf4d 100644 (file)
@@ -4194,6 +4194,9 @@ writing to the VGA console after domain 0 starts booting (e.g., `vga=text-80x50,
 \item [ dma\_bits=xxx ] Specify width of DMA addresses in bits. This
   is used in NUMA systems to prevent this special DMA memory from
   being exhausted in one node when remote nodes have available memory.
+\item [ vcpu\_migration\_delay=$<$minimum\_time$>$] Set minimum time of 
  vcpu migration in microseconds (default 0). This parameter avoids aggressive
+  vcpu migration. For example, the linux kernel uses 0.5ms by default.
 \end{description}
 
 In addition, the following options may be specified on the Xen command
index 3d35bf67da787bf71effb876ea05654d1d603712..247a22b476081d73344f17e6f0a1589ec4d525e4 100644 (file)
@@ -51,6 +51,7 @@ Hollis Blanchard, IBM & Alastair Tse, XenSource \\
 Mike Day, IBM & Daniel Veillard, Red Hat \\
 Jim Fehlig, Novell & Tom Wilkie, University of Cambridge \\
 Jon Harrop, XenSource & Yosuke Iwamatsu, NEC \\
+Masaki Kanno, FUJITSU \\
 \end{tabular}
 \end{large}
 
index b65fc82ef54ed2e587f0ff38318300aa86ab802e..2e41b3858157a2615243a1b34f9e012dc94de9be 100644 (file)
     \end{flushleft}
    \end{minipage}\\
   \hline
+  1.0.7 & 20th Oct. 08 & M. Kanno &
+   \begin{minipage}[t]{7cm}
+    \begin{flushleft}
+     Added definitions of new classes DSCSI and PSCSI. Updated the table
+     and the diagram representing relationships between classes.
+     Added host.PSCSIs and VM.DSCSIs fields.
+    \end{flushleft}
+   \end{minipage}\\
+  \hline
  \end{tabular}
 \end{center}
index a8ed57a1a16ee0cba6dcd3d97ccb8a8178d30547..231afed09168859f15f16f38c0f701d6bb5a8e03 100644 (file)
 \newcommand{\coversheetlogo}{xen.eps}
 
 %% Document date
-\newcommand{\datestring}{24th July 2008}
+\newcommand{\datestring}{20th October 2008}
 
 \newcommand{\releasestatement}{Stable Release}
 
 %% Document revision
-\newcommand{\revstring}{API Revision 1.0.6}
+\newcommand{\revstring}{API Revision 1.0.7}
 
 %% Document authors
 \newcommand{\docauthors}{
index 62590b45d31bdc12ad53b619d919302de6b0cc97..2850b9364e3f9bc0497d8150615a6bb20f9467e8 100644 (file)
 digraph "Xen-API Class Diagram" {
 fontname="Verdana";
 
-node [ shape=box ]; session VM host network VIF PIF SR VDI VBD PBD user XSPolicy ACMPolicy;
-node [shape=ellipse]; PIF_metrics VIF_metrics VM_metrics VBD_metrics PBD_metrics VM_guest_metrics host_metrics;
-node [shape=box]; DPCI PPCI host_cpu console VTPM
+node [ shape=box ]; session VM host network VIF PIF SR VDI VBD PBD user;
+node [ shape=box ]; XSPolicy ACMPolicy DPCI PPCI host_cpu console VTPM;
+node [ shape=box ]; DSCSI PSCSI;
+node [ shape=ellipse ]; VM_metrics VM_guest_metrics host_metrics;
+node [ shape=ellipse ]; PIF_metrics VIF_metrics VBD_metrics PBD_metrics;
 session -> host [ arrowhead="none" ]
 session -> user [ arrowhead="none" ]
 VM -> VM_metrics [ arrowhead="none" ]
@@ -41,4 +43,7 @@ XSPolicy -> ACMPolicy [ arrowhead="none" ]
 DPCI -> VM [ arrowhead="none", arrowtail="crow" ]
 DPCI -> PPCI [ arrowhead="none" ]
 PPCI -> host [ arrowhead="none", arrowtail="crow" ]
+DSCSI -> VM [ arrowhead="none", arrowtail="crow" ]
+DSCSI -> PSCSI [ arrowhead="none" ]
+PSCSI -> host [ arrowhead="none", arrowtail="crow" ]
 }
index 7589489ae9739958d1aa5fc4214e8e01edb43c84..6eb2a41e06bfc470c0ebc5a4ad881d07bdcc1f9e 100644 (file)
@@ -46,6 +46,8 @@ Name & Description \\
 {\tt console} & A console \\
 {\tt DPCI} & A pass-through PCI device \\
 {\tt PPCI} & A physical PCI device \\
+{\tt DSCSI} & A half-virtualized SCSI device \\
+{\tt PSCSI} & A physical SCSI device \\
 {\tt user} & A user of the system \\
 {\tt debug} & A basic class for testing \\
 {\tt XSPolicy} & A class for handling Xen Security Policies \\
@@ -74,6 +76,8 @@ VTPM.VM & VM.VTPMs & one-to-many\\
 console.VM & VM.consoles & one-to-many\\
 DPCI.VM & VM.DPCIs & one-to-many\\
 PPCI.host & host.PPCIs & one-to-many\\
+DSCSI.VM & VM.DSCSIs & one-to-many\\
+PSCSI.host & host.PSCSIs & one-to-many\\
 host.resident\_VMs & VM.resident\_on & many-to-one\\
 host.host\_CPUs & host\_cpu.host & many-to-one\\
 \hline
@@ -1407,6 +1411,7 @@ $\mathit{RO}_\mathit{run}$ &  {\tt VBDs} & (VBD ref) Set & virtual block devices
 $\mathit{RO}_\mathit{run}$ &  {\tt crash\_dumps} & (crashdump ref) Set & crash dumps associated with this VM \\
 $\mathit{RO}_\mathit{run}$ &  {\tt VTPMs} & (VTPM ref) Set & virtual TPMs \\
 $\mathit{RO}_\mathit{run}$ &  {\tt DPCIs} & (DPCI ref) Set & pass-through PCI devices \\
+$\mathit{RO}_\mathit{run}$ &  {\tt DSCSIs} & (DSCSI ref) Set & half-virtualized SCSI devices \\
 $\mathit{RW}$ &  {\tt PV/bootloader} & string & name of or path to bootloader \\
 $\mathit{RW}$ &  {\tt PV/kernel} & string & path to the kernel \\
 $\mathit{RW}$ &  {\tt PV/ramdisk} & string & path to the initrd \\
@@ -3446,6 +3451,38 @@ Get the DPCIs field of the given VM.
 }
 
 
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_DSCSIs}
+
+{\bf Overview:} 
+Get the DSCSIs field of the given VM.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} ((DSCSI ref) Set) get_DSCSIs (session_id s, VM ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt VM ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+(DSCSI ref) Set
+}
+
+
 value of the field
 \vspace{0.3cm}
 \vspace{0.3cm}
@@ -5518,6 +5555,7 @@ $\mathit{RW}$ &  {\tt suspend\_image\_sr} & SR ref & The SR in which VDIs for su
 $\mathit{RW}$ &  {\tt crash\_dump\_sr} & SR ref & The SR in which VDIs for crash dumps are created \\
 $\mathit{RO}_\mathit{run}$ &  {\tt PBDs} & (PBD ref) Set & physical blockdevices \\
 $\mathit{RO}_\mathit{run}$ &  {\tt PPCIs} & (PPCI ref) Set & physical PCI devices \\
+$\mathit{RO}_\mathit{run}$ &  {\tt PSCSIs} & (PSCSI ref) Set & physical SCSI devices \\
 $\mathit{RO}_\mathit{run}$ &  {\tt host\_CPUs} & (host\_cpu ref) Set & The physical CPUs on this host \\
 $\mathit{RO}_\mathit{run}$ &  {\tt metrics} & host\_metrics ref & metrics associated with this host \\
 \hline
@@ -6840,6 +6878,38 @@ Get the PPCIs field of the given host.
 }
 
 
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_PSCSIs}
+
+{\bf Overview:} 
+Get the PSCSIs field of the given host.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} ((PSCSI ref) Set) get_PSCSIs (session_id s, host ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt host ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+(PSCSI ref) Set
+}
+
+
 value of the field
 \vspace{0.3cm}
 \vspace{0.3cm}
@@ -15716,6 +15786,1096 @@ PPCI record
 }
 
 
+all fields from the object
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+
+\vspace{1cm}
+\newpage
+\section{Class: DSCSI}
+\subsection{Fields for class: DSCSI}
+\begin{longtable}{|lllp{0.38\textwidth}|}
+\hline
+\multicolumn{1}{|l}{Name} & \multicolumn{3}{l|}{\bf DSCSI} \\
+\multicolumn{1}{|l}{Description} & \multicolumn{3}{l|}{\parbox{11cm}{\em A
+half-virtualized SCSI device.}} \\
+\hline
+Quals & Field & Type & Description \\
+\hline
+$\mathit{RO}_\mathit{run}$ &  {\tt uuid} & string & unique identifier/object reference \\
+$\mathit{RO}_\mathit{inst}$ &  {\tt VM} & VM ref & the virtual machine \\
+$\mathit{RO}_\mathit{inst}$ &  {\tt PSCSI} & PSCSI ref & the physical SCSI device \\
+$\mathit{RO}_\mathit{run}$ &  {\tt virtual\_host} & int & the virtual host number \\
+$\mathit{RO}_\mathit{run}$ &  {\tt virtual\_channel} & int & the virtual channel number \\
+$\mathit{RO}_\mathit{run}$ &  {\tt virtual\_target} & int & the virtual target number \\
+$\mathit{RO}_\mathit{run}$ &  {\tt virtual\_lun} & int & the virtual logical unit number \\
+$\mathit{RO}_\mathit{inst}$ &  {\tt virtual\_HCTL} & string & the virtual HCTL \\
+$\mathit{RO}_\mathit{run}$ &  {\tt runtime\_properties} & (string $\rightarrow$ string) Map & Device runtime properties \\
+\hline
+\end{longtable}
+\subsection{RPCs associated with class: DSCSI}
+\subsubsection{RPC name:~get\_all}
+
+{\bf Overview:} 
+Return a list of all the DSCSIs known to the system.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} ((DSCSI ref) Set) get_all (session_id s)\end{verbatim}
+
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+(DSCSI ref) Set
+}
+
+
+references to all objects
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_uuid}
+
+{\bf Overview:} 
+Get the uuid field of the given DSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} string get_uuid (session_id s, DSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt DSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt
+string
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_VM}
+
+{\bf Overview:} 
+Get the VM field of the given DSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} (VM ref) get_VM (session_id s, DSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt DSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+VM ref
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_PSCSI}
+
+{\bf Overview:} 
+Get the PSCSI field of the given DSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} (PSCSI ref) get_PSCSI (session_id s, DSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt DSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+PSCSI ref
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_virtual\_host}
+
+{\bf Overview:} 
+Get the virtual\_host field of the given DSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} int get_virtual_host (session_id s, DSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt DSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+int
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_virtual\_channel}
+
+{\bf Overview:} 
+Get the virtual\_channel field of the given DSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} int get_virtual_channel (session_id s, DSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt DSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+int
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_virtual\_target}
+
+{\bf Overview:} 
+Get the virtual\_target field of the given DSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} int get_virtual_target (session_id s, DSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt DSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+int
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_virtual\_lun}
+
+{\bf Overview:} 
+Get the virtual\_lun field of the given DSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} int get_virtual_lun (session_id s, DSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt DSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+int
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_virtual\_HCTL}
+
+{\bf Overview:} 
+Get the virtual\_HCTL field of the given DSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} string get_virtual_HCTL (session_id s, DSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt DSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+string
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_runtime\_properties}
+
+{\bf Overview:} 
+Get the runtime\_properties field of the given DSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} ((string -> string) Map) get_runtime_properties (session_id s, DSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt DSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+(string $\rightarrow$ string) Map
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~create}
+
+{\bf Overview:} 
+Create a new DSCSI instance, and return its handle.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} (DSCSI ref) create (session_id s, DSCSI record args)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt DSCSI record } & args & All constructor arguments \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+DSCSI ref
+}
+
+
+reference to the newly created object
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~destroy}
+
+{\bf Overview:} 
+Destroy the specified DSCSI instance.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} void destroy (session_id s, DSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt DSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+void
+}
+
+
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_by\_uuid}
+
+{\bf Overview:} 
+Get a reference to the DSCSI instance with the specified UUID.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} (DSCSI ref) get_by_uuid (session_id s, string uuid)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt string } & uuid & UUID of object to return \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+DSCSI ref
+}
+
+
+reference to the object
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_record}
+
+{\bf Overview:} 
+Get a record containing the current state of the given DSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} (DSCSI record) get_record (session_id s, DSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt DSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+DSCSI record
+}
+
+
+all fields from the object
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+
+\vspace{1cm}
+\newpage
+\section{Class: PSCSI}
+\subsection{Fields for class: PSCSI}
+\begin{longtable}{|lllp{0.38\textwidth}|}
+\hline
+\multicolumn{1}{|l}{Name} & \multicolumn{3}{l|}{\bf PSCSI} \\
+\multicolumn{1}{|l}{Description} & \multicolumn{3}{l|}{\parbox{11cm}{\em A
+physical SCSI device.}} \\
+\hline
+Quals & Field & Type & Description \\
+\hline
+$\mathit{RO}_\mathit{run}$ &  {\tt uuid} & string & unique identifier/object reference \\
+$\mathit{RO}_\mathit{run}$ &  {\tt host} & host ref &  the physical machine to which this PSCSI is connected \\
+$\mathit{RO}_\mathit{run}$ &  {\tt physical\_host} & int & the physical host number \\
+$\mathit{RO}_\mathit{run}$ &  {\tt physical\_channel} & int & the physical channel number \\
+$\mathit{RO}_\mathit{run}$ &  {\tt physical\_target} & int & the physical target number \\
+$\mathit{RO}_\mathit{run}$ &  {\tt physical\_lun} & int & the physical logical unit number \\
+$\mathit{RO}_\mathit{run}$ &  {\tt physical\_HCTL} & string & the physical HCTL \\
+$\mathit{RO}_\mathit{run}$ &  {\tt vendor\_name} & string & the vendor name \\
+$\mathit{RO}_\mathit{run}$ &  {\tt model} & string & the model \\
+$\mathit{RO}_\mathit{run}$ &  {\tt type\_id} & int & the SCSI type ID \\
+$\mathit{RO}_\mathit{run}$ &  {\tt type} & string &  the SCSI type \\
+$\mathit{RO}_\mathit{run}$ &  {\tt dev\_name} & string & the SCSI device name (e.g. sda or st0) \\
+$\mathit{RO}_\mathit{run}$ &  {\tt sg\_name} & string & the SCSI generic device name (e.g. sg0) \\
+$\mathit{RO}_\mathit{run}$ &  {\tt revision} & string & the revision \\
+$\mathit{RO}_\mathit{run}$ &  {\tt scsi\_id} & string & the SCSI ID \\
+$\mathit{RO}_\mathit{run}$ &  {\tt scsi\_level} & int & the SCSI level \\
+\hline
+\end{longtable}
+\subsection{RPCs associated with class: PSCSI}
+\subsubsection{RPC name:~get\_all}
+
+{\bf Overview:} 
+Return a list of all the PSCSIs known to the system.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} ((PSCSI ref) Set) get_all (session_id s)\end{verbatim}
+
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+(PSCSI ref) Set
+}
+
+
+references to all objects
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_uuid}
+
+{\bf Overview:} 
+Get the uuid field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} string get_uuid (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+string
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_host}
+
+{\bf Overview:} 
+Get the host field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} (host ref) get_host (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+host ref
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_physical\_host}
+
+{\bf Overview:} 
+Get the physical\_host field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} int get_physical_host (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+int
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_physical\_channel}
+
+{\bf Overview:} 
+Get the physical\_channel field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} int get_physical_channel (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+int
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_physical\_target}
+
+{\bf Overview:} 
+Get the physical\_target field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} int get_physical_target (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+int
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_physical\_lun}
+
+{\bf Overview:} 
+Get the physical\_lun field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} int get_physical_lun (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+int
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_physical\_HCTL}
+
+{\bf Overview:} 
+Get the physical\_HCTL field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} string get_physical_HCTL (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+string
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_vendor\_name}
+
+{\bf Overview:} 
+Get the vendor\_name field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} string get_vendor_name (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+string
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_model}
+
+{\bf Overview:} 
+Get the model field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} string get_model (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+string
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_type\_id}
+
+{\bf Overview:} 
+Get the type\_id field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} int get_type_id (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+int
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_type}
+
+{\bf Overview:} 
+Get the type field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} string get_type (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+string
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_dev\_name}
+
+{\bf Overview:} 
+Get the dev\_name field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} string get_dev_name (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+string
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_sg\_name}
+
+{\bf Overview:} 
+Get the sg\_name field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} string get_sg_name (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+string
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_revision}
+
+{\bf Overview:} 
+Get the revision field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} string get_revision (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+string
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_scsi\_id}
+
+{\bf Overview:} 
+Get the scsi\_id field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} string get_scsi_id (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+string
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_scsi\_level}
+
+{\bf Overview:} 
+Get the scsi\_level field of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} int get_scsi_level (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+int
+}
+
+
+value of the field
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_by\_uuid}
+
+{\bf Overview:} 
+Get a reference to the PSCSI instance with the specified UUID.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} (PSCSI ref) get_by_uuid (session_id s, string uuid)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt string } & uuid & UUID of object to return \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+PSCSI ref
+}
+
+
+reference to the object
+\vspace{0.3cm}
+\vspace{0.3cm}
+\vspace{0.3cm}
+\subsubsection{RPC name:~get\_record}
+
+{\bf Overview:} 
+Get a record containing the current state of the given PSCSI.
+
+ \noindent {\bf Signature:} 
+\begin{verbatim} (PSCSI record) get_record (session_id s, PSCSI ref self)\end{verbatim}
+
+
+\noindent{\bf Arguments:}
+
+
+\vspace{0.3cm}
+\begin{tabular}{|c|c|p{7cm}|}
+ \hline
+{\bf type} & {\bf name} & {\bf description} \\ \hline
+{\tt PSCSI ref } & self & reference to the object \\ \hline 
+
+\end{tabular}
+
+\vspace{0.3cm}
+
+ \noindent {\bf Return Type:} 
+{\tt 
+PSCSI record
+}
+
+
 all fields from the object
 \vspace{0.3cm}
 \vspace{0.3cm}
index ec24cba14af5a9edda1ac3d8d0da3b3110c5f405..78e0c9d527ab20e2aa75c4ea48f64d1a31b9e645 100644 (file)
@@ -93,8 +93,12 @@ endif
 $(OBJ_DIR)/$(TARGET)_app.o: $(APP_OBJS) app.lds
        $(LD) -r -d $(LDFLAGS) -\( $^ -\) $(APP_LDLIBS) --undefined main -o $@
 
-$(OBJ_DIR)/$(TARGET): links $(OBJS) $(OBJ_DIR)/$(TARGET)_app.o arch_lib
-       $(LD) -r $(LDFLAGS) $(HEAD_OBJ) $(OBJ_DIR)/$(TARGET)_app.o $(OBJS) $(LDARCHLIB) $(LDLIBS) -o $@.o
+ifneq ($(APP_OBJS),)
+APP_O=$(OBJ_DIR)/$(TARGET)_app.o 
+endif
+
+$(OBJ_DIR)/$(TARGET): links $(OBJS) $(APP_O) arch_lib
+       $(LD) -r $(LDFLAGS) $(HEAD_OBJ) $(APP_O) $(OBJS) $(LDARCHLIB) $(LDLIBS) -o $@.o
        $(OBJCOPY) -w -G $(GLOBAL_PREFIX)* -G _start $@.o $@.o
        $(LD) $(LDFLAGS) $(LDFLAGS_FINAL) $@.o $(EXTRA_OBJS) -o $@
        gzip -f -9 -c $@ >$@.gz
index 12168713eef49bcd0da9198a937cd630259f6c7c..e6a17d3eb7de6c61dd092570ee0dac09041c2279 100644 (file)
@@ -1,5 +1,3 @@
-# Build for Big Endian?
-BIGENDIAN ?= n
 
 ARCH_CFLAGS := -mfixed-range=f2-f5,f12-f15,f32-f127 -mconstant-gp
 ARCH_CFLAGS += -O2
@@ -9,12 +7,3 @@ ARCH_ASFLAGS += -fno-builtin -fno-common -fno-strict-aliasing -mconstant-gp
 
 ARCH_LDFLAGS = -warn-common
 
-# Next lines are for big endian code !
-ifeq ($(BIGENDIAN),y)
-ARCH_CFLAGS += -mbig-endian -Wa,-mbe -Wa,-mlp64
-ARCH_CFLAGS += -DBIG_ENDIAN
-ARCH_ASFLAGS += -Wa,-mbe
-ARCH_ASFLAGS += -DBIG_ENDIAN
-ARCH_LDFLAGS = -EB -d
-endif
-
index c65f0a0d075afef07d9ec2215613641b017a3a78..45b770326c2137b72229fef2f06a2157ab3fa318 100644 (file)
@@ -116,8 +116,8 @@ registerCallback(void)
 {
        struct callback_register event =
        {
-               .type = SWAP(CALLBACKTYPE_event),
-               .address = SWAP((unsigned long)&hypervisor_callback),
+               .type = CALLBACKTYPE_event,
+               .address = (unsigned long)&hypervisor_callback,
        };
        HYPERVISOR_callback_op(CALLBACKOP_register, &event);
 }
@@ -126,46 +126,44 @@ static void
 init_start_info(start_info_t* xen_start_info)
 {
        /* Make a copy of the start_info structure */
-       start_info.nr_pages = SWAP(xen_start_info->nr_pages);
-       start_info.shared_info = SWAP(xen_start_info->shared_info);
-       start_info.flags = SWAP(xen_start_info->flags);
-       start_info.store_mfn = SWAP(xen_start_info->store_mfn);
-       start_info.store_evtchn = SWAP(xen_start_info->store_evtchn);
-       start_info.console.domU.mfn = SWAP(xen_start_info->console.domU.mfn);
+       start_info.nr_pages = xen_start_info->nr_pages;
+       start_info.shared_info = xen_start_info->shared_info;
+       start_info.flags = xen_start_info->flags;
+       start_info.store_mfn = xen_start_info->store_mfn;
+       start_info.store_evtchn = xen_start_info->store_evtchn;
+       start_info.console.domU.mfn = xen_start_info->console.domU.mfn;
        start_info.console.domU.evtchn =
-                               SWAP(xen_start_info->console.domU.evtchn);
-       start_info.pt_base = SWAP(xen_start_info->pt_base);
-       start_info.nr_pt_frames = SWAP(xen_start_info->nr_pt_frames);
-       start_info.mfn_list = SWAP(xen_start_info->mfn_list);
-       start_info.mod_start = SWAP(xen_start_info->mod_start);
-       start_info.mod_len = SWAP(xen_start_info->mod_len);
+                               xen_start_info->console.domU.evtchn;
+       start_info.pt_base = xen_start_info->pt_base;
+       start_info.nr_pt_frames = xen_start_info->nr_pt_frames;
+       start_info.mfn_list = xen_start_info->mfn_list;
+       start_info.mod_start = xen_start_info->mod_start;
+       start_info.mod_len = xen_start_info->mod_len;
 }
 
 static void
 init_boot_params(void)
 {
-       ia64BootParamG.command_line = SWAP(ia64_boot_paramP->command_line);
-       ia64BootParamG.efi_systab = SWAP(ia64_boot_paramP->efi_systab);
-       ia64BootParamG.efi_memmap = SWAP(ia64_boot_paramP->efi_memmap);
-       ia64BootParamG.efi_memmap_size =
-                               SWAP(ia64_boot_paramP->efi_memmap_size);
-       ia64BootParamG.efi_memdesc_size =
-                               SWAP(ia64_boot_paramP->efi_memdesc_size);
+       ia64BootParamG.command_line = ia64_boot_paramP->command_line;
+       ia64BootParamG.efi_systab = ia64_boot_paramP->efi_systab;
+       ia64BootParamG.efi_memmap = ia64_boot_paramP->efi_memmap;
+       ia64BootParamG.efi_memmap_size = ia64_boot_paramP->efi_memmap_size;
+       ia64BootParamG.efi_memdesc_size = ia64_boot_paramP->efi_memdesc_size;
        ia64BootParamG.efi_memdesc_version =
-                               SWAP(ia64_boot_paramP->efi_memdesc_version);
+                               ia64_boot_paramP->efi_memdesc_version;
        ia64BootParamG.console_info.num_cols =
-                               SWAP(ia64_boot_paramP->console_info.num_cols);
+                               ia64_boot_paramP->console_info.num_cols;
        ia64BootParamG.console_info.num_rows =
-                               SWAP(ia64_boot_paramP->console_info.num_rows);
+                               ia64_boot_paramP->console_info.num_rows;
        ia64BootParamG.console_info.orig_x =
-                               SWAP(ia64_boot_paramP->console_info.orig_x);
+                               ia64_boot_paramP->console_info.orig_x;
        ia64BootParamG.console_info.orig_y =
-                               SWAP(ia64_boot_paramP->console_info.orig_y);
-       ia64BootParamG.fpswa = SWAP(ia64_boot_paramP->fpswa);
-       ia64BootParamG.initrd_start = SWAP(ia64_boot_paramP->initrd_start);
-       ia64BootParamG.initrd_size = SWAP(ia64_boot_paramP->initrd_size);
-       ia64BootParamG.domain_start = SWAP(ia64_boot_paramP->domain_start);
-       ia64BootParamG.domain_size = SWAP(ia64_boot_paramP->domain_size);
+                               ia64_boot_paramP->console_info.orig_y;
+       ia64BootParamG.fpswa = ia64_boot_paramP->fpswa;
+       ia64BootParamG.initrd_start = ia64_boot_paramP->initrd_start;
+       ia64BootParamG.initrd_size = ia64_boot_paramP->initrd_size;
+       ia64BootParamG.domain_start = ia64_boot_paramP->domain_start;
+       ia64BootParamG.domain_size = ia64_boot_paramP->domain_size;
 
        /*
         * Copy and parse the boot command line.
index d95252b4c3ffcae21232ed18934ec94c62608909..f5b378381fbfe80c3de13e82bdb0ffac667ba7f3 100644 (file)
@@ -102,7 +102,6 @@ static const char *ia64_vector_names[] = {
 
 typedef struct
 {
-#if !defined(BIG_ENDIAN)
        uint64_t sof    :7;     /* 0-6 size of frame */
        uint64_t sol    :7;     /* 7-13 size of locals (in + loc) */
        uint64_t sor    :4;
@@ -111,16 +110,6 @@ typedef struct
        uint64_t rrb_pr :6;
        uint64_t res    :25;    /* reserved */
        uint64_t v      :1;     /* The v bit */
-#else /* !BIG_ENDIAN */
-       uint64_t v      :1;     /* The v bit */
-       uint64_t res    :25;    /* reserved */
-       uint64_t rrb_pr :6;
-       uint64_t rrb_fr :7;
-       uint64_t rrb_gr :7;
-       uint64_t sor    :4;
-       uint64_t sol    :7;     /* 7-13 size of locals (in + loc) */
-       uint64_t sof    :7;     /* 0-6 size of frame */
-#endif /* BIG_ENDIAN */
 } ifs_t;
 
 void
index 498eac42adfcfb4c9ed9e91276f78ef95d5dc6c3..cdb25b33ece5ec5face2395998ede51dd3bb1b22 100644 (file)
@@ -49,13 +49,6 @@ efi_get_time(efi_time_t* tmP)
                printk("efi.getTime() failed\n");
                return 0;
        }
-
-#if defined(BIG_ENDIAN)
-       tmP->Year = SWAP(tmP->Year);
-       tmP->TimeZone = SWAP(tmP->TimeZone);
-       tmP->Nanosecond = SWAP(tmP->Nanosecond);
-#endif
-
        return 1;
 }
 
@@ -65,17 +58,7 @@ efi_get_time(efi_time_t* tmP)
 static int
 efi_guid_cmp(efi_guid_t* a_le, efi_guid_t* b)
 {
-#if defined(BIG_ENDIAN)
-       if(SWAP(a_le->Data1) != b->Data1)
-               return 1;
-       if(SWAP(a_le->Data2) != b->Data2)
-               return 1;
-       if(SWAP(a_le->Data3) != b->Data3)
-               return 1;
-       return memcmp(a_le->Data4, b->Data4, sizeof(uint8_t)*8);
-#else
        return memcmp(a_le, b, sizeof(efi_guid_t));
-#endif
 }
 
 void
@@ -99,20 +82,20 @@ init_efi(void)
        efiSysTableP = (efi_system_table_t*)__va(ia64BootParamG.efi_systab);
        machineFwG.efi.efiSysTableP = efiSysTableP;
        PRINT_BV("EfiSystemTable at: %p\n", efiSysTableP);
-       fwP = (uint16_t*) __va(SWAP(efiSysTableP->FirmwareVendor));
+       fwP = (uint16_t*) __va(efiSysTableP->FirmwareVendor);
        if (fwP) {
                for (i = 0; i < (int)sizeof(fwVendor) - 1 && *fwP; ++i)
-                       fwVendor[i] = SWAP(*fwP++);
+                       fwVendor[i] = *fwP++;
                fwVendor[i] = '\0';
        }
        PRINT_BV("  EFI-FirmwareVendor        : %s\n", fwVendor);
        PRINT_BV("  EFI-FirmwareRevision      : %d\n",
-                SWAP(efiSysTableP->FirmwareRevision));
+                efiSysTableP->FirmwareRevision);
        PRINT_BV("  EFI-SystemTable-Revision  : %d.%d\n",
-                SWAP(efiSysTableP->Hdr.Revision)>>16,
-                SWAP(efiSysTableP->Hdr.Revision)&0xffff);
+                efiSysTableP->Hdr.Revision >> 16,
+                efiSysTableP->Hdr.Revision & 0xffff);
        rsP = (efi_runtime_services_t*)
-               __va(SWAP(efiSysTableP->RuntimeServices));
+               __va(efiSysTableP->RuntimeServices);
        mdcnt = ia64BootParamG.efi_memmap_size /
                ia64BootParamG.efi_memdesc_size;
        memdP = (efi_memory_descriptor_t*) __va(ia64BootParamG.efi_memmap);
@@ -123,10 +106,10 @@ init_efi(void)
             mdP = NextMemoryDescriptor(mdP, ia64BootParamG.efi_memdesc_size)) {
                /* Relocate runtime memory segments for firmware. */
                PRINT_BV("  %d. Type: %x  Attributes: 0x%lx\n",
-                        i, SWAP(mdP->Type), SWAP(mdP->Attribute));
+                        i, mdP->Type, mdP->Attribute);
                PRINT_BV("     PhysStart: 0x%lx  NumPages: 0x%lx\n",
-                        SWAP(mdP->PhysicalStart), SWAP(mdP->NumberOfPages));
-               switch (SWAP(mdP->Type)) {
+                        mdP->PhysicalStart, mdP->NumberOfPages);
+               switch (mdP->Type) {
                        case EfiRuntimeServicesData:
                                PRINT_BV("     -> EfiRuntimeServicesData\n");
                                break;
@@ -139,18 +122,17 @@ init_efi(void)
                        case EfiConventionalMemory:
                                PRINT_BV("     -> EfiConventionalMemory\n");
                                PRINT_BV("        start: 0x%lx end: 0x%lx\n",
-                                       SWAP(mdP->PhysicalStart),
-                                       SWAP(mdP->PhysicalStart)+
-                                       SWAP(mdP->NumberOfPages)*EFI_PAGE_SIZE);
+                                       mdP->PhysicalStart,
+                                       mdP->PhysicalStart +
+                                       mdP->NumberOfPages * EFI_PAGE_SIZE);
                                if (numConvMem) {
                                        printk("     Currently only one efi "
                                                "memory chunk supported !!!\n");
                                        break;
                                }
-                               machineFwG.mach_mem_start =
-                                       SWAP(mdP->PhysicalStart);
+                               machineFwG.mach_mem_start = mdP->PhysicalStart;
                                machineFwG.mach_mem_size =
-                                       SWAP(mdP->NumberOfPages)*EFI_PAGE_SIZE;
+                                       mdP->NumberOfPages * EFI_PAGE_SIZE;
                                numConvMem++;
                                break;
                        case EfiMemoryMappedIOPortSpace:
@@ -158,7 +140,7 @@ init_efi(void)
                                break;
                        case EfiPalCode:
                                        machineFwG.ia64_pal_base =
-                                       __va(SWAP(mdP->PhysicalStart));
+                                       __va(mdP->PhysicalStart);
                                PRINT_BV("     -> EfiPalCode\n"
                                         "        start : %p\n",
                                         machineFwG.ia64_pal_base);
@@ -170,12 +152,11 @@ init_efi(void)
                 * virtual addressing and the efi runtime functions
                 * may be called directly.
                 */
-               if (SWAP(mdP->Attribute) & EFI_MEMORY_RUNTIME) {
-                       if (SWAP(mdP->Attribute) & EFI_MEMORY_WB)
-                               mdP->VirtualStart =
-                                       SWAP(__va(mdP->PhysicalStart));
+               if (mdP->Attribute & EFI_MEMORY_RUNTIME) {
+                       if (mdP->Attribute & EFI_MEMORY_WB)
+                               mdP->VirtualStart = __va(mdP->PhysicalStart);
                        else {
-                               if (SWAP(mdP->Attribute) & EFI_MEMORY_UC)
+                               if (mdP->Attribute & EFI_MEMORY_UC)
                                        printk("efi_init: RuntimeMemory with "
                                                "UC attribute !!!!!!\n");
                                        /*
@@ -187,7 +168,7 @@ init_efi(void)
        }
        /* Now switch efi runtime stuff to virtual addressing. */
        status = ia64_call_efi_physical(
-                       (void*)__va(SWAP((uint64_t)rsP->SetVirtualAddressMap)),
+                       (void*)__va((uint64_t)rsP->SetVirtualAddressMap),
                        ia64BootParamG.efi_memmap_size,
                        ia64BootParamG.efi_memdesc_size,
                        ia64BootParamG.efi_memdesc_version,
@@ -200,35 +181,35 @@ init_efi(void)
        }
        /* Getting efi function pointer for getEfiTime. */
        machineFwG.efi.getTimeF =
-               (efi_get_time_t)__va(SWAP((uint64_t)rsP->GetTime));
+               (efi_get_time_t)__va((uint64_t)rsP->GetTime);
        /* Getting efi function pointer for resetSystem. */
        machineFwG.efi.resetSystemF =
-               (efi_reset_system_t)__va(SWAP((uint64_t)rsP->ResetSystem));
+               (efi_reset_system_t)__va((uint64_t)rsP->ResetSystem);
 
        /* Scanning the Configuration table of the EfiSystemTable. */
        PRINT_BV("NumberOfConfigTableEntries: %ld\n",
-                SWAP(efiSysTableP->NumberOfTableEntries));
+                efiSysTableP->NumberOfTableEntries);
 
        confP = (efi_configuration_table_t*)
-                       __va(SWAP(efiSysTableP->ConfigurationTable));
-       for (i = 0; i < SWAP(efiSysTableP->NumberOfTableEntries); i++) {
+                       __va(efiSysTableP->ConfigurationTable);
+       for (i = 0; i < efiSysTableP->NumberOfTableEntries; i++) {
                if (!efi_guid_cmp(&confP[i].VendorGuid, &sal)) {
                        machineFwG.ia64_sal_tableP = (sal_system_table_t*)
-                               __va(SWAP((uint64_t) confP[i].VendorTable));
+                               __va((uint64_t) confP[i].VendorTable);
                        PRINT_BV("  Found SalSystemTable at: 0x%lx\n",
                                 (uint64_t) machineFwG.ia64_sal_tableP);
                        continue;
                }
                if (!efi_guid_cmp(&confP[i].VendorGuid, &acpi)) {
                        machineFwG.ia64_efi_acpi_table =
-                               __va(SWAP((uint64_t) confP[i].VendorTable));
+                               __va((uint64_t) confP[i].VendorTable);
                        PRINT_BV("  Found AcpiTable at:      0x%lx\n",
                                 (uint64_t) machineFwG.ia64_efi_acpi_table);
                        continue;
                }
                if (!efi_guid_cmp(&confP[i].VendorGuid, &acpi20)) {
                        machineFwG.ia64_efi_acpi20_table =
-                               __va(SWAP((uint64_t) confP[i].VendorTable));
+                               __va((uint64_t) confP[i].VendorTable);
                        PRINT_BV("  Found Acpi20Table at:    0x%lx\n",
                                 (uint64_t) machineFwG.ia64_efi_acpi20_table);
                        continue;
index 48bb851e5d9cec8d9e71b89d453de003e577e864..ef2c4d2d2f1ad28c04b16fed858d47ec69e7c5fd 100644 (file)
@@ -155,11 +155,6 @@ ENTRY(ia64_call_efi_physical)
        ld8     r14=[in0],8             // function address
        ;;
        ld8     gp=[in0]                // function gp value
-#if defined(BIG_ENDIAN)
-       mux1    r14=r14,@rev            // swap because mini-os is in BE
-       mov     ar.rsc=3
-       ;;
-#endif
        mov     out0=in1
        mov     out1=in2
        mov     out2=in3
@@ -167,19 +162,7 @@ ENTRY(ia64_call_efi_physical)
        mov     out4=in5
        mov     b6=r14
        ;;
-#if defined(BIG_ENDIAN)
-       mux1    gp=gp,@rev              // swap because mini-os is in BE
-       rum IA64_PSR_BE
-       ;;
-#endif
-
        br.call.sptk.many rp=b6         // call EFI procedure
-
-#if defined(BIG_ENDIAN)
-       ;;
-       sum IA64_PSR_BE
-       mov     ar.rsc=IA64_RSE_EAGER
-#endif
        mov     gp=loc3                 // restore kernel gp
        mov     r14=loc2                // psr to restore mode
        ;;
@@ -227,16 +210,8 @@ psrsave    =       loc4
        mov     b0=palret
        rsm     psr.i                   // disable interrupts
        ;;
-#if defined(BIG_ENDIAN)
-       rum     IA64_PSR_BE             // set psr.be==0
-       ;;
-#endif
        br.cond.sptk b6                 // call into firmware
        ;;
-#if defined(BIG_ENDIAN)
-       sum     IA64_PSR_BE             // set psr.be==1
-       ;;
-#endif
        ssm     psr.i                   // enable interrupts
        ;;
 2:     mov     psr.l=psrsave
@@ -271,25 +246,9 @@ ENTRY(ia64_call_efi_func)
        ;;
        ld8     gp=[in0]                // function gp value
        ;;
-#if defined(BIG_ENDIAN)
-       mux1    r14=r14,@rev            // swap if mini-os is in BE
-       mux1    gp=gp,@rev              // swap if mini-os is in BE
-#endif
-       ;;
        mov     b6=r14
-
-#if defined(BIG_ENDIAN)
-       rum     IA64_PSR_BE
-       ;;
-#endif
-
        br.call.sptk.many rp=b6         // call EFI procedure
        
-#if defined(BIG_ENDIAN)
-       sum     IA64_PSR_BE
-       ;;
-#endif
-       
        mov     ar.pfs=loc0
        mov     gp=loc1
        mov     rp=loc2
index 7ea9433199dfaa1b42293ababdc96efc6c908e3a..adcf1769a2953c3b375a8c8dbb41b1bb037c28b3 100644 (file)
@@ -205,10 +205,6 @@ ENTRY(_start)
        START_INFO_PFN_ld       r14=[r15]       // load the start_info_pfn
        add     r16=7, r0
        ;;
-#if defined(BIG_ENDIAN)
-       mux1    r14=r14,@rev            // swap because mini-os is in BE
-#endif
-       ;;
        shl     r15=r14,PAGE_SHIFT_XEN_16K      // pfn << PAGE_SHIFT_XEN_16K
        shl     r16=r16,IA64_RR_IDX_POS         // (7<<IA64_RR_IDX_POS)
        ;;
index 79971fef34c63ab1591f7854343ffa41c91361a1..8971b3bee9b8c26df01c013adf46e48bc0388194 100644 (file)
@@ -87,11 +87,6 @@ ENTRY(save_tf_rse_switch)
        ;;
        ld8     r21=[r21]               // XEN.ipsr
        ld8     r22=[r22];;             // XEN.iip
-#if defined(BIG_ENDIAN)
-       mux1    r21=r21,@rev            // swap because mini-os is in BE
-       mux1    r22=r22,@rev            // swap because mini-os is in BE
-       ;;
-#endif
        add     r19=TF_IPSR,r18
        add     r20=TF_IIP,r18
        ;;
@@ -129,10 +124,6 @@ ENTRY(save_tf_rse_switch)
        //bsw.1         // switch to bank 1 for saving these registers.
        movl r30=XSI_BANKNUM            // Switch to bank 1.
        mov r31=1;;
-#if defined(BIG_ENDIAN)
-       mux1    r31=r31,@rev            // swap because mini-os is in BE
-       ;;
-#endif
        st4 [r30]=r31
        ;;
        /*
@@ -143,38 +134,13 @@ ENTRY(save_tf_rse_switch)
        movl r30=XSI_BANK1_R16;
        movl r31=XSI_BANK1_R16+8;; 
        ld8 r16=[r30],16; ld8 r17=[r31],16;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r16=r16,@rev; mux1 r17=r17,@rev;;
-#endif
        ld8 r18=[r30],16; ld8 r19=[r31],16;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r18=r18,@rev; mux1 r19=r19,@rev;;
-#endif
        ld8 r20=[r30],16; ld8 r21=[r31],16;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r20=r20,@rev; mux1 r21=r21,@rev;;
-#endif
        ld8 r22=[r30],16; ld8 r23=[r31],16;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r22=r22,@rev; mux1 r23=r23,@rev;;
-#endif
        ld8 r24=[r30],16; ld8 r25=[r31],16;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r24=r24,@rev; mux1 r25=r25,@rev;;
-#endif
        ld8 r26=[r30],16; ld8 r27=[r31],16;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r26=r26,@rev; mux1 r27=r27,@rev;;
-#endif
        ld8 r28=[r30],16; ld8 r29=[r31],16;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r28=r28,@rev; mux1 r29=r29,@rev;;
-#endif
        ld8 r30=[r30]; ld8 r31=[r31];;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r30=r30,@rev; mux1 r31=r31,@rev;;
-#endif
-
        add     r2=TF_GREG16,r14
        add     r3=TF_GREG17,r14
        ;;
@@ -251,10 +217,6 @@ ENTRY(save_tf_rse_switch)
        ;;
        ld8     r21=[r8]
        ;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r21=r21,@rev
-       ;;
-#endif
        st8     [r19]=r21               // store cr.ifs
        dep.z   r22=r21,0,38            // copy ifm part from ifs.ifm
        ;;
@@ -330,12 +292,6 @@ ENTRY(restore_tf_rse_switch)
        ;;
        ld8     r21=[r19]               // load cr.ipsr
        ld8     r22=[r20]               // load cr.iip
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       ;;
-       mux1    r21=r21,@rev
-       mux1    r22=r22,@rev
-       ;;
-#endif
        movl    r16=XSI_IPSR            // XEN !!
        ;;
        st8     [r16]=r21,XSI_IIP_OFS-XSI_IPSR_OFS      // XEN.ipsr
@@ -353,9 +309,6 @@ ENTRY(restore_tf_rse_switch)
        ld8     r22=[r19]               // ndirty
        ;;
        shl     r21=r22,16              // value for ar.rsc
-       //mov   r19=(MOS_IA64_RSC_BE << IA64_RSC_BE)
-       ;;
-       or      r21=(MOS_IA64_RSC_BE << IA64_RSC_BE),r21
        ;;
        mov     ar.rsc=r21              // setup for loadrs
        ;;
@@ -386,10 +339,6 @@ ENTRY(restore_tf_rse_switch)
        ld8     r21=[r19]               // load ar.pfs
        ld8     r22=[r20]               // load cr.ifs
        ;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r22=r22,@rev
-       ;;
-#endif
        add     r19=TF_RSC,r18
        mov     ar.pfs=r21
        st8     [r16]=r22               // XEN.ifs
@@ -429,10 +378,6 @@ ENTRY(restore_tf_rse_switch)
        // bsw.1
        movl r30=XSI_BANKNUM            // Switch to bank 1.
        mov r31=1;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r31=r31,@rev
-       ;;
-#endif
        st4 [r30]=r31
        ;;
        add     r2=TF_GREG16,r14
@@ -455,51 +400,27 @@ ENTRY(restore_tf_rse_switch)
        movl r2=XSI_BANK1_R16
        movl r3=XSI_BANK1_R16+8
        ;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r16=r16,@rev; mux1 r17=r17,@rev;;
-#endif
        .mem.offset 0,0; st8.spill [r2]=r16,16
        .mem.offset 8,0; st8.spill [r3]=r17,16
        ;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r18=r18,@rev; mux1 r19=r19,@rev;;
-#endif
        .mem.offset 0,0; st8.spill [r2]=r18,16
        .mem.offset 8,0; st8.spill [r3]=r19,16
        ;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r20=r20,@rev; mux1 r21=r21,@rev;;
-#endif
        .mem.offset 0,0; st8.spill [r2]=r20,16
        .mem.offset 8,0; st8.spill [r3]=r21,16
        ;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r22=r22,@rev; mux1 r23=r23,@rev;;
-#endif
        .mem.offset 0,0; st8.spill [r2]=r22,16
        .mem.offset 8,0; st8.spill [r3]=r23,16
        ;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r24=r24,@rev; mux1 r25=r25,@rev;;
-#endif
        .mem.offset 0,0; st8.spill [r2]=r24,16
        .mem.offset 8,0; st8.spill [r3]=r25,16
        ;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r26=r26,@rev; mux1 r27=r27,@rev;;
-#endif
        .mem.offset 0,0; st8.spill [r2]=r26,16
        .mem.offset 8,0; st8.spill [r3]=r27,16
        ;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r28=r28,@rev; mux1 r29=r29,@rev;;
-#endif
        .mem.offset 0,0; st8.spill [r2]=r28,16
        .mem.offset 8,0; st8.spill [r3]=r29,16
        ;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r30=r30,@rev; mux1 r31=r31,@rev;;
-#endif
        .mem.offset 0,0; st8.spill [r2]=r30,16
        .mem.offset 8,0; st8.spill [r3]=r31,16
        ;;
@@ -567,17 +488,11 @@ ENTRY(save_special_regs)
        add     loc5=TF_IFA,in0
        add     loc6=TF_ISR,in0
        ;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    loc3=loc3,@rev; mux1 loc4=loc4,@rev;;
-#endif
        st8     [loc5]=loc3,TF_IIM-TF_IFA       // store cr.ifa
        st8     [loc6]=loc4                     // store cr.isr
        ;;
        ld8     loc3=[loc1]                     // load XEN.iim
        ;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    loc3=loc3,@rev;;
-#endif
        st8     [loc5]=loc3                     // store cr.iim
        ;;
        mov     ar.pfs=loc0
@@ -605,9 +520,6 @@ ENTRY(hypervisor_callback)
        mov     out0=r18                // the trap frame
        movl    r22=XSI_PSR_IC
        mov     r23=1;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r23=r23,@rev;;
-#endif
        st8     [r22]=r23               // ssm psr.ic
        ;;
        br.call.sptk.few rp = do_hypervisor_callback
@@ -649,9 +561,6 @@ ENTRY(trap_error)
        ;;
        movl r30=XSI_BANKNUM            // bsw.1
        mov r31=1;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r31=r31,@rev;;
-#endif
        st4 [r30]=r31;;
 
                /* Save extra interrupt registers to the trap frame. */
@@ -664,9 +573,6 @@ ENTRY(trap_error)
        ld8     r23=[r23]
        mov     r25=1
        ;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r25=r25,@rev; mux1 r23=r23,@rev;;
-#endif
        st4     [r22]=r25               // ssm psr.ic
        st1     [r23]=r0                // ssm psr.i
        ;;
@@ -680,10 +586,6 @@ ENTRY(trap_error)
        ld8 r23=[r23]
        mov r25=1
        ;;
-#if defined(BIG_ENDIAN)                        // swap because mini-os is in BE
-       mux1    r25=r25,@rev;;
-       mux1    r25=r25,@rev; mux1 r23=r23,@rev;;
-#endif
        st1 [r23]=r25
        st4 [r22]=r0            // note: clears both vpsr.i and vpsr.ic!
        ;;
index df616c8a507a3f6a2b23fa95db2c7bfea0e053a3..8b54c0f3946a5c88c7d4465336aca71a5d0dd83b 100644 (file)
@@ -52,7 +52,8 @@ SECTIONS
   .fini_array     : { *(.fini_array) }
   PROVIDE (__fini_array_end = .);
 
-  .ctors : {
+  .ctors : AT(ADDR(.ctors) - (((5<<(61))+0x100000000) - (1 << 20)))
+       {
         __CTOR_LIST__ = .;
         QUAD((__CTOR_END__ - __CTOR_LIST__) / 8 - 2)
         *(.ctors)
@@ -61,7 +62,8 @@ SECTIONS
         __CTOR_END__ = .;
         }
 
-  .dtors : {
+  .dtors : AT(ADDR(.dtors) - (((5<<(61))+0x100000000) - (1 << 20)))
+        {
         __DTOR_LIST__ = .;
         QUAD((__DTOR_END__ - __DTOR_LIST__) / 8 - 2)
         *(.dtors)
index 916e2f610bfd3f06622816b026829c469df8137c..c6aef8331f7154ac7a0c981a8f7ef7a669f12654 100644 (file)
@@ -153,7 +153,19 @@ map_frames_ex(unsigned long* frames, unsigned long n, unsigned long stride,
         ASSERT(n == 1 || (stride == 0 && increment == 1));
         ASSERT(id == DOMID_SELF);
         ASSERT(prot == 0);
-       return (void*) __va(SWAP(frames[0]) << PAGE_SHIFT);
+       return (void*) __va(frames[0] << PAGE_SHIFT);
+}
+
+int unmap_frames(unsigned long virt_addr, unsigned long num_frames)
+{  
+    /* TODO */
+    ASSERT(0);
+}
+
+unsigned long alloc_contig_pages(int order, unsigned int addr_bits)
+{
+    /* TODO */
+    ASSERT(0);
 }
 
 void arch_init_p2m(unsigned long max_pfn)
index d08705328771c593ff25856302936a59a9342b70..286e5be38c9b1abe0b7a959ef9c720a2bd36b2d4 100644 (file)
@@ -76,19 +76,19 @@ ia64_sal_init(struct sal_system_table *saltab)
                return;
        }
        p = (uint8_t *) (saltab + 1);
-       for (i = 0; i < SWAP(saltab->sal_entry_count); i++) {
-               switch (SWAP(*p)) {
+       for (i = 0; i < saltab->sal_entry_count; i++) {
+               switch (*p) {
                case SAL_DESC_ENTRYPOINT:               // 0
                {
                        struct sal_entrypoint_descriptor *dp;
 
                        dp = (struct sal_entrypoint_descriptor*)p;
                        ia64_pal_entry =
-                               IA64_PHYS_TO_RR7(SWAP(dp->sale_pal_proc));
+                               IA64_PHYS_TO_RR7(dp->sale_pal_proc);
                        PRINT_BV("  PAL Proc at 0x%lx\n", ia64_pal_entry);
                        sal_fdesc.func =
-                               IA64_PHYS_TO_RR7(SWAP(dp->sale_sal_proc));
-                       sal_fdesc.gp = IA64_PHYS_TO_RR7(SWAP(dp->sale_sal_gp));
+                               IA64_PHYS_TO_RR7(dp->sale_sal_proc);
+                       sal_fdesc.gp = IA64_PHYS_TO_RR7(dp->sale_sal_gp);
                        PRINT_BV("  SAL Proc at 0x%lx, GP at 0x%lx\n",
                                 sal_fdesc.func, sal_fdesc.gp);
                        ia64_sal_entry = (sal_entry_t *) &sal_fdesc;
index baf9096330c1bcc31d6abc416e99baa82c753770..6da27fb4d7fd4c6420539401c43806ad85e815cf 100644 (file)
@@ -197,15 +197,6 @@ calculate_frequencies(void)
        struct ia64_pal_result pal_res;
 
        pal_res = ia64_call_pal_static(PAL_FREQ_RATIOS, 0, 0, 0);
-       //sal_res = ia64_sal_call(SAL_FREQ_BASE, 0, 0, 0, 0, 0, 0, 0);
-#if defined(BIG_ENDIAN)
-//#warning calculate_frequencies TODO
-       /*
-        * I have to do an own function with switching psr.be!
-        * Currently it's running because it's a break into the hypervisor
-        * behind the call.!
-        */
-#endif
        sal_res = ia64_sal_entry(SAL_FREQ_BASE, 0, 0, 0, 0, 0, 0, 0);
 
        if (sal_res.sal_status == 0 && pal_res.pal_status == 0) {
@@ -260,9 +251,8 @@ init_time(void)
        if (efi_get_time(&tm)) {
                printk("  EFI-Time: %d.%d.%d   %d:%d:%d\n", tm.Day,
                       tm.Month, tm.Year, tm.Hour, tm.Minute, tm.Second);
-               os_time.tv_sec = _mktime(SWAP(tm.Year), SWAP(tm.Month),
-                                       SWAP(tm.Day), SWAP(tm.Hour),
-                                       SWAP(tm.Minute), SWAP(tm.Second));
+               os_time.tv_sec = _mktime(tm.Year, tm.Month,
+                                       tm.Day, tm.Hour, tm.Minute, tm.Second);
                os_time.tv_nsec = tm.Nanosecond;
        } else
                printk("efi_get_time() failed\n");
index 03d163cb9423cd414fbaec4100b5e8ac61e3bb7a..fccfee2f1e5c13c06cd2bffc4a7799c630e8cd83 100644 (file)
@@ -24,6 +24,8 @@
 
 
 #include <os.h>
+#include <mini-os/errno.h>
+#include <mini-os/lib.h>
 #include <hypervisor.h>
 #include <xen/xencomm.h>
 #include <xen/grant_table.h>
@@ -38,6 +40,7 @@ struct xencomm_mini
 
 #define xen_guest_handle(hnd)  ((hnd).p)
 
+struct xencomm_handle;
 
 /* Translate virtual address to physical address.  */
 uint64_t
@@ -52,6 +55,16 @@ xencomm_vaddr_to_paddr(uint64_t vaddr)
        return 0;
 }
 
+/* Inline version.  To be used only on linear space (kernel space).  */
+static struct xencomm_handle *
+xencomm_create_inline(void *buffer)
+{
+       unsigned long paddr;
+
+       paddr = xencomm_vaddr_to_paddr((unsigned long)buffer);
+       return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG);
+}
+
 #define min(a,b) (((a) < (b)) ? (a) : (b))
 static int
 xencomm_init_desc(struct xencomm_desc *desc, void *buffer, unsigned long bytes)
@@ -82,7 +95,7 @@ xencomm_init_desc(struct xencomm_desc *desc, void *buffer, unsigned long bytes)
                        return -EINVAL;
                }
 
-               desc->address[i++] = SWAP(paddr);
+               desc->address[i++] = paddr;
                recorded += chunksz;
        }
        if (recorded < bytes) {
@@ -93,8 +106,8 @@ xencomm_init_desc(struct xencomm_desc *desc, void *buffer, unsigned long bytes)
 
        /* mark remaining addresses invalid (just for safety) */
        while (i < desc->nr_addrs)
-               desc->address[i++] = SWAP(XENCOMM_INVALID);
-       desc->magic = SWAP(XENCOMM_MAGIC);
+               desc->address[i++] = XENCOMM_INVALID;
+       desc->magic = XENCOMM_MAGIC;
        return 0;
 }
 
@@ -171,15 +184,14 @@ xencommize_mini_grant_table_op(struct xencomm_mini *xc_area, int *nbr_area,
                        return -EINVAL;
                rc = xencomm_create_mini
                        (xc_area, nbr_area,
-                        (void*)SWAP((uint64_t)
-                                    xen_guest_handle(setup->frame_list)),
-                        SWAP(setup->nr_frames)
+                        (void*)(uint64_t) xen_guest_handle(setup->frame_list),
+                        setup->nr_frames
                         * sizeof(*xen_guest_handle(setup->frame_list)),
                         &desc1);
                if (rc)
                        return rc;
                set_xen_guest_handle(setup->frame_list,
-                                    (void *)SWAP((uint64_t)desc1));
+                                    (void *)(uint64_t)desc1);
                break;
        }
        case GNTTABOP_dump_table:
@@ -201,6 +213,14 @@ xencommize_mini_grant_table_op(struct xencomm_mini *xc_area, int *nbr_area,
        return rc;
 }
 
+static inline int
+xencomm_arch_hypercall_grant_table_op(unsigned int cmd,
+                                      struct xencomm_handle *uop,
+                                      unsigned int count)
+{
+       return _hypercall3(int, grant_table_op, cmd, uop, count);
+}
+
 int
 xencomm_mini_hypercall_grant_table_op(unsigned int cmd, void *op,
                                       unsigned int count)
@@ -263,8 +283,112 @@ HYPERVISOR_suspend(unsigned long srec)
 {
         struct sched_shutdown arg;
 
-        arg.reason = (uint32_t)SWAP((uint32_t)SHUTDOWN_suspend);
+        arg.reason = (uint32_t)SHUTDOWN_suspend;
 
         return xencomm_arch_hypercall_suspend(xencomm_create_inline(&arg));
 }
 
+int
+HYPERVISOR_event_channel_op(int cmd, void *arg)
+{
+       int rc;
+       struct xencomm_handle *newArg;
+
+       newArg = xencomm_create_inline(arg);
+       rc = _hypercall2(int, event_channel_op, cmd, newArg);
+       if (unlikely(rc == -ENOSYS)) {
+               struct evtchn_op op;
+
+               op.cmd = cmd;
+               memcpy(&op.u, arg, sizeof(op.u));
+               rc = _hypercall1(int, event_channel_op_compat, &op);
+       }
+       return rc;
+}
+
+static int
+xencomm_arch_xen_version(int cmd, struct xencomm_handle *arg)
+{
+       return _hypercall2(int, xen_version, cmd, arg);
+}
+
+static int
+xencomm_arch_xen_feature(int cmd, struct xencomm_handle *arg)
+{
+       struct xencomm_handle *newArg;
+
+       newArg = xencomm_create_inline(arg);
+       return _hypercall2(int, xen_version, cmd, newArg);
+}
+
+int
+HYPERVISOR_xen_version(int cmd, void *arg)
+{
+       switch(cmd) {
+               case XENVER_version:
+                       return xencomm_arch_xen_version(cmd, 0);
+               case XENVER_get_features:
+                       return xencomm_arch_xen_feature(cmd, arg);
+               default:
+                       return -1;
+       }
+}
+
+int
+HYPERVISOR_console_io(int cmd, int count, char *str)
+{
+       struct xencomm_handle *newStr;
+
+       newStr = xencomm_create_inline(str);
+       return _hypercall3(int, console_io, cmd, count, newStr);
+}
+
+int
+HYPERVISOR_sched_op_compat(int cmd, unsigned long arg)
+{
+       return _hypercall2(int, sched_op_compat, cmd, arg);
+}
+
+int
+HYPERVISOR_sched_op(int cmd, void *arg)
+{
+       struct xencomm_handle *newArg;
+
+       newArg = xencomm_create_inline(arg);
+       return _hypercall2(int, sched_op, cmd, newArg);
+}
+
+int
+HYPERVISOR_callback_op(int cmd, void *arg)
+{
+       struct xencomm_handle *newArg;
+
+       newArg = xencomm_create_inline(arg);
+       return _hypercall2(int, callback_op, cmd, newArg);
+}
+
+int
+HYPERVISOR_opt_feature(void *arg)
+{
+       struct xencomm_handle *new_arg;
+
+       new_arg = xencomm_create_inline(arg);
+
+       return _hypercall1(int, opt_feature, new_arg);
+}
+
+int
+HYPERVISOR_shutdown(unsigned int reason)
+{
+       struct sched_shutdown sched_shutdown = {
+               .reason = reason
+       };
+
+       int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
+
+       if (rc == -ENOSYS)
+               rc = HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason);
+
+       return rc;
+}
+
diff --git a/extras/mini-os/arch/x86/ioremap.c b/extras/mini-os/arch/x86/ioremap.c
new file mode 100644 (file)
index 0000000..d94f4e7
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2009,  Netronome Systems, Inc.
+ *                
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+#include <types.h>
+#include <lib.h>
+#include <xmalloc.h>
+#include <mm.h>
+#include <ioremap.h>
+
+/* Map a physical address range into virtual address space with provided
+ * flags. Return a virtual address range it is mapped to. */
+static void *__do_ioremap(unsigned long phys_addr, unsigned long size, 
+                          unsigned long prot)
+{
+    unsigned long va;
+    unsigned long mfns, mfn;
+    unsigned long num_pages, offset;
+    int i;
+
+    /* allow non page aligned addresses but for mapping we need to align them */
+    offset = (phys_addr & ~PAGE_MASK);
+    num_pages = (offset + size + PAGE_SIZE - 1) / PAGE_SIZE;
+    phys_addr &= PAGE_MASK;
+    mfns = mfn = phys_addr >> PAGE_SHIFT;
+    
+    /* sanity checks on list of MFNs */
+    for ( i = 0; i < num_pages; i++, mfn++ )
+    {
+        if ( mfn_is_ram(mfn) )
+        {
+            printk("ioremap: mfn 0x%ulx is RAM\n", mfn);
+            goto mfn_invalid;
+        }
+    }   
+    va = (unsigned long)map_frames_ex(&mfns, num_pages, 0, 1, 1,
+                                      DOMID_IO, 0, prot);
+    return (void *)(va + offset);
+    
+mfn_invalid:
+    return NULL;
+}
+
+void *ioremap(unsigned long phys_addr, unsigned long size)
+{
+    return __do_ioremap(phys_addr, size, IO_PROT);
+}
+
+void *ioremap_nocache(unsigned long phys_addr, unsigned long size)
+{
+    return __do_ioremap(phys_addr, size, IO_PROT_NOCACHE);
+}
+
+/* Un-map the io-remapped region. Currently no list of existing mappings is
+ * maintained, so the caller has to supply the size */
+void iounmap(void *virt_addr, unsigned long size)
+{   
+    unsigned long num_pages;
+    unsigned long va = (unsigned long)virt_addr;
+
+    /* work out number of frames to unmap */
+    num_pages = ((va & ~PAGE_MASK) + size + PAGE_SIZE - 1) / PAGE_SIZE;
+
+    unmap_frames(va & PAGE_MASK, num_pages);
+}
+
+
+
+/* -*-  Mode:C; c-basic-offset:4; tab-width:4 indent-tabs-mode:nil -*- */
index b1c0084fe7420288198fac63f45597be7cef239a..723fc567e9212696c0346228bcc9c93dc1047b2d 100644 (file)
 unsigned long *phys_to_machine_mapping;
 unsigned long mfn_zero;
 extern char stack[];
-extern void page_walk(unsigned long virt_addr);
+extern void page_walk(unsigned long va);
 
-void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, 
-                                unsigned long offset, unsigned long level)
+/*
+ * Make pt_pfn a new 'level' page table frame and hook it into the page
+ * table at offset in previous level MFN (pref_l_mfn). pt_pfn is a guest
+ * PFN.
+ */
+static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, 
+                         unsigned long offset, unsigned long level)
 {   
     pgentry_t *tab = (pgentry_t *)start_info.pt_base;
     unsigned long pt_page = (unsigned long)pfn_to_virt(*pt_pfn); 
     pgentry_t prot_e, prot_t;
     mmu_update_t mmu_updates[1];
+    int rc;
     
     prot_e = prot_t = 0;
-    DEBUG("Allocating new L%d pt frame for pt_pfn=%lx, "
-           "prev_l_mfn=%lx, offset=%lx", 
-           level, *pt_pfn, prev_l_mfn, offset);
+    DEBUG("Allocating new L%d pt frame for pfn=%lx, "
+          "prev_l_mfn=%lx, offset=%lx", 
+          level, *pt_pfn, prev_l_mfn, offset);
 
     /* We need to clear the page, otherwise we might fail to map it
        as a page table page */
@@ -74,56 +80,63 @@ void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn,
     switch ( level )
     {
     case L1_FRAME:
-         prot_e = L1_PROT;
-         prot_t = L2_PROT;
-         break;
+        prot_e = L1_PROT;
+        prot_t = L2_PROT;
+        break;
     case L2_FRAME:
-         prot_e = L2_PROT;
-         prot_t = L3_PROT;
-         break;
+        prot_e = L2_PROT;
+        prot_t = L3_PROT;
+        break;
 #if defined(__x86_64__)
     case L3_FRAME:
-         prot_e = L3_PROT;
-         prot_t = L4_PROT;
-         break;
+        prot_e = L3_PROT;
+        prot_t = L4_PROT;
+        break;
 #endif
     default:
-         printk("new_pt_frame() called with invalid level number %d\n", level);
-         do_exit();
-         break;
+        printk("new_pt_frame() called with invalid level number %d\n", level);
+        do_exit();
+        break;
     }
 
-    /* Update the entry */
+    /* Make PFN a page table page */
 #if defined(__x86_64__)
     tab = pte_to_virt(tab[l4_table_offset(pt_page)]);
 #endif
     tab = pte_to_virt(tab[l3_table_offset(pt_page)]);
 
     mmu_updates[0].ptr = (tab[l2_table_offset(pt_page)] & PAGE_MASK) + 
-                         sizeof(pgentry_t) * l1_table_offset(pt_page);
+        sizeof(pgentry_t) * l1_table_offset(pt_page);
     mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | 
-                         (prot_e & ~_PAGE_RW);
-    if(HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF) < 0)
+        (prot_e & ~_PAGE_RW);
+    
+    if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 )
     {
-         printk("PTE for new page table page could not be updated\n");
-         do_exit();
+        printk("ERROR: PTE for new page table page could not be updated\n");
+        printk("       mmu_update failed with rc=%d\n", rc);
+        do_exit();
     }
-                        
-    /* Now fill the new page table page with entries.
-       Update the page directory as well. */
-    mmu_updates[0].ptr = ((pgentry_t)prev_l_mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
+
+    /* Hook the new page table page into the hierarchy */
+    mmu_updates[0].ptr =
+        ((pgentry_t)prev_l_mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
     mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | prot_t;
-    if(HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF) < 0) 
+
+    if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 ) 
     {
-       printk("ERROR: mmu_update failed\n");
-       do_exit();
+        printk("ERROR: mmu_update failed with rc=%d\n", rc);
+        do_exit();
     }
 
     *pt_pfn += 1;
 }
 
-/* Checks if a pagetable frame is needed (if weren't allocated by Xen) */
-static int need_pt_frame(unsigned long virt_address, int level)
+/*
+ * Checks if a pagetable frame is needed at 'level' to map a given
+ * address. Note, this function is specific to the initial page table
+ * building.
+ */
+static int need_pt_frame(unsigned long va, int level)
 {
     unsigned long hyp_virt_start = HYPERVISOR_VIRT_START;
 #if defined(__x86_64__)
@@ -135,63 +148,71 @@ static int need_pt_frame(unsigned long virt_address, int level)
     /* In general frames will _not_ be needed if they were already
        allocated to map the hypervisor into our VA space */
 #if defined(__x86_64__)
-    if(level == L3_FRAME)
+    if ( level == L3_FRAME )
     {
-        if(l4_table_offset(virt_address) >= 
-           l4_table_offset(hyp_virt_start) &&
-           l4_table_offset(virt_address) <= 
-           l4_table_offset(hyp_virt_end))
+        if ( l4_table_offset(va) >= 
+             l4_table_offset(hyp_virt_start) &&
+             l4_table_offset(va) <= 
+             l4_table_offset(hyp_virt_end))
             return 0;
         return 1;
-    } else
+    } 
+    else
 #endif
 
-    if(level == L2_FRAME)
+    if ( level == L2_FRAME )
     {
 #if defined(__x86_64__)
-        if(l4_table_offset(virt_address) >= 
-           l4_table_offset(hyp_virt_start) &&
-           l4_table_offset(virt_address) <= 
-           l4_table_offset(hyp_virt_end))
+        if ( l4_table_offset(va) >= 
+             l4_table_offset(hyp_virt_start) &&
+             l4_table_offset(va) <= 
+             l4_table_offset(hyp_virt_end))
 #endif
-            if(l3_table_offset(virt_address) >= 
-               l3_table_offset(hyp_virt_start) &&
-               l3_table_offset(virt_address) <= 
-               l3_table_offset(hyp_virt_end))
+            if ( l3_table_offset(va) >= 
+                 l3_table_offset(hyp_virt_start) &&
+                 l3_table_offset(va) <= 
+                 l3_table_offset(hyp_virt_end))
                 return 0;
 
         return 1;
-    } else 
-
-    /* Always need l1 frames */
-    if(level == L1_FRAME)
-        return 1;
+    } 
+    else 
+        /* Always need l1 frames */
+        if ( level == L1_FRAME )
+            return 1;
 
     printk("ERROR: Unknown frame level %d, hypervisor %llx,%llx\n", 
-        level, hyp_virt_start, hyp_virt_end);
+           level, hyp_virt_start, hyp_virt_end);
     return -1;
 }
 
-void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn)
+/*
+ * Build the initial pagetable.
+ */
+static void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn)
 {
     unsigned long start_address, end_address;
     unsigned long pfn_to_map, pt_pfn = *start_pfn;
     static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
     pgentry_t *tab = (pgentry_t *)start_info.pt_base, page;
-    unsigned long mfn = pfn_to_mfn(virt_to_pfn(start_info.pt_base));
+    unsigned long pt_mfn = pfn_to_mfn(virt_to_pfn(start_info.pt_base));
     unsigned long offset;
     int count = 0;
+    int rc;
 
-    pfn_to_map = (start_info.nr_pt_frames - NOT_L1_FRAMES) * L1_PAGETABLE_ENTRIES;
+    pfn_to_map = 
+        (start_info.nr_pt_frames - NOT_L1_FRAMES) * L1_PAGETABLE_ENTRIES;
 
-    if (*max_pfn >= virt_to_pfn(HYPERVISOR_VIRT_START))
+    if ( *max_pfn >= virt_to_pfn(HYPERVISOR_VIRT_START) )
     {
         printk("WARNING: Mini-OS trying to use Xen virtual space. "
                "Truncating memory from %dMB to ",
-               ((unsigned long)pfn_to_virt(*max_pfn) - (unsigned long)&_text)>>20);
+               ((unsigned long)pfn_to_virt(*max_pfn) -
+                (unsigned long)&_text)>>20);
         *max_pfn = virt_to_pfn(HYPERVISOR_VIRT_START - PAGE_SIZE);
         printk("%dMB\n",
-               ((unsigned long)pfn_to_virt(*max_pfn) - (unsigned long)&_text)>>20);
+               ((unsigned long)pfn_to_virt(*max_pfn) - 
+                (unsigned long)&_text)>>20);
     }
 
     start_address = (unsigned long)pfn_to_virt(pfn_to_map);
@@ -200,49 +221,53 @@ void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn)
     /* We worked out the virtual memory range to map, now mapping loop */
     printk("Mapping memory range 0x%lx - 0x%lx\n", start_address, end_address);
 
-    while(start_address < end_address)
+    while ( start_address < end_address )
     {
         tab = (pgentry_t *)start_info.pt_base;
-        mfn = pfn_to_mfn(virt_to_pfn(start_info.pt_base));
+        pt_mfn = pfn_to_mfn(virt_to_pfn(start_info.pt_base));
 
 #if defined(__x86_64__)
         offset = l4_table_offset(start_address);
         /* Need new L3 pt frame */
-        if(!(start_address & L3_MASK)) 
-            if(need_pt_frame(start_address, L3_FRAME)
-                new_pt_frame(&pt_pfn, mfn, offset, L3_FRAME);
+        if ( !(start_address & L3_MASK) )
+        if ( need_pt_frame(start_address, L3_FRAME) )
+                new_pt_frame(&pt_pfn, pt_mfn, offset, L3_FRAME);
 
         page = tab[offset];
-        mfn = pte_to_mfn(page);
-        tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT);
+        pt_mfn = pte_to_mfn(page);
+        tab = to_virt(mfn_to_pfn(pt_mfn) << PAGE_SHIFT);
 #endif
         offset = l3_table_offset(start_address);
         /* Need new L2 pt frame */
-        if(!(start_address & L2_MASK))
-            if(need_pt_frame(start_address, L2_FRAME))
-                new_pt_frame(&pt_pfn, mfn, offset, L2_FRAME);
+        if ( !(start_address & L2_MASK) )
+            if ( need_pt_frame(start_address, L2_FRAME) )
+                new_pt_frame(&pt_pfn, pt_mfn, offset, L2_FRAME);
 
         page = tab[offset];
-        mfn = pte_to_mfn(page);
-        tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT);
+        pt_mfn = pte_to_mfn(page);
+        tab = to_virt(mfn_to_pfn(pt_mfn) << PAGE_SHIFT);
         offset = l2_table_offset(start_address);        
         /* Need new L1 pt frame */
-        if(!(start_address & L1_MASK))
-            if(need_pt_frame(start_address, L1_FRAME)) 
-                new_pt_frame(&pt_pfn, mfn, offset, L1_FRAME);
+        if ( !(start_address & L1_MASK) )
+            if ( need_pt_frame(start_address, L1_FRAME) )
+                new_pt_frame(&pt_pfn, pt_mfn, offset, L1_FRAME);
 
         page = tab[offset];
-        mfn = pte_to_mfn(page);
+        pt_mfn = pte_to_mfn(page);
         offset = l1_table_offset(start_address);
 
-        mmu_updates[count].ptr = ((pgentry_t)mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
-        mmu_updates[count].val = (pgentry_t)pfn_to_mfn(pfn_to_map++) << PAGE_SHIFT | L1_PROT;
+        mmu_updates[count].ptr =
+            ((pgentry_t)pt_mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
+        mmu_updates[count].val = 
+            (pgentry_t)pfn_to_mfn(pfn_to_map++) << PAGE_SHIFT | L1_PROT;
         count++;
-        if (count == L1_PAGETABLE_ENTRIES || pfn_to_map == *max_pfn)
+        if ( count == L1_PAGETABLE_ENTRIES || pfn_to_map == *max_pfn )
         {
-            if(HYPERVISOR_mmu_update(mmu_updates, count, NULL, DOMID_SELF) < 0)
+            rc = HYPERVISOR_mmu_update(mmu_updates, count, NULL, DOMID_SELF);
+            if ( rc < 0 )
             {
-                printk("PTE could not be updated\n");
+                printk("ERROR: build_pagetable(): PTE could not be updated\n");
+                printk("       mmu_update failed with rc=%d\n", rc);
                 do_exit();
             }
             count = 0;
@@ -253,20 +278,26 @@ void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn)
     *start_pfn = pt_pfn;
 }
 
+/*
+ * Mark portion of the address space read only.
+ */
 extern void shared_info;
 static void set_readonly(void *text, void *etext)
 {
-    unsigned long start_address = ((unsigned long) text + PAGE_SIZE - 1) & PAGE_MASK;
+    unsigned long start_address =
+        ((unsigned long) text + PAGE_SIZE - 1) & PAGE_MASK;
     unsigned long end_address = (unsigned long) etext;
     static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
     pgentry_t *tab = (pgentry_t *)start_info.pt_base, page;
     unsigned long mfn = pfn_to_mfn(virt_to_pfn(start_info.pt_base));
     unsigned long offset;
     int count = 0;
+    int rc;
 
     printk("setting %p-%p readonly\n", text, etext);
 
-    while (start_address + PAGE_SIZE <= end_address) {
+    while ( start_address + PAGE_SIZE <= end_address )
+    {
         tab = (pgentry_t *)start_info.pt_base;
         mfn = pfn_to_mfn(virt_to_pfn(start_info.pt_base));
 
@@ -287,20 +318,25 @@ static void set_readonly(void *text, void *etext)
 
         offset = l1_table_offset(start_address);
 
-       if (start_address != (unsigned long)&shared_info) {
-           mmu_updates[count].ptr = ((pgentry_t)mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
-           mmu_updates[count].val = tab[offset] & ~_PAGE_RW;
-           count++;
-       } else
-           printk("skipped %p\n", start_address);
+        if ( start_address != (unsigned long)&shared_info )
+        {
+            mmu_updates[count].ptr = 
+                ((pgentry_t)mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
+            mmu_updates[count].val = tab[offset] & ~_PAGE_RW;
+            count++;
+        }
+        else
+            printk("skipped %p\n", start_address);
 
         start_address += PAGE_SIZE;
 
-        if (count == L1_PAGETABLE_ENTRIES || start_address + PAGE_SIZE > end_address)
+        if ( count == L1_PAGETABLE_ENTRIES || 
+             start_address + PAGE_SIZE > end_address )
         {
-            if(HYPERVISOR_mmu_update(mmu_updates, count, NULL, DOMID_SELF) < 0)
+            rc = HYPERVISOR_mmu_update(mmu_updates, count, NULL, DOMID_SELF);
+            if ( rc < 0 )
             {
-                printk("PTE could not be updated\n");
+                printk("ERROR: set_readonly(): PTE could not be updated\n");
                 do_exit();
             }
             count = 0;
@@ -308,41 +344,73 @@ static void set_readonly(void *text, void *etext)
     }
 
     {
-       mmuext_op_t op = {
-           .cmd = MMUEXT_TLB_FLUSH_ALL,
-       };
-       int count;
-       HYPERVISOR_mmuext_op(&op, 1, &count, DOMID_SELF);
+        mmuext_op_t op = {
+            .cmd = MMUEXT_TLB_FLUSH_ALL,
+        };
+        int count;
+        HYPERVISOR_mmuext_op(&op, 1, &count, DOMID_SELF);
     }
 }
 
-void mem_test(unsigned long *start_add, unsigned long *end_add)
+/*
+ * A useful mem testing function. Write the address to every address in the
+ * range provided and read back the value. If verbose, print page walk to
+ * some VA
+ * 
+ * If we get MEM_TEST_MAX_ERRORS we might as well stop
+ */
+#define MEM_TEST_MAX_ERRORS 10 
+int mem_test(unsigned long *start_va, unsigned long *end_va, int verbose)
 {
     unsigned long mask = 0x10000;
     unsigned long *pointer;
-
-    for(pointer = start_add; pointer < end_add; pointer++)
+    int error_count = 0;
+    /* write values and print page walks */
+    if ( verbose && (((unsigned long)start_va) & 0xfffff) )
+    {
+        printk("MemTest Start: 0x%lx\n", start_va);
+        page_walk((unsigned long)start_va);
+    }
+    for ( pointer = start_va; pointer < end_va; pointer++ )
     {
-        if(!(((unsigned long)pointer) & 0xfffff))
+        if ( verbose && !(((unsigned long)pointer) & 0xfffff) )
         {
             printk("Writing to %lx\n", pointer);
             page_walk((unsigned long)pointer);
         }
         *pointer = (unsigned long)pointer & ~mask;
     }
-
-    for(pointer = start_add; pointer < end_add; pointer++)
+    if ( verbose && (((unsigned long)end_va) & 0xfffff) )
     {
-        if(((unsigned long)pointer & ~mask) != *pointer)
+        printk("MemTest End: %lx\n", end_va-1);
+        page_walk((unsigned long)end_va-1);
+    }
+    /* verify values */
+    for ( pointer = start_va; pointer < end_va; pointer++ )
+    {
+        if ( ((unsigned long)pointer & ~mask) != *pointer )
+        {
             printk("Read error at 0x%lx. Read: 0x%lx, should read 0x%lx\n",
-                (unsigned long)pointer, 
-                *pointer, 
-                ((unsigned long)pointer & ~mask));
+                   (unsigned long)pointer, *pointer, 
+                   ((unsigned long)pointer & ~mask));
+            error_count++;
+            if ( error_count >= MEM_TEST_MAX_ERRORS )
+            {
+                printk("mem_test: too many errors\n");
+                return -1;
+            }
+        }
     }
-
+    return 0;
 }
 
-static pgentry_t *get_pgt(unsigned long addr)
+
+/*
+ * get the PTE for virtual address va if it exists. Otherwise NULL.
+ */
+static pgentry_t *get_pgt(unsigned long va)
 {
     unsigned long mfn;
     pgentry_t *tab;
@@ -352,67 +420,78 @@ static pgentry_t *get_pgt(unsigned long addr)
     mfn = virt_to_mfn(start_info.pt_base);
 
 #if defined(__x86_64__)
-    offset = l4_table_offset(addr);
-    if (!(tab[offset] & _PAGE_PRESENT))
+    offset = l4_table_offset(va);
+    if ( !(tab[offset] & _PAGE_PRESENT) )
         return NULL;
     mfn = pte_to_mfn(tab[offset]);
     tab = mfn_to_virt(mfn);
 #endif
-    offset = l3_table_offset(addr);
-    if (!(tab[offset] & _PAGE_PRESENT))
+    offset = l3_table_offset(va);
+    if ( !(tab[offset] & _PAGE_PRESENT) )
         return NULL;
     mfn = pte_to_mfn(tab[offset]);
     tab = mfn_to_virt(mfn);
-    offset = l2_table_offset(addr);
-    if (!(tab[offset] & _PAGE_PRESENT))
+    offset = l2_table_offset(va);
+    if ( !(tab[offset] & _PAGE_PRESENT) )
         return NULL;
     mfn = pte_to_mfn(tab[offset]);
     tab = mfn_to_virt(mfn);
-    offset = l1_table_offset(addr);
+    offset = l1_table_offset(va);
     return &tab[offset];
 }
 
-pgentry_t *need_pgt(unsigned long addr)
+
+/*
+ * return a valid PTE for a given virtual address. If PTE does not exist,
+ * allocate page-table pages.
+ */
+pgentry_t *need_pgt(unsigned long va)
 {
-    unsigned long mfn;
+    unsigned long pt_mfn;
     pgentry_t *tab;
     unsigned long pt_pfn;
     unsigned offset;
 
     tab = (pgentry_t *)start_info.pt_base;
-    mfn = virt_to_mfn(start_info.pt_base);
+    pt_mfn = virt_to_mfn(start_info.pt_base);
 
 #if defined(__x86_64__)
-    offset = l4_table_offset(addr);
-    if (!(tab[offset] & _PAGE_PRESENT)) {
+    offset = l4_table_offset(va);
+    if ( !(tab[offset] & _PAGE_PRESENT) )
+    {
         pt_pfn = virt_to_pfn(alloc_page());
-        new_pt_frame(&pt_pfn, mfn, offset, L3_FRAME);
+        new_pt_frame(&pt_pfn, pt_mfn, offset, L3_FRAME);
     }
     ASSERT(tab[offset] & _PAGE_PRESENT);
-    mfn = pte_to_mfn(tab[offset]);
-    tab = mfn_to_virt(mfn);
+    pt_mfn = pte_to_mfn(tab[offset]);
+    tab = mfn_to_virt(pt_mfn);
 #endif
-    offset = l3_table_offset(addr);
-    if (!(tab[offset] & _PAGE_PRESENT)) {
+    offset = l3_table_offset(va);
+    if ( !(tab[offset] & _PAGE_PRESENT) ) 
+    {
         pt_pfn = virt_to_pfn(alloc_page());
-        new_pt_frame(&pt_pfn, mfn, offset, L2_FRAME);
+        new_pt_frame(&pt_pfn, pt_mfn, offset, L2_FRAME);
     }
     ASSERT(tab[offset] & _PAGE_PRESENT);
-    mfn = pte_to_mfn(tab[offset]);
-    tab = mfn_to_virt(mfn);
-    offset = l2_table_offset(addr);
-    if (!(tab[offset] & _PAGE_PRESENT)) {
+    pt_mfn = pte_to_mfn(tab[offset]);
+    tab = mfn_to_virt(pt_mfn);
+    offset = l2_table_offset(va);
+    if ( !(tab[offset] & _PAGE_PRESENT) )
+    {
         pt_pfn = virt_to_pfn(alloc_page());
-       new_pt_frame(&pt_pfn, mfn, offset, L1_FRAME);
+        new_pt_frame(&pt_pfn, pt_mfn, offset, L1_FRAME);
     }
     ASSERT(tab[offset] & _PAGE_PRESENT);
-    mfn = pte_to_mfn(tab[offset]);
-    tab = mfn_to_virt(mfn);
+    pt_mfn = pte_to_mfn(tab[offset]);
+    tab = mfn_to_virt(pt_mfn);
 
-    offset = l1_table_offset(addr);
+    offset = l1_table_offset(va);
     return &tab[offset];
 }
 
+/*
+ * Reserve an area of virtual address space for mappings and Heap
+ */
 static unsigned long demand_map_area_start;
 #ifdef __x86_64__
 #define DEMAND_MAP_PAGES ((128ULL << 30) / PAGE_SIZE)
@@ -420,7 +499,9 @@ static unsigned long demand_map_area_start;
 #define DEMAND_MAP_PAGES ((2ULL << 30) / PAGE_SIZE)
 #endif
 
-#ifdef HAVE_LIBC
+#ifndef HAVE_LIBC
+#define HEAP_PAGES 0
+#else
 unsigned long heap, brk, heap_mapped, heap_end;
 #ifdef __x86_64__
 #define HEAP_PAGES ((128ULL << 30) / PAGE_SIZE)
@@ -435,7 +516,8 @@ void arch_init_demand_mapping_area(unsigned long cur_pfn)
 
     demand_map_area_start = (unsigned long) pfn_to_virt(cur_pfn);
     cur_pfn += DEMAND_MAP_PAGES;
-    printk("Demand map pfns at %lx-%lx.\n", demand_map_area_start, pfn_to_virt(cur_pfn));
+    printk("Demand map pfns at %lx-%lx.\n", 
+           demand_map_area_start, pfn_to_virt(cur_pfn));
 
 #ifdef HAVE_LIBC
     cur_pfn++;
@@ -446,111 +528,359 @@ void arch_init_demand_mapping_area(unsigned long cur_pfn)
 #endif
 }
 
-#define MAP_BATCH ((STACK_SIZE / 2) / sizeof(mmu_update_t))
-void do_map_frames(unsigned long addr,
-        unsigned long *f, unsigned long n, unsigned long stride,
-       unsigned long increment, domid_t id, int may_fail, unsigned long prot)
-{
-    pgentry_t *pgt = NULL;
-    unsigned long done = 0;
-    unsigned long i;
-    int rc;
-
-    while (done < n) {
-       unsigned long todo;
-
-       if (may_fail)
-           todo = 1;
-       else
-           todo = n - done;
-
-       if (todo > MAP_BATCH)
-               todo = MAP_BATCH;
-
-       {
-           mmu_update_t mmu_updates[todo];
-
-           for (i = 0; i < todo; i++, addr += PAGE_SIZE, pgt++) {
-                if (!pgt || !(addr & L1_MASK))
-                    pgt = need_pgt(addr);
-               mmu_updates[i].ptr = virt_to_mach(pgt);
-               mmu_updates[i].val = ((pgentry_t)(f[(done + i) * stride] + (done + i) * increment) << PAGE_SHIFT) | prot;
-           }
-
-           rc = HYPERVISOR_mmu_update(mmu_updates, todo, NULL, id);
-           if (rc < 0) {
-               if (may_fail)
-                   f[done * stride] |= 0xF0000000;
-               else {
-                   printk("Map %ld (%lx, ...) at %p failed: %d.\n", todo, f[done * stride] + done * increment, addr, rc);
-                    do_exit();
-               }
-           }
-       }
-
-       done += todo;
-    }
-}
-
 unsigned long allocate_ondemand(unsigned long n, unsigned long alignment)
 {
     unsigned long x;
     unsigned long y = 0;
 
     /* Find a properly aligned run of n contiguous frames */
-    for (x = 0; x <= DEMAND_MAP_PAGES - n; x = (x + y + 1 + alignment - 1) & ~(alignment - 1)) {
+    for ( x = 0;
+          x <= DEMAND_MAP_PAGES - n; 
+          x = (x + y + 1 + alignment - 1) & ~(alignment - 1) )
+    {
         unsigned long addr = demand_map_area_start + x * PAGE_SIZE;
         pgentry_t *pgt = get_pgt(addr);
-        for (y = 0; y < n; y++, addr += PAGE_SIZE) {
-            if (!(addr & L1_MASK))
+        for ( y = 0; y < n; y++, addr += PAGE_SIZE ) 
+        {
+            if ( !(addr & L1_MASK) )
                 pgt = get_pgt(addr);
-            if (pgt) {
-                if (*pgt & _PAGE_PRESENT)
+            if ( pgt )
+            {
+                if ( *pgt & _PAGE_PRESENT )
                     break;
                 pgt++;
             }
         }
-        if (y == n)
+        if ( y == n )
             break;
     }
-    if (y != n) {
+    if ( y != n )
+    {
         printk("Failed to find %ld frames!\n", n);
         return 0;
     }
     return demand_map_area_start + x * PAGE_SIZE;
 }
 
-void *map_frames_ex(unsigned long *f, unsigned long n, unsigned long stride,
-       unsigned long increment, unsigned long alignment, domid_t id,
-       int may_fail, unsigned long prot)
+/*
+ * Map an array of MFNs contiguously into virtual address space starting at
+ * va. map f[i*stride]+i*increment for i in 0..n-1.
+ */
+#define MAP_BATCH ((STACK_SIZE / 2) / sizeof(mmu_update_t))
+void do_map_frames(unsigned long va,
+                   unsigned long *mfns, unsigned long n, 
+                   unsigned long stride, unsigned long incr, 
+                   domid_t id, int may_fail,
+                   unsigned long prot)
 {
-    unsigned long addr = allocate_ondemand(n, alignment);
+    pgentry_t *pgt = NULL;
+    unsigned long done = 0;
+    unsigned long i;
+    int rc;
 
-    if (!addr)
+    if ( !mfns ) 
+    {
+        printk("do_map_frames: no mfns supplied\n");
+        return;
+    }
+    DEBUG("va=%p n=0x%lx, mfns[0]=0x%lx stride=0x%lx incr=0x%lx prot=0x%lx\n",
+          va, n, mfns[0], stride, incr, prot);
+    while ( done < n )
+    {
+        unsigned long todo;
+
+        if ( may_fail )
+            todo = 1;
+        else
+            todo = n - done;
+
+        if ( todo > MAP_BATCH )
+            todo = MAP_BATCH;
+
+        {
+            mmu_update_t mmu_updates[todo];
+
+            for ( i = 0; i < todo; i++, va += PAGE_SIZE, pgt++) 
+            {
+                if ( !pgt || !(va & L1_MASK) )
+                    pgt = need_pgt(va);
+                
+                mmu_updates[i].ptr = virt_to_mach(pgt) | MMU_NORMAL_PT_UPDATE;
+                mmu_updates[i].val = ((pgentry_t)(mfns[(done + i) * stride] +
+                                                  (done + i) * incr)
+                                      << PAGE_SHIFT) | prot;
+            }
+
+            rc = HYPERVISOR_mmu_update(mmu_updates, todo, NULL, id);
+            if ( rc < 0 )
+            {
+                if (may_fail)
+                    mfns[done * stride] |= 0xF0000000;
+                else {
+                    printk("Map %ld (%lx, ...) at %p failed: %d.\n",
+                           todo, mfns[done * stride] + done * incr, va, rc);
+                    do_exit();
+                }
+            }
+        }
+        done += todo;
+    }
+}
+
+/*
+ * Map an array of MFNs contiguous into virtual address space. Virtual
+ * addresses are allocated from the on demand area.
+ */
+void *map_frames_ex(unsigned long *mfns, unsigned long n, 
+                    unsigned long stride, unsigned long incr,
+                    unsigned long alignment,
+                    domid_t id, int may_fail, unsigned long prot)
+{
+    unsigned long va = allocate_ondemand(n, alignment);
+
+    if ( !va )
         return NULL;
 
-    /* Found it at x.  Map it in. */
-    do_map_frames(addr, f, n, stride, increment, id, may_fail, prot);
+    do_map_frames(va, mfns, n, stride, incr, id, may_fail, prot);
+
+    return (void *)va;
+}
+
+/*
+ * Unmap nun_frames frames mapped at virtual address va.
+ */
+#define UNMAP_BATCH ((STACK_SIZE / 2) / sizeof(multicall_entry_t))
+int unmap_frames(unsigned long va, unsigned long num_frames)
+{
+    int n = UNMAP_BATCH;
+    multicall_entry_t call[n];
+    int ret;
+    int i;
+
+    ASSERT(!((unsigned long)va & ~PAGE_MASK));
+
+    DEBUG("va=%p, num=0x%lx\n", va, num_frames);
+
+    while ( num_frames ) {
+        if ( n > num_frames )
+            n = num_frames;
+
+        for ( i = 0; i < n; i++ )
+        {
+            int arg = 0;
+            /* simply update the PTE for the VA and invalidate TLB */
+            call[i].op = __HYPERVISOR_update_va_mapping;
+            call[i].args[arg++] = va;
+            call[i].args[arg++] = 0;
+#ifdef __i386__
+            call[i].args[arg++] = 0;
+#endif  
+            call[i].args[arg++] = UVMF_INVLPG;
+
+            va += PAGE_SIZE;
+        }
+
+        ret = HYPERVISOR_multicall(call, n);
+        if ( ret )
+        {
+            printk("update_va_mapping hypercall failed with rc=%d.\n", ret);
+            return -ret;
+        }
 
-    return (void *)addr;
+        for ( i = 0; i < n; i++ )
+        {
+            if ( call[i].result ) 
+            {
+                printk("update_va_mapping failed for entry %d with rc=%d.\n",
+                       i, (int)call[i].result);
+                return -(call[i].result);
+            }
+        }
+        num_frames -= n;
+    }
+    return 0;
 }
 
+/*
+ * Allocate pages which are contiguous in machine memory.
+ * Returns a VA to where they are mapped or 0 on failure.
+ * 
+ * addr_bits indicates if the region has restrictions on where it is
+ * located. Typical values are 32 (if for example PCI devices can't access
+ * 64bit memory) or 0 for no restrictions.
+ *
+ * Allocated pages can be freed using the page allocators free_pages() 
+ * function.
+ *
+ * based on Linux function xen_create_contiguous_region()
+ */
+#define MAX_CONTIG_ORDER 9 /* 2MB */
+unsigned long alloc_contig_pages(int order, unsigned int addr_bits)
+{
+    unsigned long in_va, va;
+    unsigned long in_frames[1UL << order], out_frames, mfn;
+    multicall_entry_t call[1UL << order];
+    unsigned int i, num_pages = 1UL << order;
+    int ret, exch_success;
+
+    /* pass in num_pages 'extends' of size 1 and
+     * request 1 extend of size 'order */
+    struct xen_memory_exchange exchange = {
+        .in = {
+            .nr_extents   = num_pages,
+            .extent_order = 0,
+            .domid        = DOMID_SELF
+        },
+        .out = {
+            .nr_extents   = 1,
+            .extent_order = order,
+            .address_bits = addr_bits,
+            .domid        = DOMID_SELF
+        },
+        .nr_exchanged = 0
+    };
+
+    if ( order > MAX_CONTIG_ORDER )
+    {
+        printk("alloc_contig_pages: order too large 0x%x > 0x%x\n",
+               order, MAX_CONTIG_ORDER);
+        return 0;
+    }
+
+    /* Allocate some potentially discontiguous pages */
+    in_va = alloc_pages(order);
+    if ( !in_va )
+    {
+        printk("alloc_contig_pages: could not get enough pages (order=0x%x)\n",
+               order);
+        return 0;
+    }
+
+    /* set up arguments for exchange hyper call */
+    set_xen_guest_handle(exchange.in.extent_start, in_frames);
+    set_xen_guest_handle(exchange.out.extent_start, &out_frames);
+
+    /* unmap current frames, keep a list of MFNs */
+    for ( i = 0; i < num_pages; i++ )
+    {
+        int arg = 0;
+
+        va = in_va + (PAGE_SIZE * i);
+        in_frames[i] = virt_to_mfn(va);
+
+        /* update P2M mapping */
+        phys_to_machine_mapping[virt_to_pfn(va)] = INVALID_P2M_ENTRY;
+
+        /* build multi call */
+        call[i].op = __HYPERVISOR_update_va_mapping;
+        call[i].args[arg++] = va;
+        call[i].args[arg++] = 0;
+#ifdef __i386__
+        call[i].args[arg++] = 0;
+#endif  
+        call[i].args[arg++] = UVMF_INVLPG;
+    }
+
+    ret = HYPERVISOR_multicall(call, i);
+    if ( ret )
+    {
+        printk("Odd, update_va_mapping hypercall failed with rc=%d.\n", ret);
+        return 0;
+    }
+
+    /* try getting a contig range of MFNs */
+    out_frames = virt_to_pfn(in_va); /* PFNs to populate */
+    ret = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+    if ( ret ) {
+        printk("mem exchanged order=0x%x failed with rc=%d, nr_exchanged=%d\n", 
+               order, ret, exchange.nr_exchanged);
+        /* we still need to return the allocated pages above to the pool
+         * ie. map them back into the 1:1 mapping etc. so we continue but 
+         * in the end return the pages to the page allocator and return 0. */
+        exch_success = 0;
+    }
+    else
+        exch_success = 1;
+
+    /* map frames into 1:1 and update p2m */
+    for ( i = 0; i < num_pages; i++ )
+    {
+        int arg = 0;
+        pte_t pte;
+
+        va = in_va + (PAGE_SIZE * i);
+        mfn = i < exchange.nr_exchanged ? (out_frames + i) : in_frames[i];
+        pte = __pte(mfn << PAGE_SHIFT | L1_PROT);
+
+        /* update P2M mapping */
+        phys_to_machine_mapping[virt_to_pfn(va)] = mfn;
+
+        /* build multi call */
+        call[i].op = __HYPERVISOR_update_va_mapping;
+        call[i].args[arg++] = va;
+#ifdef __x86_64__
+        call[i].args[arg++] = (pgentry_t)pte.pte;
+#else
+        call[i].args[arg++] = pte.pte_low;
+        call[i].args[arg++] = pte.pte_high;
+#endif  
+        call[i].args[arg++] = UVMF_INVLPG;
+    }
+    ret = HYPERVISOR_multicall(call, i);
+    if ( ret )
+    {
+        printk("update_va_mapping hypercall no. 2 failed with rc=%d.\n", ret);
+        return 0;
+    }
+
+    if ( !exch_success )
+    {
+        /* since the exchanged failed we just free the pages as well */
+        free_pages((void *) in_va, order);
+        return 0;
+    }
+    
+    return in_va;
+}
+
+/*
+ * Check if a given MFN refers to real memory
+ */
+static long system_ram_end_mfn;
+int mfn_is_ram(unsigned long mfn)
+{
+    /* very crude check if a given MFN is memory or not. Probably should
+     * make this a little more sophisticated ;) */
+    return (mfn <= system_ram_end_mfn) ? 1 : 0;
+}
+
+
+/*
+ * Clear some of the bootstrap memory
+ */
 static void clear_bootstrap(void)
 {
     pte_t nullpte = { };
+    int rc;
 
     /* Use first page as the CoW zero page */
     memset(&_text, 0, PAGE_SIZE);
     mfn_zero = virt_to_mfn((unsigned long) &_text);
-    if (HYPERVISOR_update_va_mapping(0, nullpte, UVMF_INVLPG))
-       printk("Unable to unmap NULL page\n");
+    if ( (rc = HYPERVISOR_update_va_mapping(0, nullpte, UVMF_INVLPG)) )
+        printk("Unable to unmap NULL page. rc=%d\n", rc);
 }
 
 void arch_init_p2m(unsigned long max_pfn)
 {
+#ifdef __x86_64__
 #define L1_P2M_SHIFT    9
 #define L2_P2M_SHIFT    18    
 #define L3_P2M_SHIFT    27    
+#else
+#define L1_P2M_SHIFT    10
+#define L2_P2M_SHIFT    20    
+#define L3_P2M_SHIFT    30    
+#endif
 #define L1_P2M_ENTRIES  (1 << L1_P2M_SHIFT)    
 #define L2_P2M_ENTRIES  (1 << (L2_P2M_SHIFT - L1_P2M_SHIFT))    
 #define L3_P2M_ENTRIES  (1 << (L3_P2M_SHIFT - L2_P2M_SHIFT))    
@@ -562,19 +892,19 @@ void arch_init_p2m(unsigned long max_pfn)
     unsigned long pfn;
     
     l3_list = (unsigned long *)alloc_page(); 
-    for(pfn=0; pfn<max_pfn; pfn++)
+    for ( pfn=0; pfn<max_pfn; pfn++ )
     {
-        if(!(pfn % (L1_P2M_ENTRIES * L2_P2M_ENTRIES)))
+        if ( !(pfn % (L1_P2M_ENTRIES * L2_P2M_ENTRIES)) )
         {
             l2_list = (unsigned long*)alloc_page();
-            if((pfn >> L3_P2M_SHIFT) > 0)
+            if ( (pfn >> L3_P2M_SHIFT) > 0 )
             {
                 printk("Error: Too many pfns.\n");
                 do_exit();
             }
             l3_list[(pfn >> L2_P2M_SHIFT)] = virt_to_mfn(l2_list);  
         }
-        if(!(pfn % (L1_P2M_ENTRIES)))
+        if ( !(pfn % (L1_P2M_ENTRIES)) )
         {
             l1_list = (unsigned long*)alloc_page();
             l2_list[(pfn >> L1_P2M_SHIFT) & L2_P2M_MASK] = 
@@ -590,28 +920,40 @@ void arch_init_p2m(unsigned long max_pfn)
 
 void arch_init_mm(unsigned long* start_pfn_p, unsigned long* max_pfn_p)
 {
-
     unsigned long start_pfn, max_pfn;
 
-    printk("  _text:        %p\n", &_text);
-    printk("  _etext:       %p\n", &_etext);
-    printk("  _erodata:     %p\n", &_erodata);
-    printk("  _edata:       %p\n", &_edata);
-    printk("  stack start:  %p\n", stack);
-    printk("  _end:         %p\n", &_end);
+    printk("      _text: %p(VA)\n", &_text);
+    printk("     _etext: %p(VA)\n", &_etext);
+    printk("   _erodata: %p(VA)\n", &_erodata);
+    printk("     _edata: %p(VA)\n", &_edata);
+    printk("stack start: %p(VA)\n", stack);
+    printk("       _end: %p(VA)\n", &_end);
 
     /* First page follows page table pages and 3 more pages (store page etc) */
     start_pfn = PFN_UP(to_phys(start_info.pt_base)) + 
-                start_info.nr_pt_frames + 3;
+        start_info.nr_pt_frames + 3;
     max_pfn = start_info.nr_pages;
-   
-    printk("  start_pfn:    %lx\n", start_pfn);
-    printk("  max_pfn:      %lx\n", max_pfn);
+
+    /* We need room for demand mapping and heap, clip available memory */
+#if defined(__i386__)
+    {
+        unsigned long virt_pfns = 1 + DEMAND_MAP_PAGES + 1 + HEAP_PAGES;
+        if (max_pfn + virt_pfns >= 0x100000)
+            max_pfn = 0x100000 - virt_pfns - 1;
+    }
+#endif
+
+    printk("  start_pfn: %lx\n", start_pfn);
+    printk("    max_pfn: %lx\n", max_pfn);
 
     build_pagetable(&start_pfn, &max_pfn);
     clear_bootstrap();
     set_readonly(&_text, &_erodata);
 
+    /* get the number of physical pages the system has. Used to check for
+     * system memory. */
+    system_ram_end_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
+
     *start_pfn_p = start_pfn;
     *max_pfn_p = max_pfn;
 }
index 3671fee332389d8078bd644d70e1fdf4f2787c5e..8509079d3eeadb05faa76f650a415b8145db2aee 100644 (file)
@@ -63,10 +63,12 @@ void failsafe_callback(void);
 static
 shared_info_t *map_shared_info(unsigned long pa)
 {
-       if ( HYPERVISOR_update_va_mapping(
-               (unsigned long)shared_info, __pte(pa | 7), UVMF_INVLPG) )
+    int rc;
+
+       if ( (rc = HYPERVISOR_update_va_mapping(
+              (unsigned long)shared_info, __pte(pa | 7), UVMF_INVLPG)) )
        {
-               printk("Failed to map shared_info!!\n");
+               printk("Failed to map shared_info!! rc=%d\n", rc);
                do_exit();
        }
        return (shared_info_t *)shared_info;
index 95ad4e4ed508405d7259c3fac7fb04911af9141f..9ca78d0016dadb6c327c5b3a509fdd49b5f5dd27 100644 (file)
@@ -42,19 +42,23 @@ void unbind_all_ports(void)
     int cpu = 0;
     shared_info_t *s = HYPERVISOR_shared_info;
     vcpu_info_t   *vcpu_info = &s->vcpu_info[cpu];
+    int rc;
 
-    for (i = 0; i < NR_EVS; i++)
+    for ( i = 0; i < NR_EVS; i++ )
     {
-        if (i == start_info.console.domU.evtchn ||
-            i == start_info.store_evtchn)
+        if ( i == start_info.console.domU.evtchn ||
+             i == start_info.store_evtchn)
             continue;
-        if (test_and_clear_bit(i, bound_ports))
+
+        if ( test_and_clear_bit(i, bound_ports) )
         {
             struct evtchn_close close;
             printk("port %d still bound!\n", i);
             mask_evtchn(i);
             close.port = i;
-            HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
+            rc = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
+            if ( rc )
+                printk("WARN: close_port %d failed rc=%d. ignored\n", i, rc);
             clear_evtchn(i);
         }
     }
@@ -71,8 +75,9 @@ int do_event(evtchn_port_t port, struct pt_regs *regs)
 
     clear_evtchn(port);
 
-    if (port >= NR_EVS) {
-        printk("Port number too large: %d\n", port);
+    if ( port >= NR_EVS )
+    {
+        printk("WARN: do_event(): Port number too large: %d\n", port);
         return 1;
     }
 
@@ -89,9 +94,9 @@ int do_event(evtchn_port_t port, struct pt_regs *regs)
 evtchn_port_t bind_evtchn(evtchn_port_t port, evtchn_handler_t handler,
                                                  void *data)
 {
-       if(ev_actions[port].handler != default_handler)
+       if ( ev_actions[port].handler != default_handler )
         printk("WARN: Handler for port %d already registered, replacing\n",
-                               port);
+               port);
 
        ev_actions[port].data = data;
        wmb();
@@ -104,8 +109,9 @@ evtchn_port_t bind_evtchn(evtchn_port_t port, evtchn_handler_t handler,
 void unbind_evtchn(evtchn_port_t port )
 {
        struct evtchn_close close;
+    int rc;
 
-       if (ev_actions[port].handler == default_handler)
+       if ( ev_actions[port].handler == default_handler )
                printk("WARN: No handler for port %d when unbinding\n", port);
        mask_evtchn(port);
        clear_evtchn(port);
@@ -116,37 +122,43 @@ void unbind_evtchn(evtchn_port_t port )
        clear_bit(port, bound_ports);
 
        close.port = port;
-       HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
+       rc = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
+    if ( rc )
+        printk("WARN: close_port %s failed rc=%d. ignored\n", port, rc);
+        
 }
 
 evtchn_port_t bind_virq(uint32_t virq, evtchn_handler_t handler, void *data)
 {
        evtchn_bind_virq_t op;
+    int rc;
 
        /* Try to bind the virq to a port */
        op.virq = virq;
        op.vcpu = smp_processor_id();
 
-       if ( HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &op) != 0 )
+       if ( (rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &op)) != 0 )
        {
-               printk("Failed to bind virtual IRQ %d\n", virq);
+               printk("Failed to bind virtual IRQ %d with rc=%d\n", virq, rc);
                return -1;
     }
     bind_evtchn(op.port, handler, data);
        return op.port;
 }
 
-evtchn_port_t bind_pirq(uint32_t pirq, int will_share, evtchn_handler_t handler, void *data)
+evtchn_port_t bind_pirq(uint32_t pirq, int will_share,
+                        evtchn_handler_t handler, void *data)
 {
        evtchn_bind_pirq_t op;
+    int rc;
 
        /* Try to bind the pirq to a port */
        op.pirq = pirq;
        op.flags = will_share ? BIND_PIRQ__WILL_SHARE : 0;
 
-       if ( HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &op) != 0 )
+       if ( (rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &op)) != 0 )
        {
-               printk("Failed to bind physical IRQ %d\n", pirq);
+               printk("Failed to bind physical IRQ %d with rc=%d\n", pirq, rc);
                return -1;
        }
        bind_evtchn(op.port, handler, data);
@@ -173,7 +185,8 @@ void init_events(void)
     asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
     wrmsrl(0xc0000101, &cpu0_pda); /* 0xc0000101 is MSR_GS_BASE */
     cpu0_pda.irqcount = -1;
-    cpu0_pda.irqstackptr = (void*) (((unsigned long)irqstack + 2 * STACK_SIZE) & ~(STACK_SIZE - 1));
+    cpu0_pda.irqstackptr = (void*) (((unsigned long)irqstack + 2 * STACK_SIZE)
+                                    & ~(STACK_SIZE - 1));
 #endif
     /* initialize event handler */
     for ( i = 0; i < NR_EVS; i++ )
@@ -207,15 +220,19 @@ void default_handler(evtchn_port_t port, struct pt_regs *regs, void *ignore)
 int evtchn_alloc_unbound(domid_t pal, evtchn_handler_t handler,
                                                 void *data, evtchn_port_t *port)
 {
-    int err;
+    int rc;
+
     evtchn_alloc_unbound_t op;
     op.dom = DOMID_SELF;
     op.remote_dom = pal;
-    err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &op);
-    if (err)
-               return err;
+    rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &op);
+    if ( rc )
+    {
+        printk("ERROR: alloc_unbound failed with rc=%d", rc);
+               return rc;
+    }
     *port = bind_evtchn(op.port, handler, data);
-    return err;
+    return rc;
 }
 
 /* Connect to a port so as to allow the exchange of notifications with
@@ -225,15 +242,28 @@ int evtchn_bind_interdomain(domid_t pal, evtchn_port_t remote_port,
                            evtchn_handler_t handler, void *data,
                            evtchn_port_t *local_port)
 {
-    int err;
+    int rc;
     evtchn_port_t port;
     evtchn_bind_interdomain_t op;
     op.remote_dom = pal;
     op.remote_port = remote_port;
-    err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, &op);
-    if (err)
-               return err;
+    rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, &op);
+    if ( rc )
+    {
+        printk("ERROR: bind_interdomain failed with rc=%d", rc);
+               return rc;
+    }
     port = op.local_port;
     *local_port = bind_evtchn(port, handler, data);
-    return err;
+    return rc;
 }
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
index 1cecab0a142166d637fd6694ebcb0328460f02eb..b72fadaacbf75fe02ad0e9f98df331d8b2fdfca4 100644 (file)
@@ -869,18 +869,6 @@ moretodo:
     in_irq = 0;
 }
 
-/* Small utility function to figure out our domain id */
-static domid_t get_self_id(void)
-{
-    char *dom_id;
-    domid_t ret; 
-
-    BUG_ON(xenbus_read(XBT_NIL, "domid", &dom_id));
-    sscanf(dom_id, "%d", &ret);
-
-    return ret;
-}
-
 static void alloc_request_table(struct fs_import *import)
 {
     struct fs_request *requests;
@@ -1066,7 +1054,7 @@ static int init_fs_import(struct fs_import *import)
     unmask_evtchn(import->local_port);
 
     
-    self_id = get_self_id(); 
+    self_id = xenbus_get_self_id(); 
     /* Write the frontend info to a node in our Xenbus */
     sprintf(nodename, "/local/domain/%d/device/vfs/%d", 
                         self_id, import->import_id);
index 618ff82abca320f1a61c3f07ba8325efb2af0254..ba5aa165171c70b206164e7d5d868e871d512295 100644 (file)
@@ -427,11 +427,10 @@ atomic_readandclear_64(volatile uint64_t* p)
 static inline void
 set_bit(int num, volatile void *addr)
 {
-       uint32_t bit, b, old, new;
+       uint32_t bit, old, new;
        volatile uint32_t *p;
        p = (volatile uint32_t *) addr + (num >> 5);
-       b = 1 << (num & 31);
-       bit = SWAP(b);
+       bit = 1 << (num & 31);
        do
        {
                old = *p;
@@ -442,11 +441,10 @@ set_bit(int num, volatile void *addr)
 static __inline__ void
 clear_bit(int num, volatile void *addr)
 {
-       uint32_t mask, m,  old, new;
+       uint32_t mask, old, new;
        volatile uint32_t *p;
        p = (volatile uint32_t *) addr + (num >> 5);
-       m = ~(1 << (num & 31));
-       mask = SWAP(m);
+       mask = ~(1 << (num & 31));
        do {
                old = *p;
                new = old & mask;
@@ -456,7 +454,7 @@ clear_bit(int num, volatile void *addr)
 static __inline__ int
 test_bit(int num, const volatile void *addr)
 {
-       uint32_t val = SWAP(1);
+       uint32_t val = 1;
         return val & (((const volatile uint32_t *) addr)[num >> 5] >> (num & 31));
 }
 
@@ -468,12 +466,11 @@ test_bit(int num, const volatile void *addr)
 static inline int
 test_and_set_bit (int num, volatile void *addr)
 {
-        uint32_t bit, b, old, new;
+        uint32_t bit, old, new;
         volatile uint32_t *m;
 
         m = (volatile uint32_t *) addr + (num >> 5);
-        b = 1 << (num & 31);
-        bit = SWAP(b);
+        bit = 1 << (num & 31);
         do {
                 old = *m;
                 new = old | bit;
@@ -489,12 +486,11 @@ test_and_set_bit (int num, volatile void *addr)
 static
 inline int test_and_clear_bit(int num, volatile unsigned long * addr)
 {
-        uint32_t bit, b, old, new;
+        uint32_t bit, old, new;
         volatile uint32_t* a;
 
         a = (volatile uint32_t *) addr + (num >> 5);
-        b = ~(1 << (num & 31));
-        bit = SWAP(b);
+        bit = ~(1 << (num & 31));
         do {
                 old = *a;
                 new = old & bit;
index dbab3d55bf5c4b4720b104e9f8e62602aa1fca3a..06ddc8c774fb0ce5ada52f35245bcf086f81c5ba 100644 (file)
@@ -34,8 +34,6 @@
 #ifndef __HYPERCALL_H__
 #define __HYPERCALL_H__
 
-#include <mini-os/lib.h>       /* memcpy() */
-#include <mini-os/errno.h>     /* ENOSYS() */
 #include <xen/event_channel.h>
 #include <xen/sched.h>
 #include <xen/version.h>
@@ -114,123 +112,24 @@ extern unsigned long __hypercall(unsigned long a1, unsigned long a2,
 })
 
 
-extern unsigned long xencomm_vaddr_to_paddr(unsigned long vaddr);
-struct xencomm_handle;
-
-/* Inline version.  To be used only on linear space (kernel space).  */
-static inline struct xencomm_handle *
-xencomm_create_inline(void *buffer)
-{
-       unsigned long paddr;
-
-       paddr = xencomm_vaddr_to_paddr((unsigned long)buffer);
-       return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG);
-}
-
-static inline int
-xencomm_arch_event_channel_op(int cmd, void *arg)
-{
-       int rc;
-       struct xencomm_handle *newArg;
-
-       newArg = xencomm_create_inline(arg);
-       rc = _hypercall2(int, event_channel_op, cmd, newArg);
-       if (unlikely(rc == -ENOSYS)) {
-               struct evtchn_op op;
-
-               op.cmd = SWAP(cmd);
-               memcpy(&op.u, arg, sizeof(op.u));
-               rc = _hypercall1(int, event_channel_op_compat, &op);
-       }
-       return rc;
-}
-#define HYPERVISOR_event_channel_op xencomm_arch_event_channel_op
-
-static inline int
-xencomm_arch_xen_version(int cmd, struct xencomm_handle *arg)
-{
-       return _hypercall2(int, xen_version, cmd, arg);
-}
-
-static inline int
-xencomm_arch_xen_feature(int cmd, struct xencomm_handle *arg)
-{
-       struct xencomm_handle *newArg;
-
-       newArg = xencomm_create_inline(arg);
-       return _hypercall2(int, xen_version, cmd, newArg);
-}
-
-static inline int
-HYPERVISOR_xen_version(int cmd, void *arg)
-{
-       switch(cmd) {
-               case XENVER_version:
-                       return xencomm_arch_xen_version(cmd, 0);
-               case XENVER_get_features:
-                       return xencomm_arch_xen_feature(cmd, arg);
-               default:
-                       return -1;
-       }
-}
-
-static inline int
-xencomm_arch_console_io(int cmd, int count, char *str)
-{
-       struct xencomm_handle *newStr;
-
-       newStr = xencomm_create_inline(str);
-       return _hypercall3(int, console_io, cmd, count, newStr);
-}
-
-
-#define HYPERVISOR_console_io xencomm_arch_console_io
-
-static inline int
-HYPERVISOR_sched_op_compat(int cmd, unsigned long arg)
-{
-       return _hypercall2(int, sched_op_compat, cmd, arg);
-}
-
-static inline int
-xencomm_arch_sched_op(int cmd, void *arg)
-{
-       struct xencomm_handle *newArg;
-
-       newArg = xencomm_create_inline(arg);
-       return _hypercall2(int, sched_op, cmd, newArg);
-}
-
-#define HYPERVISOR_sched_op xencomm_arch_sched_op
-
-static inline int
-xencomm_arch_callback_op(int cmd, void *arg)
-{
-       struct xencomm_handle *newArg;
-
-       newArg = xencomm_create_inline(arg);
-       return _hypercall2(int, callback_op, cmd, newArg);
-}
-#define HYPERVISOR_callback_op xencomm_arch_callback_op
-
-static inline int
-xencomm_arch_hypercall_grant_table_op(unsigned int cmd,
-                                      struct xencomm_handle *uop,
-                                      unsigned int count)
-{
-       return _hypercall3(int, grant_table_op, cmd, uop, count);
-}
+int HYPERVISOR_event_channel_op(int cmd, void *arg);
+
+int HYPERVISOR_xen_version(int cmd, void *arg);
+
+int HYPERVISOR_console_io(int cmd, int count, char *str);
+
+int HYPERVISOR_sched_op_compat(int cmd, unsigned long arg);
+
+int HYPERVISOR_sched_op(int cmd, void *arg);
+
+int HYPERVISOR_callback_op(int cmd, void *arg);
 
 int HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count);
 
-static inline int
-HYPERVISOR_opt_feature(void *arg)
-{
-       struct xencomm_handle *new_arg;
+int HYPERVISOR_opt_feature(void *arg);
 
-       new_arg = xencomm_create_inline(arg);
+int HYPERVISOR_suspend(unsigned long srec);
 
-       return _hypercall1(int, opt_feature, new_arg);
-}
+int HYPERVISOR_shutdown(unsigned int reason);
 
 #endif /* __HYPERCALL_H__ */
index f9ebf7329f66bf6b8c22f0e2e0df007e3c5f1eca..3e2e9147edbfc3d370c99b7b54e86e8d5bab4ceb 100644 (file)
 #define IA64_PSR_IA            0x0000200000000000
 
 
-/* Endianess of mini-os. */
-#if defined(BIG_ENDIAN)
-#define MOS_IA64_PSR_BE        IA64_PSR_BE
-#else
-#define MOS_IA64_PSR_BE        0
-#endif
-
 #define STARTUP_PSR (IA64_PSR_IT | IA64_PSR_PK | \
-                    IA64_PSR_DT | IA64_PSR_RT | MOS_IA64_PSR_BE | \
+                    IA64_PSR_DT | IA64_PSR_RT | \
                     IA64_PSR_BN | IA64_PSR_CPL_KERN | IA64_PSR_AC)
 
 #define MOS_SYS_PSR (IA64_PSR_IC | IA64_PSR_I | IA64_PSR_IT | \
-                    IA64_PSR_DT | IA64_PSR_RT | MOS_IA64_PSR_BE | \
+                    IA64_PSR_DT | IA64_PSR_RT | \
                     IA64_PSR_BN | IA64_PSR_CPL_KERN | IA64_PSR_AC)
 
 #define MOS_USR_PSR (IA64_PSR_IC | IA64_PSR_I | IA64_PSR_IT | \
-                    IA64_PSR_DT | IA64_PSR_RT | MOS_IA64_PSR_BE | \
+                    IA64_PSR_DT | IA64_PSR_RT | \
                     IA64_PSR_BN | IA64_PSR_CPL_USER | IA64_PSR_AC)
 
 /*
 #define IA64_DCR_MBZ1_V                0xffffffffffffULL
 
 
-       /* Endianess of DCR register. */
-#if defined(BIG_ENDIAN)
-#define MOS_IA64_DCR_BE        (1 << IA64_DCR_BE)
-#else
-#define MOS_IA64_DCR_BE        (0 << IA64_DCR_BE)
-#endif
-
-#define IA64_DCR_DEFAULT (MOS_IA64_DCR_BE)
+#define IA64_DCR_DEFAULT (IA64_DCR_BE)
 
 /*
  * Vector numbers for various ia64 interrupts.
 #define IA64_RSC_MODE_LI (0x2)                 /* Load intensive */
 #define IA64_RSC_MODE_EA (0x3)                 /* Eager */
 
-/* RSE endian mode. */
-#if defined(BIG_ENDIAN)
-#define MOS_IA64_RSC_BE        1               /* Big endian rse. */
-#else
-#define MOS_IA64_RSC_BE        0               /* Little endian rse. */
-#endif
-
-#define IA64_RSE_EAGER ((IA64_RSC_MODE_EA<<IA64_RSC_MODE) |    \
-                          (MOS_IA64_RSC_BE << IA64_RSC_BE)     )
-
-#define IA64_RSE_LAZY ((IA64_RSC_MODE_LY<<IA64_RSC_MODE) |     \
-                          (MOS_IA64_RSC_BE << IA64_RSC_BE)     )
+#define IA64_RSE_EAGER (IA64_RSC_MODE_EA<<IA64_RSC_MODE)
+#define IA64_RSE_LAZY (IA64_RSC_MODE_LY<<IA64_RSC_MODE)
 
 
 
@@ -719,19 +695,6 @@ typedef struct trap_frame trap_frame_t;
  */
 typedef struct
 {
-#if defined(BIG_ENDIAN)
-       uint64_t pte_ig :11;    /* bits 53..63 */
-       uint64_t pte_ed :1;     /* bits 52..52 */
-       uint64_t pte_rv2:2;     /* bits 50..51 */
-       uint64_t pte_ppn:38;    /* bits 12..49 */
-       uint64_t pte_ar :3;     /* bits 9..11 */
-       uint64_t pte_pl :2;     /* bits 7..8 */
-       uint64_t pte_d  :1;     /* bits 6..6 */
-       uint64_t pte_a  :1;     /* bits 5..5 */
-       uint64_t pte_ma :3;     /* bits 2..4 */
-       uint64_t pte_rv1:1;     /* bits 1..1 */
-       uint64_t pte_p  :1;     /* bits 0..0 */
-#else
        uint64_t pte_p  :1;     /* bits 0..0 */
        uint64_t pte_rv1:1;     /* bits 1..1 */
        uint64_t pte_ma :3;     /* bits 2..4 */
@@ -743,7 +706,6 @@ typedef struct
        uint64_t pte_rv2:2;     /* bits 50..51 */
        uint64_t pte_ed :1;     /* bits 52..52 */
        uint64_t pte_ig :11;    /* bits 53..63 */
-#endif
 } ia64_pte_t;
 
 
index 2cbfa424b721b275d088944268ba073bd825caae..9a79e9bdc592036d6ec1384c6e478794bab8d774 100644 (file)
@@ -28,7 +28,6 @@
 #if !defined(__ASSEMBLY__)
 
 #include <mini-os/types.h>
-#include "endian.h"
 #include "ia64_cpu.h"
 #include "atomic.h"
 #include "efi.h"
@@ -192,21 +191,6 @@ __synch_cmpxchg(volatile void *ptr, uint64_t old, uint64_t new, int size)
 
 extern shared_info_t *HYPERVISOR_shared_info;
 
-static inline int
-HYPERVISOR_shutdown(unsigned int reason)
-{
-       struct sched_shutdown sched_shutdown = {
-               .reason = reason
-       };
-
-       int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
-
-       if (rc == -ENOSYS)
-               rc = HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason);
-
-       return rc;
-}
-
 
 /*
  * This code is from the originally os.h and should be put in a
@@ -225,7 +209,7 @@ HYPERVISOR_shutdown(unsigned int reason)
 do {                                                                   \
        vcpu_info_t *_vcpu;                                             \
        _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \
-       _vcpu->evtchn_upcall_mask = SWAP(1);                            \
+       _vcpu->evtchn_upcall_mask = 1;                                  \
        barrier();                                                      \
 } while (0)
 
@@ -236,7 +220,7 @@ do {                                                                        \
        _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \
        _vcpu->evtchn_upcall_mask = 0;                                  \
        barrier(); /* unmask then check (avoid races) */                \
-       if (unlikely(SWAP(_vcpu->evtchn_upcall_pending)))               \
+       if (unlikely(_vcpu->evtchn_upcall_pending))                     \
                force_evtchn_callback();                                \
 } while (0)
 
@@ -244,7 +228,7 @@ do {                                                                        \
 do {                                                                   \
        vcpu_info_t *_vcpu;                                             \
        _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \
-       (x) = SWAP(_vcpu->evtchn_upcall_mask);                          \
+       (x) = _vcpu->evtchn_upcall_mask;                                \
 } while (0)
 
 #define __restore_flags(x)                                             \
@@ -254,7 +238,7 @@ do {                                                                        \
        _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \
        if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {                   \
                barrier(); /* unmask then check (avoid races) */        \
-               if ( unlikely(SWAP(_vcpu->evtchn_upcall_pending)) )     \
+               if ( unlikely(_vcpu->evtchn_upcall_pending) )           \
                        force_evtchn_callback();                        \
        }\
 } while (0)
@@ -265,8 +249,8 @@ do {                                                                        \
 do {                                                                   \
        vcpu_info_t *_vcpu;                                             \
        _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \
-       (x) = SWAP(_vcpu->evtchn_upcall_mask);                          \
-       _vcpu->evtchn_upcall_mask = SWAP(1);                            \
+       (x) = _vcpu->evtchn_upcall_mask;                                \
+       _vcpu->evtchn_upcall_mask = 1;                                  \
        barrier();                                                      \
 } while (0)
 
@@ -277,7 +261,7 @@ do {                                                                        \
 #define local_irq_enable()     __sti()
 
 #define irqs_disabled()                        \
-       SWAP(HYPERVISOR_shared_info->vcpu_info[smp_processor_id()].evtchn_upcall_mask)
+       (HYPERVISOR_shared_info->vcpu_info[smp_processor_id()].evtchn_upcall_mask)
 
 /* This is a barrier for the compiler only, NOT the processor! */
 #define barrier() __asm__ __volatile__("": : :"memory")
diff --git a/extras/mini-os/include/ioremap.h b/extras/mini-os/include/ioremap.h
new file mode 100644 (file)
index 0000000..7f246e3
--- /dev/null
@@ -0,0 +1,33 @@
+/**
+ * Copyright (C) 2009 Netronome Systems, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef _IOREMAP_H_
+#define _IOREMAP_H_
+
+void *ioremap(unsigned long phys_addr, unsigned long size);
+void *ioremap_nocache(unsigned long phys_addr, unsigned long size);
+void iounmap(void *virt_addr, unsigned long size);
+
+#endif /* _IOREMAP_H_ */
+
+/* -*-  Mode:C; c-basic-offset:4; tab-width:4 indent-tabs-mode:nil -*- */
index 32ce7d93f6cba0d5817f0d5cc65ee7bc1f6544af..18622d85422e0f20c91d0595a63563df7cdab779 100644 (file)
@@ -71,6 +71,8 @@ void *map_frames_ex(unsigned long *f, unsigned long n, unsigned long stride,
 void do_map_frames(unsigned long addr,
         unsigned long *f, unsigned long n, unsigned long stride,
        unsigned long increment, domid_t id, int may_fail, unsigned long prot);
+int unmap_frames(unsigned long va, unsigned long num_frames);
+unsigned long alloc_contig_pages(int order, unsigned int addr_bits);
 #ifdef HAVE_LIBC
 extern unsigned long heap, brk, heap_mapped, heap_end;
 #endif
diff --git a/extras/mini-os/include/posix/net/if.h b/extras/mini-os/include/posix/net/if.h
new file mode 100644 (file)
index 0000000..5be77d4
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * This code is mostly taken from NetBSD net/if.h 
+ * Changes: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
+ *
+ ******************************************************************************
+ *
+ * Copyright (c) 1999, 2000, 2001 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by William Studenmund and Jason R. Thorpe.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *      The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#ifndef _NET_IF_H_
+#define _NET_IF_H_
+
+/*
+ * Length of interface external name, including terminating '\0'.
+ * Note: this is the same size as a generic device's external name.
+ */
+#define IF_NAMESIZE 16
+
+struct if_nameindex {
+        unsigned int    if_index;       /* 1, 2, ... */
+        char            *if_name;       /* null terminated name: "le0", ... */
+};
+
+unsigned int if_nametoindex(const char *);
+char *  if_indextoname(unsigned int, char *);
+struct  if_nameindex * if_nameindex(void);
+void    if_freenameindex(struct if_nameindex *);
+
+#endif /* !_NET_IF_H_ */
+
index 3359439b55d5f62fc5d273bd98f5f744839b4642..c60e61e94ae2c6510e23a464cada3eddfc750939 100644 (file)
@@ -48,8 +48,9 @@ struct thread* create_thread(char *name, void (*function)(void *), void *data);
 void exit_thread(void) __attribute__((noreturn));
 void schedule(void);
 
+#ifdef __INSIDE_MINIOS__
 #define current get_current()
-
+#endif
 
 void wake(struct thread *thread);
 void block(struct thread *thread);
index 14e98ba755df5099c423a3e2652eb7ed8c39b1a7..10b9f29b078d47d099321bd1aab6e18064192630 100644 (file)
@@ -7,7 +7,7 @@
 
 #define DEFINE_WAIT(name)                               \
 struct wait_queue name = {                              \
-    .thread       = current,                            \
+    .thread       = get_current(),                            \
     .thread_list  = MINIOS_LIST_HEAD_INIT((name).thread_list), \
 }
 
@@ -53,7 +53,7 @@ static inline void wake_up(struct wait_queue_head *head)
     unsigned long flags;        \
     local_irq_save(flags);      \
     add_wait_queue(&wq, &w);    \
-    block(current);             \
+    block(get_current());       \
     local_irq_restore(flags);   \
 } while (0)
 
@@ -74,8 +74,8 @@ static inline void wake_up(struct wait_queue_head *head)
         /* protect the list */                                  \
         local_irq_save(flags);                                  \
         add_wait_queue(&wq, &__wait);                           \
-        current->wakeup_time = deadline;                        \
-        clear_runnable(current);                                \
+        get_current()->wakeup_time = deadline;                  \
+        clear_runnable(get_current());                          \
         local_irq_restore(flags);                               \
         if((condition) || (deadline && NOW() >= deadline))      \
             break;                                              \
@@ -83,7 +83,7 @@ static inline void wake_up(struct wait_queue_head *head)
     }                                                           \
     local_irq_save(flags);                                      \
     /* need to wake up */                                       \
-    wake(current);                                              \
+    wake(get_current());                                        \
     remove_wait_queue(&__wait);                                 \
     local_irq_restore(flags);                                   \
 } while(0) 
index 4a04812891be334cd35f842c12c6f735c768c314..786064bf89375af25398dbe01309bb14bfdae5be 100644 (file)
@@ -133,6 +133,13 @@ typedef unsigned long pgentry_t;
 #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
 #endif /* __i386__ || __x86_64__ */
 
+/* flags for ioremap */
+#define IO_PROT (L1_PROT)
+#define IO_PROT_NOCACHE (L1_PROT | _PAGE_PCD)
+
+/* for P2M */
+#define INVALID_P2M_ENTRY (~0UL)
+
 #include "arch_limits.h"
 #define PAGE_SIZE       __PAGE_SIZE
 #define PAGE_SHIFT      __PAGE_SHIFT
@@ -222,5 +229,6 @@ static __inline__ paddr_t machine_to_phys(maddr_t machine)
 #define do_map_zero(start, n) do_map_frames(start, &mfn_zero, n, 0, 0, DOMID_SELF, 0, L1_PROT_RO)
 
 pgentry_t *need_pgt(unsigned long addr);
+int mfn_is_ram(unsigned long mfn);
 
 #endif /* _ARCH_MM_H_ */
index e8de09f282cd27023f3d9825778956613a5afbc2..30a1c081432c8c4f00ea142bfcf4c79f99469e45 100644 (file)
@@ -91,6 +91,9 @@ char* xenbus_printf(xenbus_transaction_t xbt,
                                   const char* fmt, ...)
                    __attribute__((__format__(printf, 4, 5)));
 
+/* Utility function to figure out our domain id */
+domid_t xenbus_get_self_id(void);
+
 /* Reset the XenBus system. */
 void fini_xenbus(void);
 
index 877168c750ed9e8cc860304424ad5775559dbe23..bdd866462b011f671b0e7be8613829b831e0b1ed 100644 (file)
@@ -434,25 +434,25 @@ static void kbdfront_thread(void *p)
 
 static struct pcifront_dev *pci_dev;
 
-static void pcifront_thread(void *p)
+static void print_pcidev(unsigned int domain, unsigned int bus, unsigned int slot, unsigned int fun)
 {
-    void print(unsigned int domain, unsigned int bus, unsigned int slot, unsigned int fun)
-    {
-        unsigned int vendor, device, rev, class;
+    unsigned int vendor, device, rev, class;
 
-        pcifront_conf_read(pci_dev, domain, bus, slot, fun, 0x00, 2, &vendor);
-        pcifront_conf_read(pci_dev, domain, bus, slot, fun, 0x02, 2, &device);
-        pcifront_conf_read(pci_dev, domain, bus, slot, fun, 0x08, 1, &rev);
-        pcifront_conf_read(pci_dev, domain, bus, slot, fun, 0x0a, 2, &class);
+    pcifront_conf_read(pci_dev, domain, bus, slot, fun, 0x00, 2, &vendor);
+    pcifront_conf_read(pci_dev, domain, bus, slot, fun, 0x02, 2, &device);
+    pcifront_conf_read(pci_dev, domain, bus, slot, fun, 0x08, 1, &rev);
+    pcifront_conf_read(pci_dev, domain, bus, slot, fun, 0x0a, 2, &class);
 
-        printk("%04x:%02x:%02x.%02x %04x: %04x:%04x (rev %02x)\n", domain, bus, slot, fun, class, vendor, device, rev);
-    }
+    printk("%04x:%02x:%02x.%02x %04x: %04x:%04x (rev %02x)\n", domain, bus, slot, fun, class, vendor, device, rev);
+}
 
+static void pcifront_thread(void *p)
+{
     pci_dev = init_pcifront(NULL);
     if (!pci_dev)
         return;
     printk("PCI devices:\n");
-    pcifront_scan(pci_dev, print);
+    pcifront_scan(pci_dev, print_pcidev);
 }
 
 static void fs_thread(void *p)
@@ -490,14 +490,16 @@ void start_kernel(start_info_t *si)
 
     /* print out some useful information  */
     printk("Xen Minimal OS!\n");
-    printk("start_info:   %p\n",    si);
-    printk("  nr_pages:   %lu",     si->nr_pages);
-    printk("  shared_inf: %08lx\n", si->shared_info);
-    printk("  pt_base:    %p",      (void *)si->pt_base); 
-    printk("  mod_start:  0x%lx\n", si->mod_start);
-    printk("  mod_len:    %lu\n",   si->mod_len); 
-    printk("  flags:      0x%x\n",  (unsigned int)si->flags);
-    printk("  cmd_line:   %s\n",  
+    printk("  start_info: %p(VA)\n", si);
+    printk("    nr_pages: 0x%lx\n", si->nr_pages);
+    printk("  shared_inf: 0x%08lx(MA)\n", si->shared_info);
+    printk("     pt_base: %p(VA)\n", (void *)si->pt_base); 
+    printk("nr_pt_frames: 0x%lx\n", si->nr_pt_frames);
+    printk("    mfn_list: %p(VA)\n", (void *)si->mfn_list); 
+    printk("   mod_start: 0x%lx(VA)\n", si->mod_start);
+    printk("     mod_len: %lu\n", si->mod_len); 
+    printk("       flags: 0x%x\n", (unsigned int)si->flags);
+    printk("    cmd_line: %s\n",  
            si->cmd_line ? (const char *)si->cmd_line : "NULL");
 
     /* Set up events. */
index 34e4fb66666543cbea9e8696879b05dc1be2140f..7c5f05cd41de0fcee3c0d5a735e0d2bd79083435 100644 (file)
@@ -34,6 +34,7 @@
 #include <sys/unistd.h>
 #include <sys/stat.h>
 #include <sys/mman.h>
+#include <net/if.h>
 #include <time.h>
 #include <errno.h>
 #include <fcntl.h>
@@ -1205,47 +1206,15 @@ void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset
     } else ASSERT(0);
 }
 
-#define UNMAP_BATCH ((STACK_SIZE / 2) / sizeof(multicall_entry_t))
 int munmap(void *start, size_t length)
 {
     int total = length / PAGE_SIZE;
-    ASSERT(!((unsigned long)start & ~PAGE_MASK));
-    while (total) {
-        int n = UNMAP_BATCH;
-        if (n > total)
-            n = total;
-        {
-            int i;
-            multicall_entry_t call[n];
-            unsigned char (*data)[PAGE_SIZE] = start;
-            int ret;
-
-            for (i = 0; i < n; i++) {
-                int arg = 0;
-                call[i].op = __HYPERVISOR_update_va_mapping;
-                call[i].args[arg++] = (unsigned long) &data[i];
-                call[i].args[arg++] = 0;
-#ifdef __i386__
-                call[i].args[arg++] = 0;
-#endif
-                call[i].args[arg++] = UVMF_INVLPG;
-            }
-
-            ret = HYPERVISOR_multicall(call, n);
-            if (ret) {
-                errno = -ret;
-                return -1;
-            }
+    int ret;
 
-            for (i = 0; i < n; i++) {
-                if (call[i].result) {
-                    errno = call[i].result;
-                    return -1;
-                }
-            }
-        }
-        start = (char *)start + n * PAGE_SIZE;
-        total -= n;
+    ret = unmap_frames((unsigned long)start, (unsigned long)total);
+    if (ret) {
+        errno = ret;
+        return -1;
     }
     return 0;
 }
@@ -1324,6 +1293,12 @@ unsupported_function(int, tcsetattr, -1);
 unsupported_function(int, tcgetattr, 0);
 unsupported_function(int, poll, -1);
 
+/* net/if.h */
+unsupported_function_log(unsigned int, if_nametoindex, -1);
+unsupported_function_log(char *, if_indextoname, (char *) NULL);
+unsupported_function_log(struct  if_nameindex *, if_nameindex, (struct  if_nameindex *) NULL);
+unsupported_function_crash(if_freenameindex);
+
 /* Linuxish abi for the Caml runtime, don't support */
 unsupported_function_log(struct dirent *, readdir64, NULL);
 unsupported_function_log(int, getrusage, -1);
index 7ee19b3a86c0d241fb00f4740396772b15750c75..698648a904aa4791b74f3327f567c2bdc116c97c 100644 (file)
@@ -26,6 +26,9 @@ else
 DEF_CFLAGS += -O3
 endif
 
+# Make the headers define our internal stuff
+DEF_CFLAGS += -D__INSIDE_MINIOS__
+
 # Build the CFLAGS and ASFLAGS for compiling and assembling.
 # DEF_... flags are the common mini-os flags,
 # ARCH_... flags may be defined in arch/$(TARGET_ARCH_FAM/rules.mk
index 7b9c35a1dc45859d6a03a9c551dd5072827c4956..5ed42a3da75db4b308ed576c3c926e9e935bd84c 100644 (file)
@@ -666,6 +666,17 @@ char* xenbus_printf(xenbus_transaction_t xbt,
     return xenbus_write(xbt,fullpath,val);
 }
 
+domid_t xenbus_get_self_id(void)
+{
+    char *dom_id;
+    domid_t ret;
+
+    BUG_ON(xenbus_read(XBT_NIL, "domid", &dom_id));
+    sscanf(dom_id, "%d", &ret);
+
+    return ret;
+}
+
 static void do_ls_test(const char *pre)
 {
     char **dirs, *msg;
index da7be4e3330e9c133463f6c903ced84d9d7b090d..dcb614fc85cd5f9a39d81c4ea3a886266b67c1b5 100644 (file)
@@ -7,16 +7,24 @@ export stubdom=y
 export debug=y
 include $(XEN_ROOT)/Config.mk
 
-IOEMU_OPTIONS=--disable-sdl --disable-opengl --disable-vnc-tls --disable-brlapi --disable-kqemu
-ZLIB_URL?=http://www.zlib.net
+#ZLIB_URL?=http://www.zlib.net
+ZLIB_URL=$(XEN_EXTFILES_URL)
 ZLIB_VERSION=1.2.3
-LIBPCI_URL?=http://www.kernel.org/pub/software/utils/pciutils
+
+#LIBPCI_URL?=http://www.kernel.org/pub/software/utils/pciutils
+LIBPCI_URL?=$(XEN_EXTFILES_URL)
 LIBPCI_VERSION=2.2.9
-NEWLIB_URL?=ftp://sources.redhat.com/pub/newlib
+
+#NEWLIB_URL?=ftp://sources.redhat.com/pub/newlib
+NEWLIB_URL?=$(XEN_EXTFILES_URL)
 NEWLIB_VERSION=1.16.0
-LWIP_URL?=http://download.savannah.gnu.org/releases/lwip
+
+#LWIP_URL?=http://download.savannah.gnu.org/releases/lwip
+LWIP_URL?=$(XEN_EXTFILES_URL)
 LWIP_VERSION=1.3.0
-GRUB_URL?=http://alpha.gnu.org/gnu/grub
+
+#GRUB_URL?=http://alpha.gnu.org/gnu/grub
+GRUB_URL?=$(XEN_EXTFILES_URL)
 GRUB_VERSION=0.97
 
 WGET=wget -c
@@ -184,6 +192,7 @@ mk-headers-$(XEN_TARGET_ARCH): ioemu/linkfarm.stamp
           ln -sf $(addprefix ../../,$(wildcard $(XEN_ROOT)/xen/include/public/*.h)) include/xen && \
           ln -sf $(addprefix ../../$(XEN_ROOT)/xen/include/public/,arch-ia64 arch-x86 hvm io xsm) include/xen && \
           ( [ -h include/xen/sys ] || ln -sf ../../$(XEN_ROOT)/tools/include/xen-sys/MiniOS include/xen/sys ) && \
+          ( [ -h include/xen/libelf ] || ln -sf ../../$(XEN_ROOT)/tools/include/xen/libelf include/xen/libelf ) && \
          mkdir -p include/xen-foreign && \
          ln -sf $(addprefix ../../,$(wildcard $(XEN_ROOT)/tools/include/xen-foreign/*)) include/xen-foreign/ && \
          $(MAKE) -C include/xen-foreign/ && \
@@ -226,8 +235,12 @@ ioemu: cross-zlib cross-libpci libxc
        [ -f ioemu/config-host.mak ] || \
          ( $(absolutify_xen_root); \
            cd ioemu ; \
-          CONFIG_STUBDOM=yes XEN_TARGET_ARCH=$(XEN_TARGET_ARCH) CFLAGS="$(TARGET_CFLAGS)" sh ./xen-setup --cc=$(CC) --disable-gcc-check $(IOEMU_OPTIONS))
-       CPPFLAGS= TARGET_CPPFLAGS="$(TARGET_CPPFLAGS)" $(MAKE) -C ioemu LWIPDIR=$(CURDIR)/lwip-$(XEN_TARGET_ARCH) TOOLS= CONFIG_STUBDOM=yes
+           LWIPDIR=$(CURDIR)/lwip-$(XEN_TARGET_ARCH) \
+           TARGET_CPPFLAGS="$(TARGET_CPPFLAGS)" \
+           TARGET_CFLAGS="$(TARGET_CFLAGS)" \
+           TARGET_LDFLAGS="$(TARGET_LDFLAGS)" \
+           ./xen-setup-stubdom )
+       $(MAKE) -C ioemu
 
 ######
 # caml
@@ -301,14 +314,14 @@ install-readme:
        $(INSTALL_DATA) README $(DESTDIR)$(DOCDIR)/README.stubdom
 
 install-ioemu: ioemu-stubdom
-       $(INSTALL_DIR) "$(DESTDIR)/usr/lib/xen/bin"
-       $(INSTALL_PROG) stubdom-dm "$(DESTDIR)/usr/lib/xen/bin"
-       $(INSTALL_DIR) "$(DESTDIR)/usr/lib/xen/boot"
-       $(INSTALL_DATA) mini-os-$(XEN_TARGET_ARCH)-ioemu/mini-os.gz "$(DESTDIR)/usr/lib/xen/boot/ioemu-stubdom.gz"
+       $(INSTALL_DIR) "$(DESTDIR)$(LIBEXEC)"
+       $(INSTALL_PROG) stubdom-dm "$(DESTDIR)$(LIBEXEC)"
+       $(INSTALL_DIR) "$(DESTDIR)$(LIBDIR_x86_32)/xen/boot"
+       $(INSTALL_DATA) mini-os-$(XEN_TARGET_ARCH)-ioemu/mini-os.gz "$(DESTDIR)$(LIBDIR_x86_32)/xen/boot/ioemu-stubdom.gz"
 
 install-grub: pv-grub
-       $(INSTALL_DIR) "$(DESTDIR)/usr/lib/xen/boot"
-       $(INSTALL_DATA) mini-os-$(XEN_TARGET_ARCH)-grub/mini-os.gz "$(DESTDIR)/usr/lib/xen/boot/pv-grub-$(XEN_TARGET_ARCH).gz"
+       $(INSTALL_DIR) "$(DESTDIR)$(LIBDIR_x86_32)/xen/boot"
+       $(INSTALL_DATA) mini-os-$(XEN_TARGET_ARCH)-grub/mini-os.gz "$(DESTDIR)$(LIBDIR_x86_32)/xen/boot/pv-grub-$(XEN_TARGET_ARCH).gz"
 
 #######
 # clean
index 3b25196df37bf143b157e9d95f37f49cddd68f5a..5bc2211d61c115db9b1a50707b1e945a6592cad4 100644 (file)
@@ -56,6 +56,11 @@ sdl = 0
 
 vfb = [ 'type=sdl' ]
 
+    by default qemu will use sdl together with opengl for rendering, if
+    you do not want qemu to use opengl then also pass opengl=0:
+
+vfb = [ 'type=sdl, opengl=0' ]
+
 * Using a VNC server in the stub domain
 
   - In hvmconfig, set vnclisten to "172.30.206.1" for instance.  Do not use a
index ecb1191ab74e4c8873a1507b995247559e218c53..8bf10502156b968a771376cbf3dbe7d2fb0f3049 100644 (file)
@@ -649,7 +649,7 @@ int getrtsecs (void)
 {
     struct timeval tv;
     gettimeofday(&tv, NULL);
-    return tv.tv_sec;
+    return tv.tv_sec % 10 + ((tv.tv_sec / 10) % 6) * 0x10;
 }
 
 int currticks (void)
index a800bc47465a5b84025b960e8c05463432f0d2e1..f471e82b366e14b70601c6b0f8cc92f6e140a0e0 100644 (file)
@@ -15,6 +15,7 @@ domname=
 vncviewer=0
 vncpid=
 extra=
+videoram=4
 while [ "$#" -gt 0 ];
 do
     if [ "$#" -ge 2 ];
@@ -38,6 +39,10 @@ do
                 extra="$extra -loadvm $2";
                 shift
                 ;;
+           -videoram)
+               videoram="$2"
+               shift
+               ;;
        esac
     fi
     case "$1" in
@@ -72,7 +77,7 @@ do
        sleep 1
 done
 
-creation="xm create -c $domname-dm target=$domid memory=32 extra=\"$extra\""
+creation="xm create -c $domname-dm target=$domid memory=32 videoram=$videoram extra=\"$extra\""
 
 (while true ; do sleep 60 ; done) | /bin/sh -c "$creation" &
 #xterm -geometry +0+0 -e /bin/sh -c "$creation ; echo ; echo press ENTER to shut down ; read" &
index 4d27b26fe0195e0e9df76a9ce2a2e05b878e7fab..00e398170fde853f2efc7688218e2b7b9946dfce 100644 (file)
@@ -9,6 +9,7 @@ SUBDIRS-y += flask
 SUBDIRS-y += xenstore
 SUBDIRS-y += misc
 SUBDIRS-y += examples
+SUBDIRS-y += hotplug
 SUBDIRS-y += xentrace
 SUBDIRS-$(CONFIG_XCUTILS) += xcutils
 SUBDIRS-$(CONFIG_X86) += firmware
@@ -18,12 +19,13 @@ SUBDIRS-y += xenmon
 SUBDIRS-$(VTPM_TOOLS) += vtpm_manager
 SUBDIRS-$(VTPM_TOOLS) += vtpm
 SUBDIRS-y += xenstat
-SUBDIRS-y += libaio
-SUBDIRS-y += blktap
+SUBDIRS-$(CONFIG_Linux) += libaio
+SUBDIRS-$(CONFIG_Linux) += blktap
 SUBDIRS-y += libfsimage
 SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen
-SUBDIRS-y += fs-back
+SUBDIRS-$(CONFIG_Linux) += fs-back
 SUBDIRS-$(CONFIG_IOEMU) += ioemu-dir
+SUBDIRS-y += xenpmd
 
 # These don't cross-compile
 ifeq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_ARCH))
index a477c8253b9db257e8485bcc62868daad1b46518..fbd69ea215c995cefb2311ec77d0d03f7301b780 100644 (file)
@@ -29,6 +29,10 @@ X11_LDPATH = -L/usr/X11R6/$(LIBLEAFDIR)
 
 CFLAGS += -D__XEN_TOOLS__
 
+# Get gcc to generate the dependencies for us.
+CFLAGS += -MMD -MF .$(@F).d
+DEPS = .*.d
+
 # Enable implicit LFS support *and* explicit LFS names.
 CFLAGS  += $(shell getconf LFS_CFLAGS)
 CFLAGS  += -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
@@ -59,4 +63,3 @@ subdirs-all subdirs-clean subdirs-install: .phony
 
 subdir-all-% subdir-clean-% subdir-install-%: .phony
        $(MAKE) -C $* $(patsubst subdir-%-$*,%,$@)
-
index 7c3e088bf101e88f2023e472041a81bbf27c7bab..469c3147e60e134a5a41f43a37e1ab4656b47ad2 100644 (file)
@@ -13,16 +13,12 @@ CFLAGS   += $(CFLAGS_libxenstore)
 CFLAGS   += -I $(LIBAIO_DIR)
 CFLAGS   += -D_GNU_SOURCE
 
-# Get gcc to generate the dependencies for us.
-CFLAGS   += -Wp,-MD,.$(@F).d
-DEPS      = .*.d
-
-ifeq ($(shell . ./check_gcrypt),"yes")
+ifeq ($(shell . ./check_gcrypt $(CC)),yes)
 CFLAGS += -DUSE_GCRYPT
 CRYPT_LIB := -lgcrypt
 else
 CRYPT_LIB := -lcrypto
-$(warning *** libgcrypt not installed: falling back to libcrypto ***)
+$(warning === libgcrypt not installed: falling back to libcrypto ===)
 endif
 
 LDFLAGS_blktapctrl := $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenstore) -L../lib -lblktap
index b8a872bbbb5329fb79768d98de4a6b55fc51934d..8a5630230e20762562503c95f36ff34e15e61790 100644 (file)
@@ -148,7 +148,8 @@ static int get_tapdisk_pid(blkif_t *blkif)
  *   return 0 on success, -1 on error.
  */
 
-static int test_path(char *path, char **dev, int *type, blkif_t **blkif)
+static int test_path(char *path, char **dev, int *type, blkif_t **blkif,
+       int* use_ioemu)
 {
        char *ptr, handle[10];
        int i, size, found = 0;
@@ -158,6 +159,17 @@ static int test_path(char *path, char **dev, int *type, blkif_t **blkif)
        *type = MAX_DISK_TYPES + 1;
         *blkif = NULL;
 
+       if (!strncmp(path, "tapdisk:", strlen("tapdisk:"))) {
+               *use_ioemu = 0;
+               path += strlen("tapdisk:");
+       } else if (!strncmp(path, "ioemu:", strlen("ioemu:"))) {
+               *use_ioemu = 1;
+               path += strlen("ioemu:");
+       } else {
+               // Use the default for the image type
+               *use_ioemu = -1;
+       }
+
        if ( (ptr = strstr(path, ":"))!=NULL) {
                handle_len = (ptr - path);
                memcpy(handle, path, handle_len);
@@ -174,6 +186,8 @@ static int test_path(char *path, char **dev, int *type, blkif_t **blkif)
                         }
 
                        if (found) {
+                               if (*use_ioemu == -1)
+                                       *use_ioemu = dtypes[i]->use_ioemu;
                                *type = dtypes[i]->idnum;
                         
                         if (dtypes[i]->single_handler == 1) {
@@ -185,6 +199,7 @@ static int test_path(char *path, char **dev, int *type, blkif_t **blkif)
                                         *blkif = active_disks[dtypes[i]
                                                              ->idnum]->blkif;
                         }
+
                         return 0;
                 }
             }
@@ -216,6 +231,24 @@ static void add_disktype(blkif_t *blkif, int type)
        entry->pprev = pprev;
 }
 
+static int qemu_instance_has_disks(pid_t pid)
+{
+       int i;
+       int count = 0;
+       driver_list_entry_t *entry;
+
+       for (i = 0; i < MAX_DISK_TYPES; i++) {
+               entry = active_disks[i];
+               while (entry) {
+                       if ((entry->blkif->tappid == pid) && dtypes[i]->use_ioemu)
+                               count++;
+                       entry = entry->next;
+               }
+       }
+
+       return (count != 0);
+}
+
 static int del_disktype(blkif_t *blkif)
 {
        driver_list_entry_t *entry, **pprev;
@@ -240,6 +273,14 @@ static int del_disktype(blkif_t *blkif)
        DPRINTF("DEL_DISKTYPE: Freeing entry\n");
        free(entry);
 
+       /*
+        * When using ioemu, all disks of one VM are connected to the same
+        * qemu-dm instance. We may close the file handle only if there is
+        * no other disk left for this domain.
+        */
+       if (dtypes[type]->use_ioemu)
+               return !qemu_instance_has_disks(blkif->tappid);
+
        /* Caller should close() if no single controller, or list is empty. */
        return (!dtypes[type]->single_handler || (active_disks[type] == NULL));
 }
@@ -504,7 +545,8 @@ static int connect_qemu(blkif_t *blkif, int domid)
        static int tapdisk_ioemu_pid = 0;
        static int dom0_readfd = 0;
        static int dom0_writefd = 0;
-       
+       int refresh_pid = 0;
+
        if (asprintf(&rdctldev, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) < 0)
                return -1;
 
@@ -523,15 +565,23 @@ static int connect_qemu(blkif_t *blkif, int domid)
                if (tapdisk_ioemu_pid == 0 || kill(tapdisk_ioemu_pid, 0)) {
                        /* No device model and tapdisk-ioemu doesn't run yet */
                        DPRINTF("Launching tapdisk-ioemu\n");
-                       tapdisk_ioemu_pid = launch_tapdisk_ioemu();
+                       launch_tapdisk_ioemu();
                        
                        dom0_readfd = open_ctrl_socket(wrctldev);
                        dom0_writefd = open_ctrl_socket(rdctldev);
+
+                       refresh_pid = 1;
                }
 
                DPRINTF("Using tapdisk-ioemu connection\n");
                blkif->fds[READ] = dom0_readfd;
                blkif->fds[WRITE] = dom0_writefd;
+
+               if (refresh_pid) {
+                       get_tapdisk_pid(blkif);
+                       tapdisk_ioemu_pid = blkif->tappid;
+               }
+
        } else if (access(rdctldev, R_OK | W_OK) == 0) {
                /* Use existing pipe to the device model */
                DPRINTF("Using qemu-dm connection\n");
@@ -605,13 +655,11 @@ static int blktapctrl_new_blkif(blkif_t *blkif)
        image_t *image;
        blkif_t *exist = NULL;
        static uint16_t next_cookie = 0;
+       int use_ioemu;
 
        DPRINTF("Received a poll for a new vbd\n");
        if ( ((blk=blkif->info) != NULL) && (blk->params != NULL) ) {
-               if (blktap_interface_create(ctlfd, &major, &minor, blkif) < 0)
-                       return -1;
-
-               if (test_path(blk->params, &ptr, &type, &exist) != 0) {
+               if (test_path(blk->params, &ptr, &type, &exist, &use_ioemu) != 0) {
                         DPRINTF("Error in blktap device string(%s).\n",
                                 blk->params);
                         goto fail;
@@ -620,7 +668,7 @@ static int blktapctrl_new_blkif(blkif_t *blkif)
                blkif->cookie = next_cookie++;
 
                if (!exist) {
-                       if (type == DISK_TYPE_IOEMU) {
+                       if (use_ioemu) {
                                if (connect_qemu(blkif, blkif->domid))
                                        goto fail;
                        } else {
@@ -634,10 +682,6 @@ static int blktapctrl_new_blkif(blkif_t *blkif)
                        blkif->fds[WRITE] = exist->fds[WRITE];
                }
 
-               add_disktype(blkif, type);
-               blkif->major = major;
-               blkif->minor = minor;
-
                image = (image_t *)malloc(sizeof(image_t));
                blkif->prv = (void *)image;
                blkif->ops = &tapdisk_ops;
@@ -661,11 +705,18 @@ static int blktapctrl_new_blkif(blkif_t *blkif)
                        goto fail;
                }
 
+               if (blktap_interface_create(ctlfd, &major, &minor, blkif) < 0)
+                       return -1;
+
+               blkif->major = major;
+               blkif->minor = minor;
+
+               add_disktype(blkif, type);
+
        } else return -1;
 
        return 0;
 fail:
-       ioctl(ctlfd, BLKTAP_IOCTL_FREEINTF, minor);
        return -EINVAL;
 }
 
@@ -696,6 +747,7 @@ static int unmap_blktapctrl(blkif_t *blkif)
        }
 
        if (del_disktype(blkif)) {
+               DPRINTF("Closing communication pipe to pid %d\n", blkif->tappid);
                close(blkif->fds[WRITE]);
                close(blkif->fds[READ]);
        }
index 51a2e324a140907a2b0176135c1b60ea857a59d5..c6d32a40776815f4fd3b192675eb9eec866e68e4 100644 (file)
@@ -722,11 +722,11 @@ static inline void init_fds(struct disk_driver *dd)
 /* Open the disk file and initialize qcow state. */
 static int tdqcow_open (struct disk_driver *dd, const char *name, td_flag_t flags)
 {
-       int fd, len, i, shift, ret, size, l1_table_size, o_flags;
+       int fd, len, i, shift, ret, size, l1_table_size, o_flags, l1_table_block;
        int max_aio_reqs;
        struct td_state     *bs = dd->td_state;
        struct tdqcow_state *s  = (struct tdqcow_state *)dd->private;
-       char *buf;
+       char *buf, *buf2;
        QCowHeader *header;
        QCowHeader_ext *exthdr;
        uint32_t cksum;
@@ -819,9 +819,14 @@ static int tdqcow_open (struct disk_driver *dd, const char *name, td_flag_t flag
                (int) (s->l1_size * sizeof(uint64_t)), 
                l1_table_size);
 
-       lseek(fd, s->l1_table_offset, SEEK_SET);
-       if (read(fd, s->l1_table, l1_table_size) != l1_table_size)
+       lseek(fd, 0, SEEK_SET);
+       l1_table_block = l1_table_size + s->l1_table_offset;
+       l1_table_block = l1_table_block + 512 - (l1_table_block % 512); 
+       ret = posix_memalign((void **)&buf2, 4096, l1_table_block);
+       if (ret != 0) goto fail;
+       if (read(fd, buf2, l1_table_block) != l1_table_block)
                goto fail;
+       memcpy(s->l1_table, buf2 + s->l1_table_offset, l1_table_size);
 
        for(i = 0; i < s->l1_size; i++) {
                be64_to_cpus(&s->l1_table[i]);
@@ -871,8 +876,9 @@ static int tdqcow_open (struct disk_driver *dd, const char *name, td_flag_t flag
 
                        DPRINTF("qcow: Converting image to big endian L1 table\n");
 
-                       lseek(fd, s->l1_table_offset, SEEK_SET);
-                       if (write(fd, s->l1_table, l1_table_size) != l1_table_size) {
+                       memcpy(buf2 + s->l1_table_offset, s->l1_table, l1_table_size);
+                       lseek(fd, 0, SEEK_SET);
+                       if (write(fd, buf2, l1_table_block) != l1_table_block) {
                                DPRINTF("qcow: Failed to write new L1 table\n");
                                goto fail;
                        }
@@ -917,7 +923,7 @@ static int tdqcow_open (struct disk_driver *dd, const char *name, td_flag_t flag
        init_fds(dd);
 
        if (!final_cluster)
-               s->fd_end = s->l1_table_offset + l1_table_size;
+               s->fd_end = l1_table_block;
        else {
                s->fd_end = lseek(fd, 0, SEEK_END);
                if (s->fd_end == (off_t)-1)
@@ -1385,7 +1391,7 @@ static int tdqcow_get_parent_id(struct disk_driver *dd, struct disk_id *id)
        filename[len]  = '\0';
 
        id->name       = strdup(filename);
-       id->drivertype = DISK_TYPE_QCOW;
+       id->drivertype = DISK_TYPE_AIO;
        err            = 0;
  out:
        free(buf);
@@ -1397,17 +1403,15 @@ static int tdqcow_validate_parent(struct disk_driver *child,
 {
        struct stat stats;
        uint64_t psize, csize;
-       struct tdqcow_state *c = (struct tdqcow_state *)child->private;
-       struct tdqcow_state *p = (struct tdqcow_state *)parent->private;
        
-       if (stat(p->name, &stats))
+       if (stat(parent->name, &stats))
                return -EINVAL;
-       if (get_filesize(p->name, &psize, &stats))
+       if (get_filesize(parent->name, &psize, &stats))
                return -EINVAL;
 
-       if (stat(c->name, &stats))
+       if (stat(child->name, &stats))
                return -EINVAL;
-       if (get_filesize(c->name, &csize, &stats))
+       if (get_filesize(child->name, &csize, &stats))
                return -EINVAL;
 
        if (csize != psize)
index fe28a2ecb005aa154792ef80227c8cc1e9df379d..b0392e1802d8504bed7e361ca54fcd5975bb5780 100644 (file)
@@ -34,6 +34,7 @@
 #include "tapdisk.h"
 #include "tapaio.h"
 #include "bswap.h"
+#include "blk.h"
 
 #define USE_AIO
 
@@ -1902,6 +1903,42 @@ repeat:
 
 #endif 
 
+static int get_filesize(char *filename, uint64_t *size, struct stat *st)
+{
+       int fd;
+       QCowHeader header;
+
+       /*Set to the backing file size*/
+       fd = open(filename, O_RDONLY);
+       if (fd < 0)
+               return -1;
+       if (read(fd, &header, sizeof(header)) < sizeof(header)) {
+               close(fd);
+               return -1;
+       }
+       close(fd);
+       
+       be32_to_cpus(&header.magic);
+       be32_to_cpus(&header.version);
+       be64_to_cpus(&header.size);
+       if (header.magic == QCOW_MAGIC && header.version == QCOW_VERSION) {
+               *size = header.size >> SECTOR_SHIFT;
+               return 0;
+       }
+
+       if(S_ISBLK(st->st_mode)) {
+               fd = open(filename, O_RDONLY);
+               if (fd < 0)
+                       return -1;
+               if (blk_getimagesize(fd, size) != 0) {
+                       close(fd);
+                       return -1;
+               }
+               close(fd);
+       } else *size = (st->st_size >> SECTOR_SHIFT);   
+       return 0;
+}
+
 /**
  * @return 
  *        0 if parent id successfully retrieved;
@@ -1916,7 +1953,7 @@ static int qcow_get_parent_id(struct disk_driver *dd, struct disk_id *id)
                return TD_NO_PARENT;
 
        id->name = strdup(s->backing_file);
-       id->drivertype = DISK_TYPE_QCOW2;
+       id->drivertype = DISK_TYPE_AIO;
 
        return 0;
 }
@@ -1924,18 +1961,123 @@ static int qcow_get_parent_id(struct disk_driver *dd, struct disk_id *id)
 static int qcow_validate_parent(struct disk_driver *child, 
                struct disk_driver *parent, td_flag_t flags)
 {
-       struct BDRVQcowState *cs = (struct BDRVQcowState*) child->private;
-       struct BDRVQcowState *ps = (struct BDRVQcowState*) parent->private;
+       struct stat stats;
+       uint64_t psize, csize;
+       
+       if (stat(parent->name, &stats))
+               return -EINVAL;
+       if (get_filesize(parent->name, &psize, &stats))
+               return -EINVAL;
 
-       if (ps->total_sectors != cs->total_sectors) {
-               DPRINTF("qcow_validate_parent(): %#"PRIx64" != %#"PRIx64"\n",
-                       ps->total_sectors, cs->total_sectors);
+       if (stat(child->name, &stats))
                return -EINVAL;
-       }
-       
+       if (get_filesize(child->name, &csize, &stats))
+               return -EINVAL;
+
+       if (csize != psize)
+               return -EINVAL;
+
        return 0;
 }
 
+int qcow2_create(const char *filename, uint64_t total_size,
+                      const char *backing_file, int flags)
+{
+    int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
+    int ret = 0;
+    QCowHeader header;
+    uint64_t tmp, offset;
+    QCowCreateState s1, *s = &s1;
+
+    memset(s, 0, sizeof(*s));
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0)
+        return -1;
+    memset(&header, 0, sizeof(header));
+    header.magic = cpu_to_be32(QCOW_MAGIC);
+    header.version = cpu_to_be32(QCOW_VERSION);
+    header.size = cpu_to_be64(total_size * 512);
+    header_size = sizeof(header);
+    backing_filename_len = 0;
+    if (backing_file) {
+        header.backing_file_offset = cpu_to_be64(header_size);
+        backing_filename_len = strlen(backing_file);
+        header.backing_file_size = cpu_to_be32(backing_filename_len);
+        header_size += backing_filename_len;
+    }
+    s->cluster_bits = 12;  /* 4 KB clusters */
+    s->cluster_size = 1 << s->cluster_bits;
+    header.cluster_bits = cpu_to_be32(s->cluster_bits);
+    header_size = (header_size + 7) & ~7;
+    if (flags & BLOCK_FLAG_ENCRYPT) {
+        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
+    } else {
+        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+    }
+    l2_bits = s->cluster_bits - 3;
+    shift = s->cluster_bits + l2_bits;
+    l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
+    offset = align_offset(header_size, s->cluster_size);
+    s->l1_table_offset = offset;
+    header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
+    header.l1_size = cpu_to_be32(l1_size);
+    offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);
+
+    s->refcount_table = qemu_mallocz(s->cluster_size);
+    s->refcount_block = qemu_mallocz(s->cluster_size);
+
+    s->refcount_table_offset = offset;
+    header.refcount_table_offset = cpu_to_be64(offset);
+    header.refcount_table_clusters = cpu_to_be32(1);
+    offset += s->cluster_size;
+
+    s->refcount_table[0] = cpu_to_be64(offset);
+    s->refcount_block_offset = offset;
+    offset += s->cluster_size;
+
+    /* update refcounts */
+    create_refcount_update(s, 0, header_size);
+    create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t));
+    create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
+    create_refcount_update(s, s->refcount_block_offset, s->cluster_size);
+
+    /* write all the data */
+    ret = write(fd, &header, sizeof(header));
+    if (ret < 0)
+        goto out;
+    if (backing_file) {
+        ret = write(fd, backing_file, backing_filename_len);
+        if (ret < 0)
+            goto out;
+    }
+    lseek(fd, s->l1_table_offset, SEEK_SET);
+    tmp = 0;
+    for(i = 0;i < l1_size; i++) {
+        ret = write(fd, &tmp, sizeof(tmp));
+        if (ret < 0)
+            goto out;
+    }
+    lseek(fd, s->refcount_table_offset, SEEK_SET);
+    ret = write(fd, s->refcount_table, s->cluster_size);
+    if (ret < 0)
+        goto out;
+
+    lseek(fd, s->refcount_block_offset, SEEK_SET);
+    ret = write(fd, s->refcount_block, s->cluster_size);
+    if (ret < 0)
+        goto out;
+    ret = 0;
+
+  out:
+    qemu_free(s->refcount_table);
+    qemu_free(s->refcount_block);
+    close(fd);
+    return ret;
+}
+
+
+
 struct tap_disk tapdisk_qcow2 = {
        "qcow2",
        sizeof(BDRVQcowState),
index 5ff50648a8ceb1ce9ce04421b5aaab4a107be6a7..25abfcd1d41c0ca2511530adcf490f0e6d7606c9 100644 (file)
@@ -52,7 +52,7 @@ static void help(void)
 {
        fprintf(stderr, "Qcow-utils: v1.0.0\n");
        fprintf(stderr, 
-               "usage: qcow-create [-h help] [-r reserve] <SIZE(MB)> <FILENAME> "
+               "usage: qcow-create [-h help] [-r reserve] [-f format] <SIZE(MB)> <FILENAME> "
                "[<BACKING_FILENAME>]\n"); 
        exit(-1);
 }
@@ -61,11 +61,13 @@ int main(int argc, char *argv[])
 {
        int ret = -1, c, backed = 0;
        int sparse =  1;
+       char *fmt = "qcow";
        uint64_t size;
        char filename[MAX_NAME_LEN], bfilename[MAX_NAME_LEN];
+       char *tmpfile;
 
         for(;;) {
-                c = getopt(argc, argv, "hr");
+                c = getopt(argc, argv, "hrf");
                 if (c == -1)
                         break;
                 switch(c) {
@@ -73,6 +75,9 @@ int main(int argc, char *argv[])
                         help();
                         exit(0);
                         break;
+                case 'f':
+                        fmt = argv[optind++];
+                        break;
                 case 'r':
                        sparse = 0;
                        break;
@@ -105,11 +110,16 @@ int main(int argc, char *argv[])
                }
        }
 
-       DFPRINTF("Creating file size %llu, name %s\n",(long long unsigned)size, filename);
-       if (!backed)
-               ret = qcow_create(filename,size,NULL,sparse);
-       else
-               ret = qcow_create(filename,size,bfilename,sparse);
+    tmpfile = backed ? bfilename: NULL; 
+    if (!strcmp(fmt, "qcow")) {
+        ret = qcow_create(filename, size, tmpfile, sparse);
+    } else if(!strcmp(fmt, "qcow2")) {
+        ret = qcow2_create(filename, size, tmpfile, sparse);
+    } else {
+        fprintf(stderr,"Unsupport format:%s\n", fmt);
+        exit(-1);
+    } 
+    DFPRINTF("Creating file size %llu, name %s\n",(long long unsigned)size, filename);
 
        if (ret < 0)
                DPRINTF("Unable to create QCOW file\n");
index c8a21827ffd08ab546f7a975cfb40bf7eb5354af..f3e165ac33a347a9e3d42f984d749518e4f18bbb 100644 (file)
@@ -145,6 +145,8 @@ typedef struct disk_info {
        char handle[10];     /* xend handle, e.g. 'ram' */
        int  single_handler; /* is there a single controller for all */
                             /* instances of disk type? */
+       int  use_ioemu;      /* backend provider: 0 = tapdisk; 1 = ioemu */
+
 #ifdef TAPDISK
        struct tap_disk *drv;   
 #endif
@@ -159,16 +161,6 @@ extern struct tap_disk tapdisk_ram;
 extern struct tap_disk tapdisk_qcow;
 extern struct tap_disk tapdisk_qcow2;
 
-#define MAX_DISK_TYPES     20
-
-#define DISK_TYPE_AIO      0
-#define DISK_TYPE_SYNC     1
-#define DISK_TYPE_VMDK     2
-#define DISK_TYPE_RAM      3
-#define DISK_TYPE_QCOW     4
-#define DISK_TYPE_QCOW2    5
-#define DISK_TYPE_IOEMU    6
-
 
 /*Define Individual Disk Parameters here */
 static disk_info_t aio_disk = {
@@ -176,6 +168,7 @@ static disk_info_t aio_disk = {
        "raw image (aio)",
        "aio",
        0,
+       0,
 #ifdef TAPDISK
        &tapdisk_aio,
 #endif
@@ -186,6 +179,7 @@ static disk_info_t sync_disk = {
        "raw image (sync)",
        "sync",
        0,
+       0,
 #ifdef TAPDISK
        &tapdisk_sync,
 #endif
@@ -196,6 +190,7 @@ static disk_info_t vmdk_disk = {
        "vmware image (vmdk)",
        "vmdk",
        1,
+       0,
 #ifdef TAPDISK
        &tapdisk_vmdk,
 #endif
@@ -206,6 +201,7 @@ static disk_info_t ram_disk = {
        "ramdisk image (ram)",
        "ram",
        1,
+       0,
 #ifdef TAPDISK
        &tapdisk_ram,
 #endif
@@ -216,6 +212,7 @@ static disk_info_t qcow_disk = {
        "qcow disk (qcow)",
        "qcow",
        0,
+       0,
 #ifdef TAPDISK
        &tapdisk_qcow,
 #endif
@@ -226,21 +223,12 @@ static disk_info_t qcow2_disk = {
        "qcow2 disk (qcow2)",
        "qcow2",
        0,
+       0,
 #ifdef TAPDISK
        &tapdisk_qcow2,
 #endif
 };
 
-static disk_info_t ioemu_disk = {
-       DISK_TYPE_IOEMU,
-       "ioemu disk",
-       "ioemu",
-       1,
-#ifdef TAPDISK
-       NULL
-#endif
-};
-
 /*Main disk info array */
 static disk_info_t *dtypes[] = {
        &aio_disk,
@@ -249,7 +237,6 @@ static disk_info_t *dtypes[] = {
        &ram_disk,
        &qcow_disk,
        &qcow2_disk,
-       &ioemu_disk,
 };
 
 typedef struct driver_list_entry {
@@ -266,4 +253,7 @@ typedef struct fd_list_entry {
 
 int qcow_create(const char *filename, uint64_t total_size,
                const char *backing_file, int flags);
+
+int qcow2_create(const char *filename, uint64_t total_size,
+               const char *backing_file, int flags);
 #endif /*TAPDISK_H_*/
index 5511bf2f084982611541937b7c2d779b0c28146d..d54ae3e57b249c5699484c14fe7183144aab8688 100644 (file)
@@ -210,6 +210,16 @@ typedef struct msg_pid {
 #define CTLMSG_PID         9
 #define CTLMSG_PID_RSP     10
 
+/* disk driver types */
+#define MAX_DISK_TYPES     20
+
+#define DISK_TYPE_AIO      0
+#define DISK_TYPE_SYNC     1
+#define DISK_TYPE_VMDK     2
+#define DISK_TYPE_RAM      3
+#define DISK_TYPE_QCOW     4
+#define DISK_TYPE_QCOW2    5
+
 /* xenstore/xenbus: */
 #define DOMNAME "Domain-0"
 int setup_probe_watch(struct xs_handle *h);
index 4fc56d661baf8482a89435b1f80cd52e557ee84a..bf17d0b7a02c24b1ac27252557b1e7b8c0561117 100644 (file)
@@ -48,6 +48,7 @@
 #include <poll.h>
 #include <time.h>
 #include <sys/time.h>
+#include <unistd.h>
 #include "blktaplib.h"
 #include "list.h"
 #include "xs_api.h"
@@ -149,6 +150,137 @@ static int backend_remove(struct xs_handle *h, struct backend_info *be)
        return 0;
 }
 
+static int check_sharing(struct xs_handle *h, struct backend_info *be)
+{
+       char *dom_uuid;
+       char *cur_dom_uuid;
+       char *path;
+       char *mode;
+       char *params;
+       char **domains;
+       char **devices;
+       int i, j;
+       unsigned int num_dom, num_dev;
+       blkif_info_t *info;
+       int ret = 0;
+
+       /* If the mode contains '!' or doesn't contain 'w' don't check anything */
+       xs_gather(h, be->backpath, "mode", NULL, &mode, NULL);
+       if (strchr(mode, '!'))
+               goto out;
+       if (strchr(mode, 'w') == NULL)
+               goto out;
+
+       /* Get the UUID of the domain we want to attach to */
+       if (asprintf(&path, "/local/domain/%ld", be->frontend_id) == -1)
+               goto fail;
+       xs_gather(h, path, "vm", NULL, &dom_uuid, NULL);
+       free(path);
+
+       /* Iterate through the devices of all VMs */
+       domains = xs_directory(h, XBT_NULL, "backend/tap", &num_dom);
+       if (domains == NULL)
+               num_dom = 0;
+
+       for (i = 0; !ret && (i < num_dom); i++) {
+
+               /* If it's the same VM, no action needed */
+               if (asprintf(&path, "/local/domain/%s", domains[i]) == -1) {
+                       ret = -1;
+                       break;
+               }
+               xs_gather(h, path, "vm", NULL, &cur_dom_uuid, NULL);
+               free(path);
+
+               if (!strcmp(cur_dom_uuid, dom_uuid)) {
+                       free(cur_dom_uuid);
+                       continue;
+               }
+
+               /* Check the devices */
+               if (asprintf(&path, "backend/tap/%s", domains[i]) == -1) {
+                       ret = -1;
+                       free(cur_dom_uuid);
+                       break;
+               }
+               devices = xs_directory(h, XBT_NULL, path, &num_dev);
+               if (devices == NULL)
+                       num_dev = 0;
+               free(path);
+
+               for (j = 0; !ret && (j < num_dev); j++) {
+                       if (asprintf(&path, "backend/tap/%s/%s", domains[i], devices[j]) == -1) {
+                               ret = -1;
+                               break;
+                       }
+                       xs_gather(h, path, "params", NULL, &params, NULL);
+                       free(path);
+
+                       info =  be->blkif->info;
+                       if (strcmp(params, info->params)) {
+                               ret = -1;
+                       }
+
+                       free(params);
+               }
+
+               free(cur_dom_uuid);
+               free(devices);
+       }
+       free(domains);
+       free(dom_uuid);
+       goto out;
+
+fail:
+       ret = -1;
+out:
+       free(mode);
+       return ret;
+}
+
+static int check_image(struct xs_handle *h, struct backend_info *be,
+       const char** errmsg)
+{
+       const char *tmp;
+       const char *path;
+       int mode;
+       blkif_t *blkif = be->blkif;
+       blkif_info_t *info = blkif->info;
+
+       /* Strip off the image type */
+       path = info->params;
+
+       if (!strncmp(path, "tapdisk:", strlen("tapdisk:"))) {
+               path += strlen("tapdisk:");
+       } else if (!strncmp(path, "ioemu:", strlen("ioemu:"))) {
+               path += strlen("ioemu:");
+       }
+
+       tmp = strchr(path, ':');
+       if (tmp != NULL)
+               path = tmp + 1;
+
+       /* Check if the image exists and access is permitted */
+       mode = R_OK;
+       if (!be->readonly)
+               mode |= W_OK;
+       if (access(path, mode)) {
+               if (errno == ENOENT)
+                       *errmsg = "File not found.";
+               else
+                       *errmsg = "Insufficient file permissions.";
+               return -1;
+       }
+
+       /* Check that the image is not attached to a different VM */
+       if (check_sharing(h, be)) {
+               *errmsg = "File already in use by other domain";
+               return -1;
+       }
+
+       return 0;
+}
+
 static void ueblktap_setup(struct xs_handle *h, char *bepath)
 {
        struct backend_info *be;
@@ -156,6 +288,7 @@ static void ueblktap_setup(struct xs_handle *h, char *bepath)
        int len, er, deverr;
        long int pdev = 0, handle;
        blkif_info_t *blk;
+       const char* errmsg = NULL;
        
        be = be_lookup_be(bepath);
        if (be == NULL)
@@ -211,6 +344,9 @@ static void ueblktap_setup(struct xs_handle *h, char *bepath)
                        be->pdev = pdev;
                }
 
+               if (check_image(h, be, &errmsg))
+                       goto fail;
+
                er = blkif_init(be->blkif, handle, be->pdev, be->readonly);
                if (er != 0) {
                        DPRINTF("Unable to open device %s\n",blk->params);
@@ -246,12 +382,21 @@ static void ueblktap_setup(struct xs_handle *h, char *bepath)
        }
 
        be->blkif->state = CONNECTED;
+       xs_printf(h, be->backpath, "hotplug-status", "connected");
+
        DPRINTF("[SETUP] Complete\n\n");
        goto close;
        
 fail:
-       if ( (be != NULL) && (be->blkif != NULL) ) 
+       if (be) {
+               if (errmsg == NULL)
+                       errmsg = "Setting up the backend failed. See the log "
+                               "files in /var/log/xen/ for details.";
+               xs_printf(h, be->backpath, "hotplug-error", errmsg);
+               xs_printf(h, be->backpath, "hotplug-status", "error");
+
                backend_remove(h, be);
+       }
 close:
        if (path)
                free(path);
@@ -286,7 +431,8 @@ static void ueblktap_probe(struct xs_handle *h, struct xenbus_watch *w,
        len = strsep_len(bepath, '/', 7);
        if (len < 0) 
                goto free_be;
-       bepath[len] = '\0';
+       if (bepath[len] != '\0')
+               goto free_be;
        
        be = malloc(sizeof(*be));
        if (!be) {
index 10e909a0b621572096317c02c6afaae3b76dc3f3..7339e9e064d026db8a18da3bca46e649c71a343d 100644 (file)
@@ -16,7 +16,7 @@ all: $(BIN)
 
 .PHONY: clean
 clean:
-       $(RM) *.a *.so *.o *.rpm $(BIN)
+       $(RM) *.a *.so *.o *.rpm $(BIN) $(DEPS)
        $(RM) client/*.o daemon/*.o
 
 xenconsoled: $(patsubst %.c,%.o,$(wildcard daemon/*.c))
@@ -33,3 +33,5 @@ install: $(BIN)
        $(INSTALL_PROG) xenconsoled $(DESTDIR)/$(SBINDIR)
        $(INSTALL_DIR) $(DESTDIR)$(PRIVATE_BINDIR)
        $(INSTALL_PROG) xenconsole $(DESTDIR)$(PRIVATE_BINDIR)
+
+-include $(DEPS)
index 509e44bf91a52ac17122c86174b1c6a41636688c..39556da5a048fb72a8caa2b4eabd3a5cbf13cad3 100644 (file)
@@ -35,6 +35,9 @@
 #include <err.h>
 #include <errno.h>
 #include <string.h>
+#ifdef __sun__
+#include <sys/stropts.h>
+#endif
 
 #include "xs.h"
 
@@ -71,6 +74,21 @@ static void usage(const char *program) {
               , program);
 }
 
+#ifdef __sun__
+void cfmakeraw(struct termios *termios_p)
+{
+       termios_p->c_iflag &=
+           ~(IGNBRK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR|ICRNL|IXON);
+       termios_p->c_oflag &= ~OPOST;
+       termios_p->c_lflag &= ~(ECHO|ECHONL|ICANON|ISIG|IEXTEN);
+       termios_p->c_cflag &= ~(CSIZE|PARENB);
+       termios_p->c_cflag |= CS8;
+
+       termios_p->c_cc[VMIN] = 0;
+       termios_p->c_cc[VTIME] = 0;
+}
+#endif
+
 static int get_pty_fd(struct xs_handle *xs, char *path, int seconds)
 /* Check for a pty in xenstore, open it and return its fd.
  * Assumes there is already a watch set in the store for this path. */
@@ -80,7 +98,7 @@ static int get_pty_fd(struct xs_handle *xs, char *path, int seconds)
        int xs_fd = xs_fileno(xs), pty_fd = -1;
        int start, now;
        unsigned int len = 0;
-       char *pty_path, **watch_paths;;
+       char *pty_path, **watch_paths;
 
        start = now = time(NULL);
        do {
@@ -104,6 +122,29 @@ static int get_pty_fd(struct xs_handle *xs, char *path, int seconds)
                        }
                }
        } while (pty_fd == -1 && (now = time(NULL)) < start + seconds);
+
+#ifdef __sun__
+       if (pty_fd != -1) {
+               struct termios term;
+
+               /*
+                * The pty may come from either xend (with pygrub) or
+                * xenconsoled.  It may have tty semantics set up, or not.
+                * While it isn't strictly necessary to have those
+                * semantics here, it is good to have a consistent
+                * state that is the same as under Linux.
+                *
+                * If tcgetattr fails, they have not been set up,
+                * so go ahead and set them up now, by pushing the
+                * ptem and ldterm streams modules.
+                */
+               if (tcgetattr(pty_fd, &term) < 0) {
+                       ioctl(pty_fd, I_PUSH, "ptem");
+                       ioctl(pty_fd, I_PUSH, "ldterm");
+               }
+       }
+#endif
+
        return pty_fd;
 }
 
@@ -119,12 +160,12 @@ static void init_term(int fd, struct termios *old)
        new_term = *old;
        cfmakeraw(&new_term);
 
-       tcsetattr(fd, TCSAFLUSH, &new_term);
+       tcsetattr(fd, TCSANOW, &new_term);
 }
 
 static void restore_term(int fd, struct termios *old)
 {
-       tcsetattr(fd, TCSAFLUSH, old);
+       tcsetattr(fd, TCSANOW, old);
 }
 
 static int console_loop(int fd, struct xs_handle *xs, char *pty_path)
@@ -152,7 +193,8 @@ static int console_loop(int fd, struct xs_handle *xs, char *pty_path)
 
                if (FD_ISSET(xs_fileno(xs), &fds)) {
                        int newfd = get_pty_fd(xs, pty_path, 0);
-                       close(fd);
+                       if (fd != -1)
+                               close(fd);
                         if (newfd == -1) 
                                /* Console PTY has become invalid */
                                return 0;
index 78aea83ed7d596f2c07e3c4a2ab772a6c44b50ed..de712affe3ee8e990a7c04abbb48c6908672154c 100644 (file)
@@ -402,9 +402,7 @@ static int domain_create_tty(struct domain *dom)
        assert(dom->slave_fd == -1);
        assert(dom->master_fd == -1);
 
-       cfmakeraw(&term);
-
-       if (openpty(&dom->master_fd, &dom->slave_fd, NULL, &term, NULL) < 0) {
+       if (openpty(&dom->master_fd, &dom->slave_fd, NULL, NULL, NULL) < 0) {
                err = errno;
                dolog(LOG_ERR, "Failed to create tty for domain-%d "
                      "(errno = %i, %s)",
@@ -412,6 +410,22 @@ static int domain_create_tty(struct domain *dom)
                return 0;
        }
 
+       if (tcgetattr(dom->slave_fd, &term) < 0) {
+               err = errno;
+               dolog(LOG_ERR, "Failed to get tty attributes for domain-%d "
+                       "(errno = %i, %s)",
+                       dom->domid, err, strerror(err));
+               goto out;
+       }
+       cfmakeraw(&term);
+       if (tcsetattr(dom->slave_fd, TCSANOW, &term) < 0) {
+               err = errno;
+               dolog(LOG_ERR, "Failed to set tty attributes for domain-%d "
+                       "(errno = %i, %s)",
+                       dom->domid, err, strerror(err));
+               goto out;
+       }
+
        if ((slave = ptsname(dom->master_fd)) == NULL) {
                err = errno;
                dolog(LOG_ERR, "Failed to get slave name for domain-%d "
index c1529d0ac2b1c6fb7935611ff88a2b1e74f11def..60faa4bb2a2ca45430158656299228713bc7b92e 100644 (file)
@@ -86,7 +86,9 @@ int main(int argc, char **argv)
                        version(argv[0]);
                        exit(0);
                case 'v':
+#ifndef __sun__
                        syslog_option |= LOG_PERROR;
+#endif
                        syslog_mask = LOG_DEBUG;
                        break;
                case 'i':
index e4702179341c9afbd7df12589263d62ae26417af..1ceb866df3de6f6a1087f162b8a7f7d695eed7e5 100644 (file)
@@ -83,7 +83,7 @@ READLINE_DEP = $$(READLINE_DIR)
 # -I. for config files.
 # -I${srcdir} for our headers.
 # -I$(srcdir)/../regformats for regdef.h.
-INCLUDE_CFLAGS = -I. -I${srcdir} -I$(srcdir)/../regformats -I$(INCLUDE_DIR)  -I../../../../../libxc/
+INCLUDE_CFLAGS = -I. -I${srcdir} -I$(srcdir)/../regformats -I$(INCLUDE_DIR)  -I../../../../../libxc/ -I../../../../../include/
 
 # M{H,T}_CFLAGS, if defined, has host- and target-dependent CFLAGS
 # from the config/ directory.
index 39310394f5600c6d0cf2a6edfc6d6d603107b56b..6c8349b154e7a08dfba725bfdfe26ca40cead302 100644 (file)
@@ -24,41 +24,6 @@ XEN_CONFIGS += xmexample.vti
 XEN_CONFIGS += xend-pci-quirks.sxp
 XEN_CONFIGS += xend-pci-permissive.sxp
 
-# Xen script dir and scripts to go there.
-XEN_SCRIPT_DIR = /etc/xen/scripts
-XEN_SCRIPTS = network-bridge vif-bridge
-XEN_SCRIPTS += network-route vif-route
-XEN_SCRIPTS += network-nat vif-nat
-XEN_SCRIPTS += block
-XEN_SCRIPTS += block-enbd block-nbd
-XEN_SCRIPTS += blktap
-XEN_SCRIPTS += vtpm vtpm-delete
-XEN_SCRIPTS += xen-hotplug-cleanup
-XEN_SCRIPTS += external-device-migrate
-XEN_SCRIPTS += vscsi
-XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh
-XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh vif-common.sh
-XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh vtpm-hotplug-common.sh
-XEN_SCRIPT_DATA += vtpm-migration.sh vtpm-impl
-
-XEN_HOTPLUG_DIR = /etc/hotplug
-XEN_HOTPLUG_SCRIPTS = xen-backend.agent
-
-UDEV_RULES_DIR = /etc/udev
-UDEV_RULES = xen-backend.rules
-
-DI = $(if $(DISTDIR),$(shell readlink -f $(DISTDIR)),)
-DE = $(if $(DESTDIR),$(shell readlink -f $(DESTDIR)),)
-ifeq ($(findstring $(DI),$(DE)),$(DI))
-HOTPLUGS=install-hotplug install-udev
-else
-ifeq ($(shell [ -x /usr/bin/udevinfo ] && [ `/usr/bin/udevinfo -V | sed -e 's/^[^0-9]* \([0-9]\{1,\}\)[^0-9]\{0,\}/\1/'` -ge 059 ] && echo 1),1)
-HOTPLUGS=install-udev
-else
-HOTPLUGS=install-hotplug
-endif
-endif
-
 .PHONY: all
 all:
 
@@ -66,7 +31,7 @@ all:
 build:
 
 .PHONY: install
-install: all install-readmes install-initd install-configs install-scripts $(HOTPLUGS)
+install: all install-readmes install-configs $(HOTPLUGS)
 
 .PHONY: install-readmes
 install-readmes:
@@ -77,14 +42,6 @@ install-readmes:
            $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_CONFIG_DIR); \
        done
 
-.PHONY: install-initd
-install-initd:
-       [ -d $(DESTDIR)/etc/init.d ] || $(INSTALL_DIR) $(DESTDIR)/etc/init.d
-       [ -d $(DESTDIR)/etc/sysconfig ] || $(INSTALL_DIR) $(DESTDIR)/etc/sysconfig
-       $(INSTALL_PROG) $(XEND_INITD) $(DESTDIR)/etc/init.d
-       $(INSTALL_PROG) $(XENDOMAINS_INITD) $(DESTDIR)/etc/init.d
-       $(INSTALL_PROG) $(XENDOMAINS_SYSCONFIG) $(DESTDIR)/etc/sysconfig/xendomains
-
 .PHONY: install-configs
 install-configs: $(XEN_CONFIGS)
        [ -d $(DESTDIR)$(XEN_CONFIG_DIR) ] || \
@@ -96,19 +53,6 @@ install-configs: $(XEN_CONFIGS)
            $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_CONFIG_DIR); \
        done
 
-.PHONY: install-scripts
-install-scripts:
-       [ -d $(DESTDIR)$(XEN_SCRIPT_DIR) ] || \
-               $(INSTALL_DIR) $(DESTDIR)$(XEN_SCRIPT_DIR)
-       set -e; for i in $(XEN_SCRIPTS); \
-           do \
-           $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
-       done
-       set -e; for i in $(XEN_SCRIPT_DATA); \
-           do \
-           $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
-       done
-
 .PHONY: install-hotplug
 install-hotplug:
        [ -d $(DESTDIR)$(XEN_HOTPLUG_DIR) ] || \
index 5465c39af31da2f09f9b1cc0577c29c1b1327ef2..7ca270661a0f5b442ac7a11f4abb56697390e500 100644 (file)
@@ -64,6 +64,7 @@
 #(xend-relocation-server no)
 (xend-relocation-server yes)
 #(xend-relocation-ssl-server no)
+#(xend-udev-event-server no)
 
 #(xend-unix-path /var/lib/xend/xend-socket)
 
index aa014b529b935aa65b90c9ccd12edb2383082063..420e2b27638487df55dc25b87b55ab510a4eaafb 100644 (file)
@@ -7,11 +7,11 @@
 #============================================================================
 
 import os, re
+
+arch_libdir = 'lib'
 arch = os.uname()[4]
-if re.search('64', arch):
+if os.uname()[0] == 'Linux' and re.search('64', arch):
     arch_libdir = 'lib64'
-else:
-    arch_libdir = 'lib'
 
 #----------------------------------------------------------------------------
 # Kernel image file.
@@ -224,6 +224,10 @@ serial='pty'
 #   Set keyboard layout, default is en-us keyboard. 
 #keymap='ja'
 
+#-----------------------------------------------------------------------------
+#   Enable/disable xen platform PCI device, default=1 (enabled)
+#xen_platform_pci=1
+
 #-----------------------------------------------------------------------------
 #   Configure guest CPUID responses:
 #
@@ -264,8 +268,8 @@ serial='pty'
 # Look like a generic 686 :
 # cpuid = [ '0:eax=0x3,ebx=0x0,ecx=0x0,edx=0x0',
 #           '1:eax=0x06b1,
-#              ecx=xxxxxxxxxx0000xx00xxx0000000xx0,
-#              edx=xx00000xxxxxxx0xxxxxxxxx0xxxxxx',
+#              ecx=xxxxxxxxxxx0000xx00xxx0000000xx0,
+#              edx=xxx00000xxxxxxx0xxxxxxxxx0xxxxxx',
 #           '4:eax=0x3,ebx=0x0,ecx=0x0,edx=0x0',
 #  '0x80000000:eax=0x3,ebx=0x0,ecx=0x0,edx=0x0']
 #  with the highest leaf
@@ -288,6 +292,48 @@ serial='pty'
 #  'x' -> we don't care (do not check)
 #  's' -> the bit must be the same as on the host that started this VM
 
+#-----------------------------------------------------------------------------
+#   Configure passthrough PCI{,-X,e} devices:
+#
+#   pci=[ '[SSSS:]BB:DD.F[,option1[,option2[...]]]', ... ]
+#
+#   [SSSS]:BB:DD.F  "bus segment:bus:device.function"(1) of the device to
+#                   be assigned, bus segment is optional. All fields are
+#                   in hexadecimal and no field should be longer than that
+#                   as shown in the pattern. Successful assignment may need
+#                   certain hardware support and additional configurations
+#                   (e.g. VT-d, see docs/misc/vtd.txt for more details).
+#
+#       (1) bus segment is sometimes also referred to as the PCI "domain",
+#           not to be confused with Xen domain.
+#
+#
+#   optionN         per-device options in "key=val" format. Current
+#                   available options are:
+#                   - msitranslate=0|1
+#                      per-device overriden of pci_msitranslate, see below
+#                   - power_mgmt=0|1
+#                      per-device overriden of pci_power_mgmt, see below
+#
+#pci=[ '07:00.0', '07:00.1' ]
+
+#   MSI-INTx translation for MSI capable devices:
+#
+#   If it's set, Xen will enable MSI for the device that supports it even
+# if the guest don't use MSI. In the case, an IO-APIC type interrupt will
+# be injected to the guest every time a corresponding MSI message is
+# received.
+#   If the guest enables MSI or MSI-X, the translation is automatically
+# turned off.
+# 
+#pci_msitranslate=1
+
+#   PCI Power Management:
+#
+#   If it's set, the guest OS will be able to program D0-D3hot states of the
+# PCI device for the purpose of low power consumption.
+# 
+#pci_power_mgmt=0
 
 #-----------------------------------------------------------------------------
 #   Configure PVSCSI devices:
index de1619ba322e8e05db2bb0bc2792528811138dee..9e7d207b988cec077ac9e9f2260b74e90dfb0ce2 100644 (file)
@@ -11,4 +11,4 @@ vif = [ '', 'type=ioemu, bridge=xenbr0' ]
 disk = [ 'file:/var/images/min-el3-i386.img,hda,w', ',hdc:cdrom,r' ]
 
 # Actual output via PVFB
-vfb = [ 'type=sdl' ]
+vfb = [ 'sdl=1' ]
index 2d647c936573c11fd57044f003e232e820dd7e92..1b687151b60b82c8959c0c1a3783f29656d6545c 100644 (file)
@@ -77,29 +77,29 @@ disk = [ 'phy:hda1,hda1,w' ]
 #
 # To create one using the SDL backend and sensible defaults:
 #
-# vfb = [ 'type=sdl' ]
+# vfb = [ 'sdl=1' ]
 #
 # This uses environment variables XAUTHORITY and DISPLAY.  You
 # can override that:
 #
-# vfb = [ 'type=sdl,xauthority=/home/bozo/.Xauthority,display=:1' ]
+# vfb = [ 'sdl=1,xauthority=/home/bozo/.Xauthority,display=:1' ]
 #
 # To create one using the VNC backend and sensible defaults:
 #
-# vfb = [ 'type=vnc' ]
+# vfb = [ 'vnc=1' ]
 #
 # The backend listens on 127.0.0.1 port 5900+N by default, where N is
 # the domain ID.  You can override both address and N:
 #
-# vfb = [ 'type=vnc,vnclisten=127.0.0.1,vncdisplay=1' ]
+# vfb = [ 'vnc=1,vnclisten=127.0.0.1,vncdisplay=1' ]
 #
 # Or you can bind the first unused port above 5900:
 #
-# vfb = [ 'type=vnc,vnclisten=0.0.0.0,vncunused=1' ]
+# vfb = [ 'vnc=1,vnclisten=0.0.0.0,vncunused=1' ]
 #
 # You can override the password:
 #
-# vfb = [ 'type=vnc,vncpasswd=MYPASSWD' ]
+# vfb = [ 'vnc=1,vncpasswd=MYPASSWD' ]
 #
 # Empty password disables authentication.  Defaults to the vncpasswd
 # configured in xend-config.sxp.
index 3169e52d03a65e4d4f684e32e6f202708f5a5106..d93c6531acad9d6234423c3b28465708f40d2585 100644 (file)
@@ -7,8 +7,8 @@
 #============================================================================
 
 import os, re
-arch = os.uname()[4]
 arch_libdir = 'lib'
+arch = os.uname()[4]
 
 #----------------------------------------------------------------------------
 # Kernel image file.
index fe382d43f658a941105fa13df8205872ee67a901..330aaf099935cbe249cc97e781a55dc1234019eb 100644 (file)
@@ -73,29 +73,29 @@ disk = [ 'phy:hda1,hda1,w' ]
 #
 # To create one using the SDL backend and sensible defaults:
 #
-# vfb = [ 'type=sdl' ]
+# vfb = [ 'sdl=1' ]
 #
 # This uses environment variables XAUTHORITY and DISPLAY.  You
 # can override that:
 #
-# vfb = [ 'type=sdl,xauthority=/home/bozo/.Xauthority,display=:1' ]
+# vfb = [ 'sdl=1,xauthority=/home/bozo/.Xauthority,display=:1' ]
 #
 # To create one using the VNC backend and sensible defaults:
 #
-# vfb = [ 'type=vnc' ]
+# vfb = [ 'vnc=1' ]
 #
 # The backend listens on 127.0.0.1 port 5900+N by default, where N is
 # the domain ID.  You can override both address and N:
 #
-# vfb = [ 'type=vnc,vnclisten=127.0.0.1,vncdisplay=1' ]
+# vfb = [ 'vnc=1,vnclisten=127.0.0.1,vncdisplay=1' ]
 #
 # Or you can bind the first unused port above 5900:
 #
-# vfb = [ 'type=vnc,vnclisten=0.0.0.0,vncunused=1' ]
+# vfb = [ 'vnc=1,vnclisten=0.0.0.0,vncunused=1' ]
 #
 # You can override the password:
 #
-# vfb = [ 'type=vnc,vncpasswd=MYPASSWD' ]
+# vfb = [ 'vnc=1,vncpasswd=MYPASSWD' ]
 #
 # Empty password disables authentication.  Defaults to the vncpasswd
 # configured in xend-config.sxp.
index 53ee3aa98eea7d2e049f737b9bcc23f8b9aea228..4e5eb3dffb58869b3a6cd23b8576a0214b1070dd 100644 (file)
@@ -109,29 +109,29 @@ disk = [ 'phy:sda%d,sda1,w' % (7+vmid),
 #
 # To create one using the SDL backend and sensible defaults:
 #
-# vfb = [ 'type=sdl' ]
+# vfb = [ 'sdl=1' ]
 #
 # This uses environment variables XAUTHORITY and DISPLAY.  You
 # can override that:
 #
-# vfb = [ 'type=sdl,xauthority=/home/bozo/.Xauthority,display=:1' ]
+# vfb = [ 'sdl=1,xauthority=/home/bozo/.Xauthority,display=:1' ]
 #
 # To create one using the VNC backend and sensible defaults:
 #
-# vfb = [ 'type=vnc' ]
+# vfb = [ 'vnc=1' ]
 #
 # The backend listens on 127.0.0.1 port 5900+N by default, where N is
 # the domain ID.  You can override both address and N:
 #
-# vfb = [ 'type=vnc,vnclisten=127.0.0.1,vncdisplay=%d' % vmid ]
+# vfb = [ 'vnc=1,vnclisten=127.0.0.1,vncdisplay=%d' % vmid ]
 #
 # Or you can bind the first unused port above 5900:
 #
-# vfb = [ 'type=vnc,vnclisten=0.0.0.0,vncunused=1' ]
+# vfb = [ 'vnc=1,vnclisten=0.0.0.0,vncunused=1' ]
 #
 # You can override the password:
 #
-# vfb = [ 'type=vnc,vncpasswd=MYPASSWD' ]
+# vfb = [ 'vnc=1,vncpasswd=MYPASSWD' ]
 #
 # Empty password disables authentication.  Defaults to the vncpasswd
 # configured in xend-config.sxp.
index 99281904fd0ab8a15bbd51e1e401d4bf43b306c1..dc22ce1db3bf4ec28c28901a063920a9e46353b3 100644 (file)
@@ -94,29 +94,29 @@ disk = [ 'phy:hda%d,hda1,w' % (vmid)]
 #
 # To create one using the SDL backend and sensible defaults:
 #
-# vfb = [ 'type=sdl' ]
+# vfb = [ 'sdl=1' ]
 #
 # This uses environment variables XAUTHORITY and DISPLAY.  You
 # can override that:
 #
-# vfb = [ 'type=sdl,xauthority=/home/bozo/.Xauthority,display=:1' ]
+# vfb = [ 'sdl=1,xauthority=/home/bozo/.Xauthority,display=:1' ]
 #
 # To create one using the VNC backend and sensible defaults:
 #
-# vfb = [ 'type=vnc' ]
+# vfb = [ 'vnc=1' ]
 #
 # The backend listens on 127.0.0.1 port 5900+N by default, where N is
 # the domain ID.  You can override both address and N:
 #
-# vfb = [ 'type=vnc,vnclisten=127.0.0.1,vncdisplay=%d' % vmid ]
+# vfb = [ 'vnc=1,vnclisten=127.0.0.1,vncdisplay=%d' % vmid ]
 #
 # Or you can bind the first unused port above 5900:
 #
-# vfb = [ 'type=vnc,vnclisten=0.0.0.0,vncunused=1' ]
+# vfb = [ 'vnc=1,vnclisten=0.0.0.0,vncunused=1' ]
 #
 # You can override the password:
 #
-# vfb = [ 'type=vnc,vncpasswd=MYPASSWD' ]
+# vfb = [ 'vnc=1,vncpasswd=MYPASSWD' ]
 #
 # Empty password disables authentication.  Defaults to the vncpasswd
 # configured in xend-config.sxp.
index a9bc54da5c23848e3a8379db34ff91d2a9fd86a6..2fb60359566ab1ce4ebbea9c6e7f3b2b43b7a5e7 100644 (file)
@@ -2,9 +2,8 @@ XEN_ROOT = ../..
 include $(XEN_ROOT)/tools/Rules.mk
 
 # hvmloader is a 32-bit protected mode binary.
-# It belongs in /usr/lib, not /usr/lib64.
 TARGET      := hvmloader/hvmloader
-INST_DIR := $(DESTDIR)/usr/lib/xen/boot
+INST_DIR := $(DESTDIR)$(LIBDIR_x86_32)/xen/boot
 
 SUBDIRS :=
 SUBDIRS += rombios
@@ -15,13 +14,13 @@ SUBDIRS += hvmloader
 .PHONY: all
 all:
        @set -e; if [ $$((`( bcc -v 2>&1 | grep version || echo 0.0.0 ) | cut -d' ' -f 3 | awk -F. '{ printf "0x%02x%02x%02x", $$1, $$2, $$3}'`)) -lt $$((0x00100e)) ] ; then \
-       echo "***********************************************************"; \
-       echo "Require dev86 package version >= 0.16.14 to build firmware!"; \
-       echo "(visit http://www.cix.co.uk/~mayday for more information)"; \
-       echo "***********************************************************"; \
-       else \
-       $(MAKE) subdirs-$@; \
+       echo "==========================================================================="; \
+       echo "Require dev86 rpm or bin86 & bcc debs version >= 0.16.14 to build firmware!"; \
+       echo "(visit http://www.debath.co.uk/dev86/ for more information)"; \
+       echo "==========================================================================="; \
+       false ; \
        fi
+       $(MAKE) subdirs-$@; \
 
 
 .PHONY: install
index 6cf0dfa2ac01c6979f062da0d6d4a04fd3a63786..901bbf2434cb6fe3d216773c867152958531b854 100644 (file)
@@ -2,7 +2,7 @@
 override XEN_TARGET_ARCH = x86_32
 
 # User-supplied CFLAGS are not useful here.
-CFLAGS :=
+CFLAGS =
 
 include $(XEN_ROOT)/tools/Rules.mk
 
@@ -13,9 +13,9 @@ endif
 CFLAGS += -Werror
 
 # Disable PIE/SSP if GCC supports them. They can break us.
-CFLAGS += $(call cc-option,$(CC),-nopie,)
-CFLAGS += $(call cc-option,$(CC),-fno-stack-protector,)
-CFLAGS += $(call cc-option,$(CC),-fno-stack-protector-all,)
+$(call cc-option-add,CFLAGS,CC,-nopie)
+$(call cc-option-add,CFLAGS,CC,-fno-stack-protector)
+$(call cc-option-add,CFLAGS,CC,-fno-stack-protector-all)
 
 # Extra CFLAGS suitable for an embedded type of environment.
 CFLAGS += -fno-builtin -msoft-float
index b96f9650e3faf0f61f47ea58ab2cfe44d69b1925..fa62ccb3eee8514c3cbf6a1d72eed3398d83f9fe 100644 (file)
 
 #include "../rombios/32bit/32bitbios_flat.h"
 
-static void relocate_32bitbios(char *elfarray, uint32_t elfarraysize)
+static uint32_t relocate_32bitbios(char *elfarray, uint32_t elfarraysize)
 {
     Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfarray;
     Elf32_Shdr *shdr = (Elf32_Shdr *)&elfarray[ehdr->e_shoff];
-    char *secstrings = &elfarray[shdr[ehdr->e_shstrndx].sh_offset];
-    char *jump_table;
     uint32_t reloc_off, reloc_size;
     char *highbiosarea;
-    int i, jump_sec_idx = 0;
+    int i;
 
     /*
      * Step 1. General elf cleanup, and compute total relocation size.
@@ -51,13 +49,6 @@ static void relocate_32bitbios(char *elfarray, uint32_t elfarraysize)
         /* By default all section data points into elf image data array. */
         shdr[i].sh_addr = (Elf32_Addr)&elfarray[shdr[i].sh_offset];
 
-        if ( !strcmp(".biosjumptable", secstrings + shdr[i].sh_name) )
-        {
-            /* We do not relocate the BIOS jump table to high memory. */
-            shdr[i].sh_flags &= ~SHF_ALLOC;
-            jump_sec_idx = i;
-        }
-
         /* Fix up a corner case of address alignment. */
         if ( shdr[i].sh_addralign == 0 )
             shdr[i].sh_addralign = 1;
@@ -76,7 +67,7 @@ static void relocate_32bitbios(char *elfarray, uint32_t elfarraysize)
      */
     reloc_size = reloc_off;
     printf("%d bytes of ROMBIOS high-memory extensions:\n", reloc_size);
-    highbiosarea = (char *)(long)e820_malloc(reloc_size, 0);
+    highbiosarea = mem_alloc(reloc_size, 0);
     BUG_ON(highbiosarea == NULL);
     printf("  Relocating to 0x%x-0x%x ... ",
            (uint32_t)&highbiosarea[0],
@@ -148,21 +139,12 @@ static void relocate_32bitbios(char *elfarray, uint32_t elfarraysize)
         }
     }
 
-    /* Step 5. Find the ROMBIOS jump-table stub and copy in the real table. */
-    for ( jump_table = (char *)ROMBIOS_BEGIN;
-          jump_table != (char *)ROMBIOS_END;
-          jump_table++ )
-        if ( !strncmp(jump_table, "___JMPT", 7) )
-            break;
-    BUG_ON(jump_table == NULL);
-    BUG_ON(jump_sec_idx == 0);
-    memcpy(jump_table, (char *)shdr[jump_sec_idx].sh_addr,
-           shdr[jump_sec_idx].sh_size);
-
     printf("done\n");
+
+    return (uint32_t)highbiosarea;
 }
 
-void highbios_setup(void)
+uint32_t highbios_setup(void)
 {
-    relocate_32bitbios((char *)highbios_array, sizeof(highbios_array));
+    return relocate_32bitbios((char *)highbios_array, sizeof(highbios_array));
 }
index 79f641d9a22ee7fd21542289d9e23fb36a963bee..6d89269fe2de77f571254ae858726f2750be9b19 100644 (file)
@@ -58,4 +58,6 @@ roms.h: ../rombios/BIOS-bochs-latest ../vgabios/VGABIOS-lgpl-latest.bin \
 .PHONY: clean
 clean: subdirs-clean
        rm -f roms.h acpi.h
-       rm -f hvmloader hvmloader.tmp *.o
+       rm -f hvmloader hvmloader.tmp *.o $(DEPS)
+
+-include $(DEPS)
index 47a7bea66cd1fcbb98e9b2acf7245182056f2fad..c282d018d80bab86468e7c6a09f667bead9e1c64 100644 (file)
@@ -22,9 +22,6 @@ C_SRC = build.c dsdt.c static_tables.c
 H_SRC = $(wildcard *.h)
 OBJS  = $(patsubst %.c,%.o,$(C_SRC))
 
-IASL_VER = acpica-unix-20080729
-IASL_URL = http://acpica.org/download/$(IASL_VER).tar.gz
-
 CFLAGS += -I. -I.. $(CFLAGS_include)
 
 vpath iasl $(PATH)
@@ -45,15 +42,11 @@ dsdt.c: dsdt.asl
 
 iasl:
        @echo
-       @echo "ACPI ASL compiler(iasl) is needed"
-       @echo "Download Intel ACPI CA"
-       @echo "If wget failed, please download and compile manually from"
+       @echo "ACPI ASL compiler (iasl) is needed"
+       @echo "Download and install Intel ACPI CA from"
        @echo "http://acpica.org/downloads/"
        @echo 
-       wget $(IASL_URL)
-       tar xzf $(IASL_VER).tar.gz
-       make -C $(IASL_VER)/compiler
-       $(INSTALL_PROG) $(IASL_VER)/compiler/iasl $(DESTDIR)$(BINDIR)/iasl
+       @exit 1
 
 acpi.a: $(OBJS)
        $(AR) rc $@ $(OBJS)
@@ -62,6 +55,8 @@ acpi.a: $(OBJS)
        $(CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $<
 
 clean:
-       rm -rf *.a *.o $(IASL_VER) $(IASL_VER).tar.gz
+       rm -rf *.a *.o $(IASL_VER) $(IASL_VER).tar.gz $(DEPS)
 
 install: all
+
+-include $(DEPS)
index b779c763932a61b39dd09c54de598a532f4d437d..82510fc25db0d81df8aaa6589e36572e33a4cdac 100644 (file)
@@ -48,50 +48,11 @@ static void set_checksum(
     p[checksum_offset] = -sum;
 }
 
-static int uart_exists(uint16_t uart_base)
-{
-    uint16_t ier = uart_base + 1;
-    uint8_t a, b, c;
-
-    a = inb(ier);
-    outb(ier, 0);
-    b = inb(ier);
-    outb(ier, 0xf);
-    c = inb(ier);
-    outb(ier, a);
-
-    return ((b == 0) && (c == 0xf));
-}
-
-static int hpet_exists(unsigned long hpet_base)
-{
-    uint32_t hpet_id = *(uint32_t *)hpet_base;
-    return ((hpet_id >> 16) == 0x8086);
-}
-
 static uint8_t battery_port_exists(void)
 {
     return (inb(0x88) == 0x1F);
 }
 
-static int construct_bios_info_table(uint8_t *buf)
-{
-    struct bios_info *bios_info = (struct bios_info *)buf;
-
-    memset(bios_info, 0, sizeof(*bios_info));
-
-    bios_info->com1_present = uart_exists(0x3f8);
-    bios_info->com2_present = uart_exists(0x2f8);
-
-    bios_info->hpet_present = hpet_exists(ACPI_HPET_ADDRESS);
-
-    bios_info->pci_min = PCI_MEMBASE;
-    bios_info->pci_len = PCI_MEMSIZE;
-    bios_info->xen_pfiob = 0xdead;
-
-    return align16(sizeof(*bios_info));
-}
-
 static int construct_madt(struct acpi_20_madt *madt)
 {
     struct acpi_20_madt_intsrcovr *intsrcovr;
@@ -150,7 +111,7 @@ static int construct_madt(struct acpi_20_madt *madt)
     offset += sizeof(*io_apic);
 
     lapic = (struct acpi_20_madt_lapic *)(io_apic + 1);
-    for ( i = 0; i < get_vcpu_nr(); i++ )
+    for ( i = 0; i < hvm_info->nr_vcpus; i++ )
     {
         memset(lapic, 0, sizeof(*lapic));
         lapic->type    = ACPI_PROCESSOR_LOCAL_APIC;
@@ -199,9 +160,10 @@ static int construct_secondary_tables(uint8_t *buf, unsigned long *table_ptrs)
     struct acpi_20_tcpa *tcpa;
     static const uint16_t tis_signature[] = {0x0001, 0x0001, 0x0001};
     uint16_t *tis_hdr;
+    void *lasa;
 
     /* MADT. */
-    if ( (get_vcpu_nr() > 1) || get_apic_mode() )
+    if ( (hvm_info->nr_vcpus > 1) || hvm_info->apic_mode )
     {
         madt = (struct acpi_20_madt *)&buf[offset];
         offset += construct_madt(madt);
@@ -246,11 +208,11 @@ static int construct_secondary_tables(uint8_t *buf, unsigned long *table_ptrs)
         tcpa->header.oem_revision = ACPI_OEM_REVISION;
         tcpa->header.creator_id   = ACPI_CREATOR_ID;
         tcpa->header.creator_revision = ACPI_CREATOR_REVISION;
-        tcpa->lasa = e820_malloc(ACPI_2_0_TCPA_LAML_SIZE, 0);
-        if ( tcpa->lasa )
+        if ( (lasa = mem_alloc(ACPI_2_0_TCPA_LAML_SIZE, 0)) != NULL )
         {
+            tcpa->lasa = virt_to_phys(lasa);
             tcpa->laml = ACPI_2_0_TCPA_LAML_SIZE;
-            memset((char *)(unsigned long)tcpa->lasa, 0, tcpa->laml);
+            memset(lasa, 0, tcpa->laml);
             set_checksum(tcpa,
                          offsetof(struct acpi_header, checksum),
                          tcpa->header.length);
@@ -348,9 +310,7 @@ static void __acpi_build_tables(uint8_t *buf, int *low_sz, int *high_sz)
     buf = (uint8_t *)ACPI_PHYSICAL_ADDRESS;
     offset = 0;
 
-    offset += construct_bios_info_table(&buf[offset]);
     rsdp = (struct acpi_20_rsdp *)&buf[offset];
-
     memcpy(rsdp, &Rsdp, sizeof(struct acpi_20_rsdp));
     offset += align16(sizeof(struct acpi_20_rsdp));
     rsdp->rsdt_address = (unsigned long)rsdt;
@@ -376,7 +336,7 @@ void acpi_build_tables(void)
     memset(buf, 0, high_sz);
 
     /* Allocate data area and set up ACPI tables there. */
-    buf = (uint8_t *)e820_malloc(high_sz, 0);
+    buf = mem_alloc(high_sz, 0);
     __acpi_build_tables(buf, &low_sz, &high_sz);
 
     printf(" - Lo data: %08lx-%08lx\n"
index 7ded1c5f8bfd4cd5aba9a26b03acd1f9486d4cd7..a21574e0bb47abd103c4aaf7d1e2701d813882c9 100644 (file)
@@ -86,7 +86,7 @@ DefinitionBlock ("DSDT.aml", "DSDT", 2, "Xen", "HVM", 0)
 
     Scope (\_SB)
     {
-       /* ACPI_PHYSICAL_ADDRESS == 0xEA000 */
+       /* BIOS_INFO_PHYSICAL_ADDRESS == 0xEA000 */
        OperationRegion(BIOS, SystemMemory, 0xEA000, 16)
        Field(BIOS, ByteAcc, NoLock, Preserve) {
            UAR1, 1,
@@ -122,6 +122,20 @@ DefinitionBlock ("DSDT.aml", "DSDT", 2, "Xen", "HVM", 0)
            Name (_ADR, 0x00)
            Name (_BBN, 0x00)
 
+           /*
+            * Reserve the IO port ranges [0x10c0, 0x10c2] and [0xb044, 0xb047].
+            * Or else, for a hotplugged-in device, the port IO BAR assigned
+            * by guest OS may conflict with the ranges here.
+            */
+           Device(HP0)
+           {
+               Name(_HID, EISAID("PNP0C02"))
+               Name(_CRS, ResourceTemplate() {
+                   IO (Decode16, 0x10c0, 0x10c0, 0x00, 0x03)
+                   IO (Decode16, 0xb044, 0xb044, 0x00, 0x04)
+               })
+           }
+
            Method (_CRS, 0, NotSerialized)
            {
                Name (PRT0, ResourceTemplate ()
@@ -456,6 +470,102 @@ DefinitionBlock ("DSDT.aml", "DSDT", 2, "Xen", "HVM", 0)
                 Package(){0x000fffff, 1, \_SB.PCI0.LNKA, 0},
                 Package(){0x000fffff, 2, \_SB.PCI0.LNKB, 0},
                 Package(){0x000fffff, 3, \_SB.PCI0.LNKC, 0},
+
+                /* Device 16, INTA - INTD */
+                Package(){0x0010ffff, 0, \_SB.PCI0.LNKA, 0},
+                Package(){0x0010ffff, 1, \_SB.PCI0.LNKB, 0},
+                Package(){0x0010ffff, 2, \_SB.PCI0.LNKC, 0},
+                Package(){0x0010ffff, 3, \_SB.PCI0.LNKD, 0},
+
+                /* Device 17, INTA - INTD */
+                Package(){0x0011ffff, 0, \_SB.PCI0.LNKB, 0},
+                Package(){0x0011ffff, 1, \_SB.PCI0.LNKC, 0},
+                Package(){0x0011ffff, 2, \_SB.PCI0.LNKD, 0},
+                Package(){0x0011ffff, 3, \_SB.PCI0.LNKA, 0},
+
+                /* Device 18, INTA - INTD */
+                Package(){0x0012ffff, 0, \_SB.PCI0.LNKC, 0},
+                Package(){0x0012ffff, 1, \_SB.PCI0.LNKD, 0},
+                Package(){0x0012ffff, 2, \_SB.PCI0.LNKA, 0},
+                Package(){0x0012ffff, 3, \_SB.PCI0.LNKB, 0},
+
+                /* Device 19, INTA - INTD */
+                Package(){0x0013ffff, 0, \_SB.PCI0.LNKD, 0},
+                Package(){0x0013ffff, 1, \_SB.PCI0.LNKA, 0},
+                Package(){0x0013ffff, 2, \_SB.PCI0.LNKB, 0},
+                Package(){0x0013ffff, 3, \_SB.PCI0.LNKC, 0},
+
+                /* Device 20, INTA - INTD */
+                Package(){0x0014ffff, 0, \_SB.PCI0.LNKA, 0},
+                Package(){0x0014ffff, 1, \_SB.PCI0.LNKB, 0},
+                Package(){0x0014ffff, 2, \_SB.PCI0.LNKC, 0},
+                Package(){0x0014ffff, 3, \_SB.PCI0.LNKD, 0},
+
+                /* Device 21, INTA - INTD */
+                Package(){0x0015ffff, 0, \_SB.PCI0.LNKB, 0},
+                Package(){0x0015ffff, 1, \_SB.PCI0.LNKC, 0},
+                Package(){0x0015ffff, 2, \_SB.PCI0.LNKD, 0},
+                Package(){0x0015ffff, 3, \_SB.PCI0.LNKA, 0},
+
+                /* Device 22, INTA - INTD */
+                Package(){0x0016ffff, 0, \_SB.PCI0.LNKC, 0},
+                Package(){0x0016ffff, 1, \_SB.PCI0.LNKD, 0},
+                Package(){0x0016ffff, 2, \_SB.PCI0.LNKA, 0},
+                Package(){0x0016ffff, 3, \_SB.PCI0.LNKB, 0},
+
+                /* Device 23, INTA - INTD */
+                Package(){0x0017ffff, 0, \_SB.PCI0.LNKD, 0},
+                Package(){0x0017ffff, 1, \_SB.PCI0.LNKA, 0},
+                Package(){0x0017ffff, 2, \_SB.PCI0.LNKB, 0},
+                Package(){0x0017ffff, 3, \_SB.PCI0.LNKC, 0},
+
+                /* Device 24, INTA - INTD */
+                Package(){0x0018ffff, 0, \_SB.PCI0.LNKA, 0},
+                Package(){0x0018ffff, 1, \_SB.PCI0.LNKB, 0},
+                Package(){0x0018ffff, 2, \_SB.PCI0.LNKC, 0},
+                Package(){0x0018ffff, 3, \_SB.PCI0.LNKD, 0},
+
+                /* Device 25, INTA - INTD */
+                Package(){0x0019ffff, 0, \_SB.PCI0.LNKB, 0},
+                Package(){0x0019ffff, 1, \_SB.PCI0.LNKC, 0},
+                Package(){0x0019ffff, 2, \_SB.PCI0.LNKD, 0},
+                Package(){0x0019ffff, 3, \_SB.PCI0.LNKA, 0},
+
+                /* Device 26, INTA - INTD */
+                Package(){0x001affff, 0, \_SB.PCI0.LNKC, 0},
+                Package(){0x001affff, 1, \_SB.PCI0.LNKD, 0},
+                Package(){0x001affff, 2, \_SB.PCI0.LNKA, 0},
+                Package(){0x001affff, 3, \_SB.PCI0.LNKB, 0},
+
+                /* Device 27, INTA - INTD */
+                Package(){0x001bffff, 0, \_SB.PCI0.LNKD, 0},
+                Package(){0x001bffff, 1, \_SB.PCI0.LNKA, 0},
+                Package(){0x001bffff, 2, \_SB.PCI0.LNKB, 0},
+                Package(){0x001bffff, 3, \_SB.PCI0.LNKC, 0},
+
+                /* Device 28, INTA - INTD */
+                Package(){0x001cffff, 0, \_SB.PCI0.LNKA, 0},
+                Package(){0x001cffff, 1, \_SB.PCI0.LNKB, 0},
+                Package(){0x001cffff, 2, \_SB.PCI0.LNKC, 0},
+                Package(){0x001cffff, 3, \_SB.PCI0.LNKD, 0},
+
+                /* Device 29, INTA - INTD */
+                Package(){0x001dffff, 0, \_SB.PCI0.LNKB, 0},
+                Package(){0x001dffff, 1, \_SB.PCI0.LNKC, 0},
+                Package(){0x001dffff, 2, \_SB.PCI0.LNKD, 0},
+                Package(){0x001dffff, 3, \_SB.PCI0.LNKA, 0},
+
+                /* Device 30, INTA - INTD */
+                Package(){0x001effff, 0, \_SB.PCI0.LNKC, 0},
+                Package(){0x001effff, 1, \_SB.PCI0.LNKD, 0},
+                Package(){0x001effff, 2, \_SB.PCI0.LNKA, 0},
+                Package(){0x001effff, 3, \_SB.PCI0.LNKB, 0},
+
+                /* Device 31, INTA - INTD */
+                Package(){0x001fffff, 0, \_SB.PCI0.LNKD, 0},
+                Package(){0x001fffff, 1, \_SB.PCI0.LNKA, 0},
+                Package(){0x001fffff, 2, \_SB.PCI0.LNKB, 0},
+                Package(){0x001fffff, 3, \_SB.PCI0.LNKC, 0},
             })
 
             Name(PRTA, Package() {
@@ -548,6 +658,102 @@ DefinitionBlock ("DSDT.aml", "DSDT", 2, "Xen", "HVM", 0)
                 Package(){0x000fffff, 1, 0, 46},
                 Package(){0x000fffff, 2, 0, 47},
                 Package(){0x000fffff, 3, 0, 16},
+
+                /* Device 16, INTA - INTD */
+                Package(){0x0010ffff, 0, 0, 18},
+                Package(){0x0010ffff, 1, 0, 19},
+                Package(){0x0010ffff, 2, 0, 20},
+                Package(){0x0010ffff, 3, 0, 21},
+
+                /* Device 17, INTA - INTD */
+                Package(){0x0011ffff, 0, 0, 22},
+                Package(){0x0011ffff, 1, 0, 23},
+                Package(){0x0011ffff, 2, 0, 24},
+                Package(){0x0011ffff, 3, 0, 25},
+
+                /* Device 18, INTA - INTD */
+                Package(){0x0012ffff, 0, 0, 26},
+                Package(){0x0012ffff, 1, 0, 27},
+                Package(){0x0012ffff, 2, 0, 28},
+                Package(){0x0012ffff, 3, 0, 29},
+
+                /* Device 19, INTA - INTD */
+                Package(){0x0013ffff, 0, 0, 30},
+                Package(){0x0013ffff, 1, 0, 31},
+                Package(){0x0013ffff, 2, 0, 32},
+                Package(){0x0013ffff, 3, 0, 33},
+
+                /* Device 20, INTA - INTD */
+                Package(){0x0014ffff, 0, 0, 34},
+                Package(){0x0014ffff, 1, 0, 35},
+                Package(){0x0014ffff, 2, 0, 36},
+                Package(){0x0014ffff, 3, 0, 37},
+
+                /* Device 21, INTA - INTD */
+                Package(){0x0015ffff, 0, 0, 38},
+                Package(){0x0015ffff, 1, 0, 39},
+                Package(){0x0015ffff, 2, 0, 40},
+                Package(){0x0015ffff, 3, 0, 41},
+
+                /* Device 22, INTA - INTD */
+                Package(){0x0016ffff, 0, 0, 42},
+                Package(){0x0016ffff, 1, 0, 43},
+                Package(){0x0016ffff, 2, 0, 44},
+                Package(){0x0016ffff, 3, 0, 45},
+
+                /* Device 23, INTA - INTD */
+                Package(){0x0017ffff, 0, 0, 46},
+                Package(){0x0017ffff, 1, 0, 47},
+                Package(){0x0017ffff, 2, 0, 16},
+                Package(){0x0017ffff, 3, 0, 17},
+
+                /* Device 24, INTA - INTD */
+                Package(){0x0018ffff, 0, 0, 19},
+                Package(){0x0018ffff, 1, 0, 20},
+                Package(){0x0018ffff, 2, 0, 21},
+                Package(){0x0018ffff, 3, 0, 22},
+
+                /* Device 25, INTA - INTD */
+                Package(){0x0019ffff, 0, 0, 23},
+                Package(){0x0019ffff, 1, 0, 24},
+                Package(){0x0019ffff, 2, 0, 25},
+                Package(){0x0019ffff, 3, 0, 26},
+
+                /* Device 26, INTA - INTD */
+                Package(){0x001affff, 0, 0, 27},
+                Package(){0x001affff, 1, 0, 28},
+                Package(){0x001affff, 2, 0, 29},
+                Package(){0x001affff, 3, 0, 30},
+
+                /* Device 27, INTA - INTD */
+                Package(){0x001bffff, 0, 0, 31},
+                Package(){0x001bffff, 1, 0, 32},
+                Package(){0x001bffff, 2, 0, 33},
+                Package(){0x001bffff, 3, 0, 34},
+
+                /* Device 28, INTA - INTD */
+                Package(){0x001cffff, 0, 0, 35},
+                Package(){0x001cffff, 1, 0, 36},
+                Package(){0x001cffff, 2, 0, 37},
+                Package(){0x001cffff, 3, 0, 38},
+
+                /* Device 29, INTA - INTD */
+                Package(){0x001dffff, 0, 0, 39},
+                Package(){0x001dffff, 1, 0, 40},
+                Package(){0x001dffff, 2, 0, 41},
+                Package(){0x001dffff, 3, 0, 42},
+
+                /* Device 30, INTA - INTD */
+                Package(){0x001effff, 0, 0, 43},
+                Package(){0x001effff, 1, 0, 44},
+                Package(){0x001effff, 2, 0, 45},
+                Package(){0x001effff, 3, 0, 46},
+
+                /* Device 31, INTA - INTD */
+                Package(){0x001fffff, 0, 0, 47},
+                Package(){0x001fffff, 1, 0, 16},
+                Package(){0x001fffff, 2, 0, 17},
+                Package(){0x001fffff, 3, 0, 18},
             })
             
             Device (ISA)
@@ -775,100 +981,1190 @@ DefinitionBlock ("DSDT.aml", "DSDT", 2, "Xen", "HVM", 0)
              * handle the hotplug action and status, which is beyond the ACPI
              * scope.
              */
+            Device(S00)
+            {
+                Name (_ADR, 0x00000000) /* Dev 0, Func 0 */
+                Name (_SUN, 0x00000000)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x00, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x00, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x00, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH00) /* eject php slot 0x00 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x00, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH00) /* IN status as the _STA */
+                }
+            }
 
-            Device (S1F0)
+            Device(S01)
             {
-                Name (_ADR, 0x00060000) /* Dev 6, Func 0 */
+                Name (_ADR, 0x00010000) /* Dev 1, Func 0 */
                 Name (_SUN, 0x00000001)
 
                 Method (_PS0, 0)
                 {
+                    Store (0x01, \_GPE.DPT1)
                     Store (0x80, \_GPE.DPT2)
                 }
 
                 Method (_PS3, 0)
                 {
+                    Store (0x01, \_GPE.DPT1)
                     Store (0x83, \_GPE.DPT2)
                 }
 
                 Method (_EJ0, 1)
                 {
+                    Store (0x01, \_GPE.DPT1)
                     Store (0x88, \_GPE.DPT2)
-                    Store (0x1, \_GPE.PHP1) /* eject php slot 1*/
+                    Store (0x1, \_GPE.PH01) /* eject php slot 0x01 */
                 }
 
                 Method (_STA, 0)
                 {
+                    Store (0x01, \_GPE.DPT1)
                     Store (0x89, \_GPE.DPT2)
-                    Return ( \_GPE.PHP1 )   /* IN status as the _STA */
+                    Return (\_GPE.PH01) /* IN status as the _STA */
                 }
             }
 
-            Device (S2F0)
+            Device(S02)
             {
-                Name (_ADR, 0x00070000) /* Dev 7, Func 0 */
+                Name (_ADR, 0x00020000) /* Dev 2, Func 0 */
                 Name (_SUN, 0x00000002)
 
                 Method (_PS0, 0)
                 {
-                    Store (0x90, \_GPE.DPT2)
+                    Store (0x02, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
                 }
 
                 Method (_PS3, 0)
                 {
-                    Store (0x93, \_GPE.DPT2)
+                    Store (0x02, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
                 }
 
                 Method (_EJ0, 1)
                 {
-                    Store (0x98, \_GPE.DPT2)
-                    Store (0x1, \_GPE.PHP2) /* eject php slot 1*/
+                    Store (0x02, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH02) /* eject php slot 0x02 */
                 }
 
                 Method (_STA, 0)
                 {
-                    Store (0x99, \_GPE.DPT2)
-                    Return ( \_GPE.PHP2 )   /* IN status as the _STA */
+                    Store (0x02, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH02) /* IN status as the _STA */
                 }
             }
-        }
-    }
 
-    Scope (\_GPE)
-    {
-        OperationRegion (PHP, SystemIO, 0x10c0, 0x03)
-        Field (PHP, ByteAcc, NoLock, Preserve)
-        {
-            PSTA,   8, /* hotplug controller status reg */
-            PHP1,   8, /* hotplug slot 1 control reg */
-            PHP2,   8  /* hotplug slot 2 control reg */
-        }
-        OperationRegion (DG1, SystemIO, 0xb044, 0x04)
-        Field (DG1, ByteAcc, NoLock, Preserve)
-        {
-            DPT1,   8,
-            DPT2,   8
-        }
-        Method (_L03, 0, NotSerialized)
-        {
-            /* detect slot and event(remove/add) */
-            Name (SLT, 0x0)
-            Name (EVT, 0x0)
-            Store (PSTA, Local1)
-            ShiftRight (Local1, 0x4, SLT)
-            And (Local1, 0xf, EVT)
+            Device(S03)
+            {
+                Name (_ADR, 0x00030000) /* Dev 3, Func 0 */
+                Name (_SUN, 0x00000003)
 
-            /* debug */
-            Store (SLT, DPT1)
-            Store (EVT, DPT2)
+                Method (_PS0, 0)
+                {
+                    Store (0x03, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x03, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x03, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH03) /* eject php slot 0x03 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x03, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH03) /* IN status as the _STA */
+                }
+            }
+
+            Device(S04)
+            {
+                Name (_ADR, 0x00040000) /* Dev 4, Func 0 */
+                Name (_SUN, 0x00000004)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x04, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x04, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x04, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH04) /* eject php slot 0x04 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x04, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH04) /* IN status as the _STA */
+                }
+            }
+
+            Device(S05)
+            {
+                Name (_ADR, 0x00050000) /* Dev 5, Func 0 */
+                Name (_SUN, 0x00000005)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x05, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x05, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x05, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH05) /* eject php slot 0x05 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x05, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH05) /* IN status as the _STA */
+                }
+            }
+
+            Device(S06)
+            {
+                Name (_ADR, 0x00060000) /* Dev 6, Func 0 */
+                Name (_SUN, 0x00000006)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x06, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x06, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x06, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH06) /* eject php slot 0x06 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x06, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH06) /* IN status as the _STA */
+                }
+            }
+
+            Device(S07)
+            {
+                Name (_ADR, 0x00070000) /* Dev 7, Func 0 */
+                Name (_SUN, 0x00000007)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x07, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x07, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x07, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH07) /* eject php slot 0x07 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x07, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH07) /* IN status as the _STA */
+                }
+            }
+
+            Device(S08)
+            {
+                Name (_ADR, 0x00080000) /* Dev 8, Func 0 */
+                Name (_SUN, 0x00000008)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x08, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x08, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x08, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH08) /* eject php slot 0x08 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x08, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH08) /* IN status as the _STA */
+                }
+            }
 
-            If ( LEqual(SLT, 0x1) )
+            Device(S09)
             {
-                Notify (\_SB.PCI0.S1F0, EVT)
+                Name (_ADR, 0x00090000) /* Dev 9, Func 0 */
+                Name (_SUN, 0x00000009)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x09, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x09, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x09, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH09) /* eject php slot 0x09 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x09, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH09) /* IN status as the _STA */
+                }
+            }
+
+            Device(S0A)
+            {
+                Name (_ADR, 0x000a0000) /* Dev 10, Func 0 */
+                Name (_SUN, 0x0000000a)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x0a, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x0a, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x0a, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH0A) /* eject php slot 0x0a */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x0a, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH0A) /* IN status as the _STA */
+                }
             }
-            ElseIf ( LEqual(SLT, 0x2) )
+
+            Device(S0B)
             {
-                Notify (\_SB.PCI0.S2F0, EVT)
+                Name (_ADR, 0x000b0000) /* Dev 11, Func 0 */
+                Name (_SUN, 0x0000000b)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x0b, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x0b, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x0b, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH0B) /* eject php slot 0x0b */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x0b, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH0B) /* IN status as the _STA */
+                }
+            }
+
+            Device(S0C)
+            {
+                Name (_ADR, 0x000c0000) /* Dev 12, Func 0 */
+                Name (_SUN, 0x0000000c)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x0c, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x0c, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x0c, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH0C) /* eject php slot 0x0c */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x0c, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH0C) /* IN status as the _STA */
+                }
+            }
+
+            Device(S0D)
+            {
+                Name (_ADR, 0x000d0000) /* Dev 13, Func 0 */
+                Name (_SUN, 0x0000000d)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x0d, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x0d, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x0d, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH0D) /* eject php slot 0x0d */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x0d, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH0D) /* IN status as the _STA */
+                }
+            }
+
+            Device(S0E)
+            {
+                Name (_ADR, 0x000e0000) /* Dev 14, Func 0 */
+                Name (_SUN, 0x0000000e)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x0e, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x0e, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x0e, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH0E) /* eject php slot 0x0e */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x0e, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH0E) /* IN status as the _STA */
+                }
+            }
+
+            Device(S0F)
+            {
+                Name (_ADR, 0x000f0000) /* Dev 15, Func 0 */
+                Name (_SUN, 0x0000000f)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x0f, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x0f, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x0f, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH0F) /* eject php slot 0x0f */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x0f, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH0F) /* IN status as the _STA */
+                }
+            }
+
+            Device(S10)
+            {
+                Name (_ADR, 0x00100000) /* Dev 16, Func 0 */
+                Name (_SUN, 0x00000010)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x10, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x10, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x10, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH10) /* eject php slot 0x10 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x10, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH10) /* IN status as the _STA */
+                }
+            }
+
+            Device(S11)
+            {
+                Name (_ADR, 0x00110000) /* Dev 17, Func 0 */
+                Name (_SUN, 0x00000011)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x11, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x11, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x11, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH11) /* eject php slot 0x11 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x11, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH11) /* IN status as the _STA */
+                }
+            }
+
+            Device(S12)
+            {
+                Name (_ADR, 0x00120000) /* Dev 18, Func 0 */
+                Name (_SUN, 0x00000012)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x12, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x12, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x12, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH12) /* eject php slot 0x12 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x12, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH12) /* IN status as the _STA */
+                }
+            }
+
+            Device(S13)
+            {
+                Name (_ADR, 0x00130000) /* Dev 19, Func 0 */
+                Name (_SUN, 0x00000013)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x13, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x13, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x13, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH13) /* eject php slot 0x13 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x13, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH13) /* IN status as the _STA */
+                }
+            }
+
+            Device(S14)
+            {
+                Name (_ADR, 0x00140000) /* Dev 20, Func 0 */
+                Name (_SUN, 0x00000014)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x14, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x14, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x14, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH14) /* eject php slot 0x14 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x14, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH14) /* IN status as the _STA */
+                }
+            }
+
+            Device(S15)
+            {
+                Name (_ADR, 0x00150000) /* Dev 21, Func 0 */
+                Name (_SUN, 0x00000015)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x15, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x15, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x15, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH15) /* eject php slot 0x15 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x15, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH15) /* IN status as the _STA */
+                }
+            }
+
+            Device(S16)
+            {
+                Name (_ADR, 0x00160000) /* Dev 22, Func 0 */
+                Name (_SUN, 0x00000016)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x16, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x16, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x16, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH16) /* eject php slot 0x16 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x16, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH16) /* IN status as the _STA */
+                }
+            }
+
+            Device(S17)
+            {
+                Name (_ADR, 0x00170000) /* Dev 23, Func 0 */
+                Name (_SUN, 0x00000017)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x17, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x17, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x17, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH17) /* eject php slot 0x17 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x17, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH17) /* IN status as the _STA */
+                }
+            }
+
+            Device(S18)
+            {
+                Name (_ADR, 0x00180000) /* Dev 24, Func 0 */
+                Name (_SUN, 0x00000018)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x18, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x18, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x18, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH18) /* eject php slot 0x18 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x18, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH18) /* IN status as the _STA */
+                }
+            }
+
+            Device(S19)
+            {
+                Name (_ADR, 0x00190000) /* Dev 25, Func 0 */
+                Name (_SUN, 0x00000019)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x19, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x19, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x19, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH19) /* eject php slot 0x19 */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x19, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH19) /* IN status as the _STA */
+                }
+            }
+
+            Device(S1A)
+            {
+                Name (_ADR, 0x001a0000) /* Dev 26, Func 0 */
+                Name (_SUN, 0x0000001a)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x1a, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x1a, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x1a, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH1A) /* eject php slot 0x1a */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x1a, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH1A) /* IN status as the _STA */
+                }
+            }
+
+            Device(S1B)
+            {
+                Name (_ADR, 0x001b0000) /* Dev 27, Func 0 */
+                Name (_SUN, 0x0000001b)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x1b, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x1b, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x1b, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH1B) /* eject php slot 0x1b */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x1b, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH1B) /* IN status as the _STA */
+                }
+            }
+
+            Device(S1C)
+            {
+                Name (_ADR, 0x001c0000) /* Dev 28, Func 0 */
+                Name (_SUN, 0x0000001c)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x1c, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x1c, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x1c, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH1C) /* eject php slot 0x1c */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x1c, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH1C) /* IN status as the _STA */
+                }
+            }
+
+            Device(S1D)
+            {
+                Name (_ADR, 0x001d0000) /* Dev 29, Func 0 */
+                Name (_SUN, 0x0000001d)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x1d, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x1d, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x1d, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH1D) /* eject php slot 0x1d */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x1d, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH1D) /* IN status as the _STA */
+                }
+            }
+
+            Device(S1E)
+            {
+                Name (_ADR, 0x001e0000) /* Dev 30, Func 0 */
+                Name (_SUN, 0x0000001e)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x1e, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x1e, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x1e, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH1E) /* eject php slot 0x1e */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x1e, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH1E) /* IN status as the _STA */
+                }
+            }
+
+            Device(S1F)
+            {
+                Name (_ADR, 0x001f0000) /* Dev 31, Func 0 */
+                Name (_SUN, 0x0000001f)
+
+                Method (_PS0, 0)
+                {
+                    Store (0x1f, \_GPE.DPT1)
+                    Store (0x80, \_GPE.DPT2)
+                }
+
+                Method (_PS3, 0)
+                {
+                    Store (0x1f, \_GPE.DPT1)
+                    Store (0x83, \_GPE.DPT2)
+                }
+
+                Method (_EJ0, 1)
+                {
+                    Store (0x1f, \_GPE.DPT1)
+                    Store (0x88, \_GPE.DPT2)
+                    Store (0x1, \_GPE.PH1F) /* eject php slot 0x1f */
+                }
+
+                Method (_STA, 0)
+                {
+                    Store (0x1f, \_GPE.DPT1)
+                    Store (0x89, \_GPE.DPT2)
+                    Return (\_GPE.PH1F) /* IN status as the _STA */
+                }
+            }
+        }
+    }
+
+    Scope (\_GPE)
+    {
+        OperationRegion (PHP, SystemIO, 0x10c0, 0x22)
+        Field (PHP, ByteAcc, NoLock, Preserve)
+        {
+            PSTA,  8, /* hotplug controller event reg */
+            PSTB,  8, /* hotplug controller slot  reg */
+            PH00,  8, /* hotplug slot 0x00 control reg */
+            PH01,  8, /* hotplug slot 0x01 control reg */
+            PH02,  8, /* hotplug slot 0x02 control reg */
+            PH03,  8, /* hotplug slot 0x03 control reg */
+            PH04,  8, /* hotplug slot 0x04 control reg */
+            PH05,  8, /* hotplug slot 0x05 control reg */
+            PH06,  8, /* hotplug slot 0x06 control reg */
+            PH07,  8, /* hotplug slot 0x07 control reg */
+            PH08,  8, /* hotplug slot 0x08 control reg */
+            PH09,  8, /* hotplug slot 0x09 control reg */
+            PH0A,  8, /* hotplug slot 0x0a control reg */
+            PH0B,  8, /* hotplug slot 0x0b control reg */
+            PH0C,  8, /* hotplug slot 0x0c control reg */
+            PH0D,  8, /* hotplug slot 0x0d control reg */
+            PH0E,  8, /* hotplug slot 0x0e control reg */
+            PH0F,  8, /* hotplug slot 0x0f control reg */
+            PH10,  8, /* hotplug slot 0x10 control reg */
+            PH11,  8, /* hotplug slot 0x11 control reg */
+            PH12,  8, /* hotplug slot 0x12 control reg */
+            PH13,  8, /* hotplug slot 0x13 control reg */
+            PH14,  8, /* hotplug slot 0x14 control reg */
+            PH15,  8, /* hotplug slot 0x15 control reg */
+            PH16,  8, /* hotplug slot 0x16 control reg */
+            PH17,  8, /* hotplug slot 0x17 control reg */
+            PH18,  8, /* hotplug slot 0x18 control reg */
+            PH19,  8, /* hotplug slot 0x19 control reg */
+            PH1A,  8, /* hotplug slot 0x1a control reg */
+            PH1B,  8, /* hotplug slot 0x1b control reg */
+            PH1C,  8, /* hotplug slot 0x1c control reg */
+            PH1D,  8, /* hotplug slot 0x1d control reg */
+            PH1E,  8, /* hotplug slot 0x1e control reg */
+            PH1F,  8  /* hotplug slot 0x1f control reg */
+       }
+        OperationRegion (DG1, SystemIO, 0xb044, 0x04)
+        Field (DG1, ByteAcc, NoLock, Preserve)
+        {
+            DPT1,   8,
+            DPT2,   8
+        }
+        Method (_L03, 0, Serialized)
+        {
+            /* detect slot and event(remove/add) */
+            Name (SLT, 0x0)
+            Name (EVT, 0x0)
+            Store (PSTA, Local1)
+            And (Local1, 0xf, EVT)
+            Store (PSTB, Local1)           /* XXX: Store (PSTB, SLT) ? */
+            And (Local1, 0xff, SLT)
+
+            /* debug */
+            Store (SLT, DPT1)
+            Store (EVT, DPT2)
+
+            Switch (SLT)
+            {
+                Case (0x00) {
+                    Notify (\_SB.PCI0.S00, EVT)
+                }
+                Case (0x01) {
+                    Notify (\_SB.PCI0.S01, EVT)
+                }
+                Case (0x02) {
+                    Notify (\_SB.PCI0.S02, EVT)
+                }
+                Case (0x03) {
+                    Notify (\_SB.PCI0.S03, EVT)
+                }
+                Case (0x04) {
+                    Notify (\_SB.PCI0.S04, EVT)
+                }
+                Case (0x05) {
+                    Notify (\_SB.PCI0.S05, EVT)
+                }
+                Case (0x06) {
+                    Notify (\_SB.PCI0.S06, EVT)
+                }
+                Case (0x07) {
+                    Notify (\_SB.PCI0.S07, EVT)
+                }
+                Case (0x08) {
+                    Notify (\_SB.PCI0.S08, EVT)
+                }
+                Case (0x09) {
+                    Notify (\_SB.PCI0.S09, EVT)
+                }
+                Case (0x0a) {
+                    Notify (\_SB.PCI0.S0A, EVT)
+                }
+                Case (0x0b) {
+                    Notify (\_SB.PCI0.S0B, EVT)
+                }
+                Case (0x0c) {
+                    Notify (\_SB.PCI0.S0C, EVT)
+                }
+                Case (0x0d) {
+                    Notify (\_SB.PCI0.S0D, EVT)
+                }
+                Case (0x0e) {
+                    Notify (\_SB.PCI0.S0E, EVT)
+                }
+                Case (0x0f) {
+                    Notify (\_SB.PCI0.S0F, EVT)
+                }
+                Case (0x10) {
+                    Notify (\_SB.PCI0.S10, EVT)
+                }
+                Case (0x11) {
+                    Notify (\_SB.PCI0.S11, EVT)
+                }
+                Case (0x12) {
+                    Notify (\_SB.PCI0.S12, EVT)
+                }
+                Case (0x13) {
+                    Notify (\_SB.PCI0.S13, EVT)
+                }
+                Case (0x14) {
+                    Notify (\_SB.PCI0.S14, EVT)
+                }
+                Case (0x15) {
+                    Notify (\_SB.PCI0.S15, EVT)
+                }
+                Case (0x16) {
+                    Notify (\_SB.PCI0.S16, EVT)
+                }
+                Case (0x17) {
+                    Notify (\_SB.PCI0.S17, EVT)
+                }
+                Case (0x18) {
+                    Notify (\_SB.PCI0.S18, EVT)
+                }
+                Case (0x19) {
+                    Notify (\_SB.PCI0.S19, EVT)
+                }
+                Case (0x1a) {
+                    Notify (\_SB.PCI0.S1A, EVT)
+                }
+                Case (0x1b) {
+                    Notify (\_SB.PCI0.S1B, EVT)
+                }
+                Case (0x1c) {
+                    Notify (\_SB.PCI0.S1C, EVT)
+                }
+                Case (0x1d) {
+                    Notify (\_SB.PCI0.S1D, EVT)
+                }
+                Case (0x1e) {
+                    Notify (\_SB.PCI0.S1E, EVT)
+                }
+                Case (0x1f) {
+                    Notify (\_SB.PCI0.S1F, EVT)
+                }
             }
         }
     }
index c1368282218250f08307516395d6859e777f916a..ca58b32ee3a708ba2b7d0f42da44e7871f5d0001 100644 (file)
@@ -1,22 +1,22 @@
 /*
  * 
  * Intel ACPI Component Architecture
- * ASL Optimizing Compiler version 20060707 [Feb 16 2007]
- * Copyright (C) 2000 - 2006 Intel Corporation
+ * ASL Optimizing Compiler version 20090220 [Mar  9 2009]
+ * Copyright (C) 2000 - 2009 Intel Corporation
  * Supports ACPI Specification Revision 3.0a
  * 
- * Compilation of "dsdt.asl" - Tue May 20 14:34:40 2008
+ * Compilation of "dsdt.asl" - Tue Mar 17 10:44:21 2009
  * 
  * C source code output
  *
  */
 unsigned char AmlCode[] =
 {
-    0x44,0x53,0x44,0x54,0x32,0x11,0x00,0x00,  /* 00000000    "DSDT2..." */
-    0x02,0xEC,0x58,0x65,0x6E,0x00,0x00,0x00,  /* 00000008    "..Xen..." */
+    0x44,0x53,0x44,0x54,0x02,0x32,0x00,0x00,  /* 00000000    "DSDT.2.." */
+    0x02,0xC6,0x58,0x65,0x6E,0x00,0x00,0x00,  /* 00000008    "..Xen..." */
     0x48,0x56,0x4D,0x00,0x00,0x00,0x00,0x00,  /* 00000010    "HVM....." */
     0x00,0x00,0x00,0x00,0x49,0x4E,0x54,0x4C,  /* 00000018    "....INTL" */
-    0x07,0x07,0x06,0x20,0x08,0x50,0x4D,0x42,  /* 00000020    "... .PMB" */
+    0x20,0x02,0x09,0x20,0x08,0x50,0x4D,0x42,  /* 00000020    " .. .PMB" */
     0x53,0x0B,0x00,0x0C,0x08,0x50,0x4D,0x4C,  /* 00000028    "S....PML" */
     0x4E,0x0A,0x08,0x08,0x49,0x4F,0x42,0x31,  /* 00000030    "N...IOB1" */
     0x00,0x08,0x49,0x4F,0x4C,0x31,0x00,0x08,  /* 00000038    "..IOL1.." */
@@ -56,512 +56,1562 @@ unsigned char AmlCode[] =
     0x07,0x0A,0x07,0x00,0x00,0x08,0x50,0x49,  /* 00000148    "......PI" */
     0x43,0x44,0x00,0x14,0x0C,0x5F,0x50,0x49,  /* 00000150    "CD..._PI" */
     0x43,0x01,0x70,0x68,0x50,0x49,0x43,0x44,  /* 00000158    "C.phPICD" */
-    0x10,0x42,0xF1,0x5F,0x53,0x42,0x5F,0x5B,  /* 00000160    ".B._SB_[" */
-    0x80,0x42,0x49,0x4F,0x53,0x00,0x0C,0x00,  /* 00000168    ".BIOS..." */
-    0xA0,0x0E,0x00,0x0A,0x10,0x5B,0x81,0x21,  /* 00000170    ".....[.!" */
-    0x42,0x49,0x4F,0x53,0x01,0x55,0x41,0x52,  /* 00000178    "BIOS.UAR" */
-    0x31,0x01,0x55,0x41,0x52,0x32,0x01,0x48,  /* 00000180    "1.UAR2.H" */
-    0x50,0x45,0x54,0x01,0x00,0x1D,0x50,0x4D,  /* 00000188    "PET...PM" */
-    0x49,0x4E,0x20,0x50,0x4C,0x45,0x4E,0x20,  /* 00000190    "IN PLEN " */
-    0x5B,0x82,0x49,0x04,0x4D,0x45,0x4D,0x30,  /* 00000198    "[.I.MEM0" */
-    0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,0xD0,  /* 000001A0    "._HID.A." */
-    0x0C,0x02,0x08,0x5F,0x43,0x52,0x53,0x11,  /* 000001A8    "..._CRS." */
-    0x33,0x0A,0x30,0x8A,0x2B,0x00,0x00,0x0D,  /* 000001B0    "3.0.+..." */
-    0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* 000001B8    "........" */
+    0x10,0x83,0xB7,0x02,0x5F,0x53,0x42,0x5F,  /* 00000160    "...._SB_" */
+    0x5B,0x80,0x42,0x49,0x4F,0x53,0x00,0x0C,  /* 00000168    "[.BIOS.." */
+    0x00,0xA0,0x0E,0x00,0x0A,0x10,0x5B,0x81,  /* 00000170    "......[." */
+    0x21,0x42,0x49,0x4F,0x53,0x01,0x55,0x41,  /* 00000178    "!BIOS.UA" */
+    0x52,0x31,0x01,0x55,0x41,0x52,0x32,0x01,  /* 00000180    "R1.UAR2." */
+    0x48,0x50,0x45,0x54,0x01,0x00,0x1D,0x50,  /* 00000188    "HPET...P" */
+    0x4D,0x49,0x4E,0x20,0x50,0x4C,0x45,0x4E,  /* 00000190    "MIN PLEN" */
+    0x20,0x5B,0x82,0x49,0x04,0x4D,0x45,0x4D,  /* 00000198    " [.I.MEM" */
+    0x30,0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,  /* 000001A0    "0._HID.A" */
+    0xD0,0x0C,0x02,0x08,0x5F,0x43,0x52,0x53,  /* 000001A8    "...._CRS" */
+    0x11,0x33,0x0A,0x30,0x8A,0x2B,0x00,0x00,  /* 000001B0    ".3.0.+.." */
+    0x0D,0x03,0x00,0x00,0x00,0x00,0x00,0x00,  /* 000001B8    "........" */
     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* 000001C0    "........" */
-    0x00,0xFF,0xFF,0x09,0x00,0x00,0x00,0x00,  /* 000001C8    "........" */
+    0x00,0x00,0xFF,0xFF,0x09,0x00,0x00,0x00,  /* 000001C8    "........" */
     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* 000001D0    "........" */
-    0x00,0x00,0x00,0x0A,0x00,0x00,0x00,0x00,  /* 000001D8    "........" */
-    0x00,0x79,0x00,0x5B,0x82,0x4E,0xE8,0x50,  /* 000001E0    ".y.[.N.P" */
-    0x43,0x49,0x30,0x08,0x5F,0x48,0x49,0x44,  /* 000001E8    "CI0._HID" */
-    0x0C,0x41,0xD0,0x0A,0x03,0x08,0x5F,0x55,  /* 000001F0    ".A...._U" */
-    0x49,0x44,0x00,0x08,0x5F,0x41,0x44,0x52,  /* 000001F8    "ID.._ADR" */
-    0x00,0x08,0x5F,0x42,0x42,0x4E,0x00,0x14,  /* 00000200    ".._BBN.." */
-    0x4E,0x0C,0x5F,0x43,0x52,0x53,0x00,0x08,  /* 00000208    "N._CRS.." */
-    0x50,0x52,0x54,0x30,0x11,0x42,0x07,0x0A,  /* 00000210    "PRT0.B.." */
-    0x6E,0x88,0x0D,0x00,0x02,0x0E,0x00,0x00,  /* 00000218    "n......." */
-    0x00,0x00,0x00,0xFF,0x00,0x00,0x00,0x00,  /* 00000220    "........" */
-    0x01,0x47,0x01,0xF8,0x0C,0xF8,0x0C,0x01,  /* 00000228    ".G......" */
-    0x08,0x88,0x0D,0x00,0x01,0x0C,0x03,0x00,  /* 00000230    "........" */
-    0x00,0x00,0x00,0xF7,0x0C,0x00,0x00,0xF8,  /* 00000238    "........" */
-    0x0C,0x88,0x0D,0x00,0x01,0x0C,0x03,0x00,  /* 00000240    "........" */
-    0x00,0x00,0x0D,0xFF,0xFF,0x00,0x00,0x00,  /* 00000248    "........" */
-    0xF3,0x87,0x17,0x00,0x00,0x0C,0x03,0x00,  /* 00000250    "........" */
-    0x00,0x00,0x00,0x00,0x00,0x0A,0x00,0xFF,  /* 00000258    "........" */
-    0xFF,0x0B,0x00,0x00,0x00,0x00,0x00,0x00,  /* 00000260    "........" */
-    0x00,0x02,0x00,0x87,0x17,0x00,0x00,0x0C,  /* 00000268    "........" */
-    0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* 00000270    "........" */
-    0xF0,0xFF,0xFF,0xFF,0xF4,0x00,0x00,0x00,  /* 00000278    "........" */
-    0x00,0x00,0x00,0x00,0x05,0x79,0x00,0x8A,  /* 00000280    ".....y.." */
-    0x50,0x52,0x54,0x30,0x0A,0x5C,0x4D,0x4D,  /* 00000288    "PRT0.\MM" */
-    0x49,0x4E,0x8A,0x50,0x52,0x54,0x30,0x0A,  /* 00000290    "IN.PRT0." */
-    0x60,0x4D,0x4D,0x41,0x58,0x8A,0x50,0x52,  /* 00000298    "`MMAX.PR" */
-    0x54,0x30,0x0A,0x68,0x4D,0x4C,0x45,0x4E,  /* 000002A0    "T0.hMLEN" */
-    0x70,0x50,0x4D,0x49,0x4E,0x4D,0x4D,0x49,  /* 000002A8    "pPMINMMI" */
-    0x4E,0x70,0x50,0x4C,0x45,0x4E,0x4D,0x4C,  /* 000002B0    "NpPLENML" */
-    0x45,0x4E,0x72,0x4D,0x4D,0x49,0x4E,0x4D,  /* 000002B8    "ENrMMINM" */
-    0x4C,0x45,0x4E,0x4D,0x4D,0x41,0x58,0x74,  /* 000002C0    "LENMMAXt" */
-    0x4D,0x4D,0x41,0x58,0x01,0x4D,0x4D,0x41,  /* 000002C8    "MMAX.MMA" */
-    0x58,0xA4,0x50,0x52,0x54,0x30,0x08,0x42,  /* 000002D0    "X.PRT0.B" */
-    0x55,0x46,0x41,0x11,0x09,0x0A,0x06,0x23,  /* 000002D8    "UFA....#" */
-    0x20,0x0C,0x18,0x79,0x00,0x08,0x42,0x55,  /* 000002E0    " ..y..BU" */
-    0x46,0x42,0x11,0x09,0x0A,0x06,0x23,0x00,  /* 000002E8    "FB....#." */
-    0x00,0x18,0x79,0x00,0x8B,0x42,0x55,0x46,  /* 000002F0    "..y..BUF" */
-    0x42,0x01,0x49,0x52,0x51,0x56,0x5B,0x82,  /* 000002F8    "B.IRQV[." */
-    0x48,0x08,0x4C,0x4E,0x4B,0x41,0x08,0x5F,  /* 00000300    "H.LNKA._" */
-    0x48,0x49,0x44,0x0C,0x41,0xD0,0x0C,0x0F,  /* 00000308    "HID.A..." */
-    0x08,0x5F,0x55,0x49,0x44,0x01,0x14,0x1C,  /* 00000310    "._UID..." */
-    0x5F,0x53,0x54,0x41,0x00,0x7B,0x50,0x49,  /* 00000318    "_STA.{PI" */
-    0x52,0x41,0x0A,0x80,0x60,0xA0,0x08,0x93,  /* 00000320    "RA..`..." */
-    0x60,0x0A,0x80,0xA4,0x0A,0x09,0xA1,0x04,  /* 00000328    "`......." */
-    0xA4,0x0A,0x0B,0x14,0x0B,0x5F,0x50,0x52,  /* 00000330    "....._PR" */
-    0x53,0x00,0xA4,0x42,0x55,0x46,0x41,0x14,  /* 00000338    "S..BUFA." */
-    0x11,0x5F,0x44,0x49,0x53,0x00,0x7D,0x50,  /* 00000340    "._DIS.}P" */
-    0x49,0x52,0x41,0x0A,0x80,0x50,0x49,0x52,  /* 00000348    "IRA..PIR" */
-    0x41,0x14,0x1A,0x5F,0x43,0x52,0x53,0x00,  /* 00000350    "A.._CRS." */
-    0x7B,0x50,0x49,0x52,0x41,0x0A,0x0F,0x60,  /* 00000358    "{PIRA..`" */
-    0x79,0x01,0x60,0x49,0x52,0x51,0x56,0xA4,  /* 00000360    "y.`IRQV." */
-    0x42,0x55,0x46,0x42,0x14,0x1B,0x5F,0x53,  /* 00000368    "BUFB.._S" */
-    0x52,0x53,0x01,0x8B,0x68,0x01,0x49,0x52,  /* 00000370    "RS..h.IR" */
-    0x51,0x31,0x82,0x49,0x52,0x51,0x31,0x60,  /* 00000378    "Q1.IRQ1`" */
-    0x76,0x60,0x70,0x60,0x50,0x49,0x52,0x41,  /* 00000380    "v`p`PIRA" */
-    0x5B,0x82,0x49,0x08,0x4C,0x4E,0x4B,0x42,  /* 00000388    "[.I.LNKB" */
-    0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,0xD0,  /* 00000390    "._HID.A." */
-    0x0C,0x0F,0x08,0x5F,0x55,0x49,0x44,0x0A,  /* 00000398    "..._UID." */
-    0x02,0x14,0x1C,0x5F,0x53,0x54,0x41,0x00,  /* 000003A0    "..._STA." */
-    0x7B,0x50,0x49,0x52,0x42,0x0A,0x80,0x60,  /* 000003A8    "{PIRB..`" */
-    0xA0,0x08,0x93,0x60,0x0A,0x80,0xA4,0x0A,  /* 000003B0    "...`...." */
-    0x09,0xA1,0x04,0xA4,0x0A,0x0B,0x14,0x0B,  /* 000003B8    "........" */
-    0x5F,0x50,0x52,0x53,0x00,0xA4,0x42,0x55,  /* 000003C0    "_PRS..BU" */
-    0x46,0x41,0x14,0x11,0x5F,0x44,0x49,0x53,  /* 000003C8    "FA.._DIS" */
-    0x00,0x7D,0x50,0x49,0x52,0x42,0x0A,0x80,  /* 000003D0    ".}PIRB.." */
-    0x50,0x49,0x52,0x42,0x14,0x1A,0x5F,0x43,  /* 000003D8    "PIRB.._C" */
-    0x52,0x53,0x00,0x7B,0x50,0x49,0x52,0x42,  /* 000003E0    "RS.{PIRB" */
-    0x0A,0x0F,0x60,0x79,0x01,0x60,0x49,0x52,  /* 000003E8    "..`y.`IR" */
-    0x51,0x56,0xA4,0x42,0x55,0x46,0x42,0x14,  /* 000003F0    "QV.BUFB." */
-    0x1B,0x5F,0x53,0x52,0x53,0x01,0x8B,0x68,  /* 000003F8    "._SRS..h" */
-    0x01,0x49,0x52,0x51,0x31,0x82,0x49,0x52,  /* 00000400    ".IRQ1.IR" */
-    0x51,0x31,0x60,0x76,0x60,0x70,0x60,0x50,  /* 00000408    "Q1`v`p`P" */
-    0x49,0x52,0x42,0x5B,0x82,0x49,0x08,0x4C,  /* 00000410    "IRB[.I.L" */
-    0x4E,0x4B,0x43,0x08,0x5F,0x48,0x49,0x44,  /* 00000418    "NKC._HID" */
-    0x0C,0x41,0xD0,0x0C,0x0F,0x08,0x5F,0x55,  /* 00000420    ".A...._U" */
-    0x49,0x44,0x0A,0x03,0x14,0x1C,0x5F,0x53,  /* 00000428    "ID...._S" */
-    0x54,0x41,0x00,0x7B,0x50,0x49,0x52,0x43,  /* 00000430    "TA.{PIRC" */
-    0x0A,0x80,0x60,0xA0,0x08,0x93,0x60,0x0A,  /* 00000438    "..`...`." */
-    0x80,0xA4,0x0A,0x09,0xA1,0x04,0xA4,0x0A,  /* 00000440    "........" */
-    0x0B,0x14,0x0B,0x5F,0x50,0x52,0x53,0x00,  /* 00000448    "..._PRS." */
-    0xA4,0x42,0x55,0x46,0x41,0x14,0x11,0x5F,  /* 00000450    ".BUFA.._" */
-    0x44,0x49,0x53,0x00,0x7D,0x50,0x49,0x52,  /* 00000458    "DIS.}PIR" */
-    0x43,0x0A,0x80,0x50,0x49,0x52,0x43,0x14,  /* 00000460    "C..PIRC." */
-    0x1A,0x5F,0x43,0x52,0x53,0x00,0x7B,0x50,  /* 00000468    "._CRS.{P" */
-    0x49,0x52,0x43,0x0A,0x0F,0x60,0x79,0x01,  /* 00000470    "IRC..`y." */
-    0x60,0x49,0x52,0x51,0x56,0xA4,0x42,0x55,  /* 00000478    "`IRQV.BU" */
-    0x46,0x42,0x14,0x1B,0x5F,0x53,0x52,0x53,  /* 00000480    "FB.._SRS" */
-    0x01,0x8B,0x68,0x01,0x49,0x52,0x51,0x31,  /* 00000488    "..h.IRQ1" */
-    0x82,0x49,0x52,0x51,0x31,0x60,0x76,0x60,  /* 00000490    ".IRQ1`v`" */
-    0x70,0x60,0x50,0x49,0x52,0x43,0x5B,0x82,  /* 00000498    "p`PIRC[." */
-    0x49,0x08,0x4C,0x4E,0x4B,0x44,0x08,0x5F,  /* 000004A0    "I.LNKD._" */
-    0x48,0x49,0x44,0x0C,0x41,0xD0,0x0C,0x0F,  /* 000004A8    "HID.A..." */
-    0x08,0x5F,0x55,0x49,0x44,0x0A,0x04,0x14,  /* 000004B0    "._UID..." */
-    0x1C,0x5F,0x53,0x54,0x41,0x00,0x7B,0x50,  /* 000004B8    "._STA.{P" */
-    0x49,0x52,0x44,0x0A,0x80,0x60,0xA0,0x08,  /* 000004C0    "IRD..`.." */
-    0x93,0x60,0x0A,0x80,0xA4,0x0A,0x09,0xA1,  /* 000004C8    ".`......" */
-    0x04,0xA4,0x0A,0x0B,0x14,0x0B,0x5F,0x50,  /* 000004D0    "......_P" */
-    0x52,0x53,0x00,0xA4,0x42,0x55,0x46,0x41,  /* 000004D8    "RS..BUFA" */
-    0x14,0x11,0x5F,0x44,0x49,0x53,0x00,0x7D,  /* 000004E0    ".._DIS.}" */
-    0x50,0x49,0x52,0x44,0x0A,0x80,0x50,0x49,  /* 000004E8    "PIRD..PI" */
-    0x52,0x44,0x14,0x1A,0x5F,0x43,0x52,0x53,  /* 000004F0    "RD.._CRS" */
-    0x00,0x7B,0x50,0x49,0x52,0x44,0x0A,0x0F,  /* 000004F8    ".{PIRD.." */
-    0x60,0x79,0x01,0x60,0x49,0x52,0x51,0x56,  /* 00000500    "`y.`IRQV" */
-    0xA4,0x42,0x55,0x46,0x42,0x14,0x1B,0x5F,  /* 00000508    ".BUFB.._" */
-    0x53,0x52,0x53,0x01,0x8B,0x68,0x01,0x49,  /* 00000510    "SRS..h.I" */
-    0x52,0x51,0x31,0x82,0x49,0x52,0x51,0x31,  /* 00000518    "RQ1.IRQ1" */
-    0x60,0x76,0x60,0x70,0x60,0x50,0x49,0x52,  /* 00000520    "`v`p`PIR" */
-    0x44,0x5B,0x82,0x44,0x05,0x48,0x50,0x45,  /* 00000528    "D[.D.HPE" */
-    0x54,0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,  /* 00000530    "T._HID.A" */
-    0xD0,0x01,0x03,0x08,0x5F,0x55,0x49,0x44,  /* 00000538    "...._UID" */
-    0x00,0x14,0x18,0x5F,0x53,0x54,0x41,0x00,  /* 00000540    "..._STA." */
-    0xA0,0x0C,0x93,0x5E,0x5E,0x5E,0x48,0x50,  /* 00000548    "...^^^HP" */
-    0x45,0x54,0x00,0xA4,0x00,0xA1,0x04,0xA4,  /* 00000550    "ET......" */
-    0x0A,0x0F,0x08,0x5F,0x43,0x52,0x53,0x11,  /* 00000558    "..._CRS." */
-    0x1F,0x0A,0x1C,0x87,0x17,0x00,0x00,0x0D,  /* 00000560    "........" */
-    0x01,0x00,0x00,0x00,0x00,0x00,0x00,0xD0,  /* 00000568    "........" */
-    0xFE,0xFF,0x03,0xD0,0xFE,0x00,0x00,0x00,  /* 00000570    "........" */
-    0x00,0x00,0x04,0x00,0x00,0x79,0x00,0x14,  /* 00000578    ".....y.." */
-    0x16,0x5F,0x50,0x52,0x54,0x00,0xA0,0x0A,  /* 00000580    "._PRT..." */
-    0x50,0x49,0x43,0x44,0xA4,0x50,0x52,0x54,  /* 00000588    "PICD.PRT" */
-    0x41,0xA4,0x50,0x52,0x54,0x50,0x08,0x50,  /* 00000590    "A.PRTP.P" */
-    0x52,0x54,0x50,0x12,0x49,0x36,0x3C,0x12,  /* 00000598    "RTP.I6<." */
-    0x0D,0x04,0x0C,0xFF,0xFF,0x01,0x00,0x00,  /* 000005A0    "........" */
-    0x4C,0x4E,0x4B,0x42,0x00,0x12,0x0D,0x04,  /* 000005A8    "LNKB...." */
-    0x0C,0xFF,0xFF,0x01,0x00,0x01,0x4C,0x4E,  /* 000005B0    "......LN" */
-    0x4B,0x43,0x00,0x12,0x0E,0x04,0x0C,0xFF,  /* 000005B8    "KC......" */
-    0xFF,0x01,0x00,0x0A,0x02,0x4C,0x4E,0x4B,  /* 000005C0    ".....LNK" */
-    0x44,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,  /* 000005C8    "D......." */
-    0x01,0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x41,  /* 000005D0    "....LNKA" */
-    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x02,  /* 000005D8    "........" */
-    0x00,0x00,0x4C,0x4E,0x4B,0x43,0x00,0x12,  /* 000005E0    "..LNKC.." */
-    0x0D,0x04,0x0C,0xFF,0xFF,0x02,0x00,0x01,  /* 000005E8    "........" */
-    0x4C,0x4E,0x4B,0x44,0x00,0x12,0x0E,0x04,  /* 000005F0    "LNKD...." */
-    0x0C,0xFF,0xFF,0x02,0x00,0x0A,0x02,0x4C,  /* 000005F8    ".......L" */
-    0x4E,0x4B,0x41,0x00,0x12,0x0E,0x04,0x0C,  /* 00000600    "NKA....." */
-    0xFF,0xFF,0x02,0x00,0x0A,0x03,0x4C,0x4E,  /* 00000608    "......LN" */
-    0x4B,0x42,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000610    "KB......" */
-    0xFF,0x03,0x00,0x00,0x4C,0x4E,0x4B,0x44,  /* 00000618    "....LNKD" */
-    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x03,  /* 00000620    "........" */
-    0x00,0x01,0x4C,0x4E,0x4B,0x41,0x00,0x12,  /* 00000628    "..LNKA.." */
-    0x0E,0x04,0x0C,0xFF,0xFF,0x03,0x00,0x0A,  /* 00000630    "........" */
-    0x02,0x4C,0x4E,0x4B,0x42,0x00,0x12,0x0E,  /* 00000638    ".LNKB..." */
-    0x04,0x0C,0xFF,0xFF,0x03,0x00,0x0A,0x03,  /* 00000640    "........" */
-    0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0D,0x04,  /* 00000648    "LNKC...." */
-    0x0C,0xFF,0xFF,0x04,0x00,0x00,0x4C,0x4E,  /* 00000650    "......LN" */
-    0x4B,0x41,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000658    "KA......" */
-    0xFF,0x04,0x00,0x01,0x4C,0x4E,0x4B,0x42,  /* 00000660    "....LNKB" */
-    0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x04,  /* 00000668    "........" */
-    0x00,0x0A,0x02,0x4C,0x4E,0x4B,0x43,0x00,  /* 00000670    "...LNKC." */
-    0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x04,0x00,  /* 00000678    "........" */
-    0x0A,0x03,0x4C,0x4E,0x4B,0x44,0x00,0x12,  /* 00000680    "..LNKD.." */
-    0x0D,0x04,0x0C,0xFF,0xFF,0x05,0x00,0x00,  /* 00000688    "........" */
-    0x4C,0x4E,0x4B,0x42,0x00,0x12,0x0D,0x04,  /* 00000690    "LNKB...." */
-    0x0C,0xFF,0xFF,0x05,0x00,0x01,0x4C,0x4E,  /* 00000698    "......LN" */
-    0x4B,0x43,0x00,0x12,0x0E,0x04,0x0C,0xFF,  /* 000006A0    "KC......" */
-    0xFF,0x05,0x00,0x0A,0x02,0x4C,0x4E,0x4B,  /* 000006A8    ".....LNK" */
-    0x44,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,  /* 000006B0    "D......." */
-    0x05,0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x41,  /* 000006B8    "....LNKA" */
-    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x06,  /* 000006C0    "........" */
-    0x00,0x00,0x4C,0x4E,0x4B,0x43,0x00,0x12,  /* 000006C8    "..LNKC.." */
-    0x0D,0x04,0x0C,0xFF,0xFF,0x06,0x00,0x01,  /* 000006D0    "........" */
-    0x4C,0x4E,0x4B,0x44,0x00,0x12,0x0E,0x04,  /* 000006D8    "LNKD...." */
-    0x0C,0xFF,0xFF,0x06,0x00,0x0A,0x02,0x4C,  /* 000006E0    ".......L" */
-    0x4E,0x4B,0x41,0x00,0x12,0x0E,0x04,0x0C,  /* 000006E8    "NKA....." */
-    0xFF,0xFF,0x06,0x00,0x0A,0x03,0x4C,0x4E,  /* 000006F0    "......LN" */
-    0x4B,0x42,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 000006F8    "KB......" */
-    0xFF,0x07,0x00,0x00,0x4C,0x4E,0x4B,0x44,  /* 00000700    "....LNKD" */
-    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x07,  /* 00000708    "........" */
-    0x00,0x01,0x4C,0x4E,0x4B,0x41,0x00,0x12,  /* 00000710    "..LNKA.." */
-    0x0E,0x04,0x0C,0xFF,0xFF,0x07,0x00,0x0A,  /* 00000718    "........" */
-    0x02,0x4C,0x4E,0x4B,0x42,0x00,0x12,0x0E,  /* 00000720    ".LNKB..." */
-    0x04,0x0C,0xFF,0xFF,0x07,0x00,0x0A,0x03,  /* 00000728    "........" */
-    0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0D,0x04,  /* 00000730    "LNKC...." */
-    0x0C,0xFF,0xFF,0x08,0x00,0x00,0x4C,0x4E,  /* 00000738    "......LN" */
-    0x4B,0x41,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000740    "KA......" */
-    0xFF,0x08,0x00,0x01,0x4C,0x4E,0x4B,0x42,  /* 00000748    "....LNKB" */
-    0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x08,  /* 00000750    "........" */
-    0x00,0x0A,0x02,0x4C,0x4E,0x4B,0x43,0x00,  /* 00000758    "...LNKC." */
-    0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x08,0x00,  /* 00000760    "........" */
-    0x0A,0x03,0x4C,0x4E,0x4B,0x44,0x00,0x12,  /* 00000768    "..LNKD.." */
-    0x0D,0x04,0x0C,0xFF,0xFF,0x09,0x00,0x00,  /* 00000770    "........" */
-    0x4C,0x4E,0x4B,0x42,0x00,0x12,0x0D,0x04,  /* 00000778    "LNKB...." */
-    0x0C,0xFF,0xFF,0x09,0x00,0x01,0x4C,0x4E,  /* 00000780    "......LN" */
-    0x4B,0x43,0x00,0x12,0x0E,0x04,0x0C,0xFF,  /* 00000788    "KC......" */
-    0xFF,0x09,0x00,0x0A,0x02,0x4C,0x4E,0x4B,  /* 00000790    ".....LNK" */
-    0x44,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,  /* 00000798    "D......." */
-    0x09,0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x41,  /* 000007A0    "....LNKA" */
-    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x0A,  /* 000007A8    "........" */
-    0x00,0x00,0x4C,0x4E,0x4B,0x43,0x00,0x12,  /* 000007B0    "..LNKC.." */
-    0x0D,0x04,0x0C,0xFF,0xFF,0x0A,0x00,0x01,  /* 000007B8    "........" */
-    0x4C,0x4E,0x4B,0x44,0x00,0x12,0x0E,0x04,  /* 000007C0    "LNKD...." */
-    0x0C,0xFF,0xFF,0x0A,0x00,0x0A,0x02,0x4C,  /* 000007C8    ".......L" */
-    0x4E,0x4B,0x41,0x00,0x12,0x0E,0x04,0x0C,  /* 000007D0    "NKA....." */
-    0xFF,0xFF,0x0A,0x00,0x0A,0x03,0x4C,0x4E,  /* 000007D8    "......LN" */
-    0x4B,0x42,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 000007E0    "KB......" */
-    0xFF,0x0B,0x00,0x00,0x4C,0x4E,0x4B,0x44,  /* 000007E8    "....LNKD" */
-    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x0B,  /* 000007F0    "........" */
-    0x00,0x01,0x4C,0x4E,0x4B,0x41,0x00,0x12,  /* 000007F8    "..LNKA.." */
-    0x0E,0x04,0x0C,0xFF,0xFF,0x0B,0x00,0x0A,  /* 00000800    "........" */
-    0x02,0x4C,0x4E,0x4B,0x42,0x00,0x12,0x0E,  /* 00000808    ".LNKB..." */
-    0x04,0x0C,0xFF,0xFF,0x0B,0x00,0x0A,0x03,  /* 00000810    "........" */
-    0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0D,0x04,  /* 00000818    "LNKC...." */
-    0x0C,0xFF,0xFF,0x0C,0x00,0x00,0x4C,0x4E,  /* 00000820    "......LN" */
-    0x4B,0x41,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000828    "KA......" */
-    0xFF,0x0C,0x00,0x01,0x4C,0x4E,0x4B,0x42,  /* 00000830    "....LNKB" */
-    0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x0C,  /* 00000838    "........" */
-    0x00,0x0A,0x02,0x4C,0x4E,0x4B,0x43,0x00,  /* 00000840    "...LNKC." */
-    0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x0C,0x00,  /* 00000848    "........" */
-    0x0A,0x03,0x4C,0x4E,0x4B,0x44,0x00,0x12,  /* 00000850    "..LNKD.." */
-    0x0D,0x04,0x0C,0xFF,0xFF,0x0D,0x00,0x00,  /* 00000858    "........" */
-    0x4C,0x4E,0x4B,0x42,0x00,0x12,0x0D,0x04,  /* 00000860    "LNKB...." */
-    0x0C,0xFF,0xFF,0x0D,0x00,0x01,0x4C,0x4E,  /* 00000868    "......LN" */
-    0x4B,0x43,0x00,0x12,0x0E,0x04,0x0C,0xFF,  /* 00000870    "KC......" */
-    0xFF,0x0D,0x00,0x0A,0x02,0x4C,0x4E,0x4B,  /* 00000878    ".....LNK" */
-    0x44,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,  /* 00000880    "D......." */
-    0x0D,0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x41,  /* 00000888    "....LNKA" */
-    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x0E,  /* 00000890    "........" */
-    0x00,0x00,0x4C,0x4E,0x4B,0x43,0x00,0x12,  /* 00000898    "..LNKC.." */
-    0x0D,0x04,0x0C,0xFF,0xFF,0x0E,0x00,0x01,  /* 000008A0    "........" */
-    0x4C,0x4E,0x4B,0x44,0x00,0x12,0x0E,0x04,  /* 000008A8    "LNKD...." */
-    0x0C,0xFF,0xFF,0x0E,0x00,0x0A,0x02,0x4C,  /* 000008B0    ".......L" */
-    0x4E,0x4B,0x41,0x00,0x12,0x0E,0x04,0x0C,  /* 000008B8    "NKA....." */
-    0xFF,0xFF,0x0E,0x00,0x0A,0x03,0x4C,0x4E,  /* 000008C0    "......LN" */
-    0x4B,0x42,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 000008C8    "KB......" */
-    0xFF,0x0F,0x00,0x00,0x4C,0x4E,0x4B,0x44,  /* 000008D0    "....LNKD" */
-    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x0F,  /* 000008D8    "........" */
-    0x00,0x01,0x4C,0x4E,0x4B,0x41,0x00,0x12,  /* 000008E0    "..LNKA.." */
-    0x0E,0x04,0x0C,0xFF,0xFF,0x0F,0x00,0x0A,  /* 000008E8    "........" */
-    0x02,0x4C,0x4E,0x4B,0x42,0x00,0x12,0x0E,  /* 000008F0    ".LNKB..." */
-    0x04,0x0C,0xFF,0xFF,0x0F,0x00,0x0A,0x03,  /* 000008F8    "........" */
-    0x4C,0x4E,0x4B,0x43,0x00,0x08,0x50,0x52,  /* 00000900    "LNKC..PR" */
-    0x54,0x41,0x12,0x41,0x2F,0x3C,0x12,0x0B,  /* 00000908    "TA.A/<.." */
-    0x04,0x0C,0xFF,0xFF,0x01,0x00,0x00,0x00,  /* 00000910    "........" */
-    0x0A,0x14,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000918    "........" */
-    0x01,0x00,0x01,0x00,0x0A,0x15,0x12,0x0C,  /* 00000920    "........" */
-    0x04,0x0C,0xFF,0xFF,0x01,0x00,0x0A,0x02,  /* 00000928    "........" */
-    0x00,0x0A,0x16,0x12,0x0C,0x04,0x0C,0xFF,  /* 00000930    "........" */
-    0xFF,0x01,0x00,0x0A,0x03,0x00,0x0A,0x17,  /* 00000938    "........" */
-    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x02,0x00,  /* 00000940    "........" */
-    0x00,0x00,0x0A,0x18,0x12,0x0B,0x04,0x0C,  /* 00000948    "........" */
-    0xFF,0xFF,0x02,0x00,0x01,0x00,0x0A,0x19,  /* 00000950    "........" */
-    0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x02,0x00,  /* 00000958    "........" */
-    0x0A,0x02,0x00,0x0A,0x1A,0x12,0x0C,0x04,  /* 00000960    "........" */
-    0x0C,0xFF,0xFF,0x02,0x00,0x0A,0x03,0x00,  /* 00000968    "........" */
-    0x0A,0x1B,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000970    "........" */
-    0x03,0x00,0x00,0x00,0x0A,0x1C,0x12,0x0B,  /* 00000978    "........" */
-    0x04,0x0C,0xFF,0xFF,0x03,0x00,0x01,0x00,  /* 00000980    "........" */
-    0x0A,0x1D,0x12,0x0C,0x04,0x0C,0xFF,0xFF,  /* 00000988    "........" */
-    0x03,0x00,0x0A,0x02,0x00,0x0A,0x1E,0x12,  /* 00000990    "........" */
-    0x0C,0x04,0x0C,0xFF,0xFF,0x03,0x00,0x0A,  /* 00000998    "........" */
-    0x03,0x00,0x0A,0x1F,0x12,0x0B,0x04,0x0C,  /* 000009A0    "........" */
-    0xFF,0xFF,0x04,0x00,0x00,0x00,0x0A,0x20,  /* 000009A8    "....... " */
-    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x04,0x00,  /* 000009B0    "........" */
-    0x01,0x00,0x0A,0x21,0x12,0x0C,0x04,0x0C,  /* 000009B8    "...!...." */
-    0xFF,0xFF,0x04,0x00,0x0A,0x02,0x00,0x0A,  /* 000009C0    "........" */
-    0x22,0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x04,  /* 000009C8    ""......." */
-    0x00,0x0A,0x03,0x00,0x0A,0x23,0x12,0x0B,  /* 000009D0    ".....#.." */
-    0x04,0x0C,0xFF,0xFF,0x05,0x00,0x00,0x00,  /* 000009D8    "........" */
-    0x0A,0x24,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 000009E0    ".$......" */
-    0x05,0x00,0x01,0x00,0x0A,0x25,0x12,0x0C,  /* 000009E8    ".....%.." */
-    0x04,0x0C,0xFF,0xFF,0x05,0x00,0x0A,0x02,  /* 000009F0    "........" */
-    0x00,0x0A,0x26,0x12,0x0C,0x04,0x0C,0xFF,  /* 000009F8    "..&....." */
-    0xFF,0x05,0x00,0x0A,0x03,0x00,0x0A,0x27,  /* 00000A00    ".......'" */
-    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x06,0x00,  /* 00000A08    "........" */
-    0x00,0x00,0x0A,0x28,0x12,0x0B,0x04,0x0C,  /* 00000A10    "...(...." */
-    0xFF,0xFF,0x06,0x00,0x01,0x00,0x0A,0x29,  /* 00000A18    ".......)" */
-    0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x06,0x00,  /* 00000A20    "........" */
-    0x0A,0x02,0x00,0x0A,0x2A,0x12,0x0C,0x04,  /* 00000A28    "....*..." */
-    0x0C,0xFF,0xFF,0x06,0x00,0x0A,0x03,0x00,  /* 00000A30    "........" */
-    0x0A,0x2B,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000A38    ".+......" */
-    0x07,0x00,0x00,0x00,0x0A,0x2C,0x12,0x0B,  /* 00000A40    ".....,.." */
-    0x04,0x0C,0xFF,0xFF,0x07,0x00,0x01,0x00,  /* 00000A48    "........" */
-    0x0A,0x2D,0x12,0x0C,0x04,0x0C,0xFF,0xFF,  /* 00000A50    ".-......" */
-    0x07,0x00,0x0A,0x02,0x00,0x0A,0x2E,0x12,  /* 00000A58    "........" */
-    0x0C,0x04,0x0C,0xFF,0xFF,0x07,0x00,0x0A,  /* 00000A60    "........" */
-    0x03,0x00,0x0A,0x2F,0x12,0x0B,0x04,0x0C,  /* 00000A68    ".../...." */
-    0xFF,0xFF,0x08,0x00,0x00,0x00,0x0A,0x11,  /* 00000A70    "........" */
-    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x08,0x00,  /* 00000A78    "........" */
-    0x01,0x00,0x0A,0x12,0x12,0x0C,0x04,0x0C,  /* 00000A80    "........" */
-    0xFF,0xFF,0x08,0x00,0x0A,0x02,0x00,0x0A,  /* 00000A88    "........" */
-    0x13,0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x08,  /* 00000A90    "........" */
-    0x00,0x0A,0x03,0x00,0x0A,0x14,0x12,0x0B,  /* 00000A98    "........" */
-    0x04,0x0C,0xFF,0xFF,0x09,0x00,0x00,0x00,  /* 00000AA0    "........" */
-    0x0A,0x15,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000AA8    "........" */
-    0x09,0x00,0x01,0x00,0x0A,0x16,0x12,0x0C,  /* 00000AB0    "........" */
-    0x04,0x0C,0xFF,0xFF,0x09,0x00,0x0A,0x02,  /* 00000AB8    "........" */
-    0x00,0x0A,0x17,0x12,0x0C,0x04,0x0C,0xFF,  /* 00000AC0    "........" */
-    0xFF,0x09,0x00,0x0A,0x03,0x00,0x0A,0x18,  /* 00000AC8    "........" */
-    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x0A,0x00,  /* 00000AD0    "........" */
-    0x00,0x00,0x0A,0x19,0x12,0x0B,0x04,0x0C,  /* 00000AD8    "........" */
-    0xFF,0xFF,0x0A,0x00,0x01,0x00,0x0A,0x1A,  /* 00000AE0    "........" */
-    0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x0A,0x00,  /* 00000AE8    "........" */
-    0x0A,0x02,0x00,0x0A,0x1B,0x12,0x0C,0x04,  /* 00000AF0    "........" */
-    0x0C,0xFF,0xFF,0x0A,0x00,0x0A,0x03,0x00,  /* 00000AF8    "........" */
-    0x0A,0x1C,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000B00    "........" */
-    0x0B,0x00,0x00,0x00,0x0A,0x1D,0x12,0x0B,  /* 00000B08    "........" */
-    0x04,0x0C,0xFF,0xFF,0x0B,0x00,0x01,0x00,  /* 00000B10    "........" */
-    0x0A,0x1E,0x12,0x0C,0x04,0x0C,0xFF,0xFF,  /* 00000B18    "........" */
-    0x0B,0x00,0x0A,0x02,0x00,0x0A,0x1F,0x12,  /* 00000B20    "........" */
-    0x0C,0x04,0x0C,0xFF,0xFF,0x0B,0x00,0x0A,  /* 00000B28    "........" */
-    0x03,0x00,0x0A,0x20,0x12,0x0B,0x04,0x0C,  /* 00000B30    "... ...." */
-    0xFF,0xFF,0x0C,0x00,0x00,0x00,0x0A,0x21,  /* 00000B38    ".......!" */
-    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x0C,0x00,  /* 00000B40    "........" */
-    0x01,0x00,0x0A,0x22,0x12,0x0C,0x04,0x0C,  /* 00000B48    "..."...." */
-    0xFF,0xFF,0x0C,0x00,0x0A,0x02,0x00,0x0A,  /* 00000B50    "........" */
-    0x23,0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x0C,  /* 00000B58    "#......." */
-    0x00,0x0A,0x03,0x00,0x0A,0x24,0x12,0x0B,  /* 00000B60    ".....$.." */
-    0x04,0x0C,0xFF,0xFF,0x0D,0x00,0x00,0x00,  /* 00000B68    "........" */
-    0x0A,0x25,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000B70    ".%......" */
-    0x0D,0x00,0x01,0x00,0x0A,0x26,0x12,0x0C,  /* 00000B78    ".....&.." */
-    0x04,0x0C,0xFF,0xFF,0x0D,0x00,0x0A,0x02,  /* 00000B80    "........" */
-    0x00,0x0A,0x27,0x12,0x0C,0x04,0x0C,0xFF,  /* 00000B88    "..'....." */
-    0xFF,0x0D,0x00,0x0A,0x03,0x00,0x0A,0x28,  /* 00000B90    ".......(" */
-    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x0E,0x00,  /* 00000B98    "........" */
-    0x00,0x00,0x0A,0x29,0x12,0x0B,0x04,0x0C,  /* 00000BA0    "...)...." */
-    0xFF,0xFF,0x0E,0x00,0x01,0x00,0x0A,0x2A,  /* 00000BA8    ".......*" */
-    0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x0E,0x00,  /* 00000BB0    "........" */
-    0x0A,0x02,0x00,0x0A,0x2B,0x12,0x0C,0x04,  /* 00000BB8    "....+..." */
-    0x0C,0xFF,0xFF,0x0E,0x00,0x0A,0x03,0x00,  /* 00000BC0    "........" */
-    0x0A,0x2C,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000BC8    ".,......" */
-    0x0F,0x00,0x00,0x00,0x0A,0x2D,0x12,0x0B,  /* 00000BD0    ".....-.." */
-    0x04,0x0C,0xFF,0xFF,0x0F,0x00,0x01,0x00,  /* 00000BD8    "........" */
-    0x0A,0x2E,0x12,0x0C,0x04,0x0C,0xFF,0xFF,  /* 00000BE0    "........" */
-    0x0F,0x00,0x0A,0x02,0x00,0x0A,0x2F,0x12,  /* 00000BE8    "....../." */
-    0x0C,0x04,0x0C,0xFF,0xFF,0x0F,0x00,0x0A,  /* 00000BF0    "........" */
-    0x03,0x00,0x0A,0x10,0x5B,0x82,0x46,0x37,  /* 00000BF8    "....[.F7" */
-    0x49,0x53,0x41,0x5F,0x08,0x5F,0x41,0x44,  /* 00000C00    "ISA_._AD" */
-    0x52,0x0C,0x00,0x00,0x01,0x00,0x5B,0x80,  /* 00000C08    "R.....[." */
-    0x50,0x49,0x52,0x51,0x02,0x0A,0x60,0x0A,  /* 00000C10    "PIRQ..`." */
-    0x04,0x10,0x2E,0x5C,0x00,0x5B,0x81,0x29,  /* 00000C18    "...\.[.)" */
-    0x5C,0x2F,0x04,0x5F,0x53,0x42,0x5F,0x50,  /* 00000C20    "\/._SB_P" */
-    0x43,0x49,0x30,0x49,0x53,0x41,0x5F,0x50,  /* 00000C28    "CI0ISA_P" */
-    0x49,0x52,0x51,0x01,0x50,0x49,0x52,0x41,  /* 00000C30    "IRQ.PIRA" */
-    0x08,0x50,0x49,0x52,0x42,0x08,0x50,0x49,  /* 00000C38    ".PIRB.PI" */
-    0x52,0x43,0x08,0x50,0x49,0x52,0x44,0x08,  /* 00000C40    "RC.PIRD." */
-    0x5B,0x82,0x46,0x0B,0x53,0x59,0x53,0x52,  /* 00000C48    "[.F.SYSR" */
-    0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,0xD0,  /* 00000C50    "._HID.A." */
-    0x0C,0x02,0x08,0x5F,0x55,0x49,0x44,0x01,  /* 00000C58    "..._UID." */
-    0x08,0x43,0x52,0x53,0x5F,0x11,0x4E,0x08,  /* 00000C60    ".CRS_.N." */
-    0x0A,0x8A,0x47,0x01,0x10,0x00,0x10,0x00,  /* 00000C68    "..G....." */
-    0x00,0x10,0x47,0x01,0x22,0x00,0x22,0x00,  /* 00000C70    "..G."."." */
-    0x00,0x0C,0x47,0x01,0x30,0x00,0x30,0x00,  /* 00000C78    "..G.0.0." */
-    0x00,0x10,0x47,0x01,0x44,0x00,0x44,0x00,  /* 00000C80    "..G.D.D." */
-    0x00,0x1C,0x47,0x01,0x62,0x00,0x62,0x00,  /* 00000C88    "..G.b.b." */
-    0x00,0x02,0x47,0x01,0x65,0x00,0x65,0x00,  /* 00000C90    "..G.e.e." */
-    0x00,0x0B,0x47,0x01,0x72,0x00,0x72,0x00,  /* 00000C98    "..G.r.r." */
-    0x00,0x0E,0x47,0x01,0x80,0x00,0x80,0x00,  /* 00000CA0    "..G....." */
-    0x00,0x01,0x47,0x01,0x84,0x00,0x84,0x00,  /* 00000CA8    "..G....." */
-    0x00,0x03,0x47,0x01,0x88,0x00,0x88,0x00,  /* 00000CB0    "..G....." */
-    0x00,0x01,0x47,0x01,0x8C,0x00,0x8C,0x00,  /* 00000CB8    "..G....." */
-    0x00,0x03,0x47,0x01,0x90,0x00,0x90,0x00,  /* 00000CC0    "..G....." */
-    0x00,0x10,0x47,0x01,0xA2,0x00,0xA2,0x00,  /* 00000CC8    "..G....." */
-    0x00,0x1C,0x47,0x01,0xE0,0x00,0xE0,0x00,  /* 00000CD0    "..G....." */
-    0x00,0x10,0x47,0x01,0xA0,0x08,0xA0,0x08,  /* 00000CD8    "..G....." */
-    0x00,0x04,0x47,0x01,0xC0,0x0C,0xC0,0x0C,  /* 00000CE0    "..G....." */
-    0x00,0x10,0x47,0x01,0xD0,0x04,0xD0,0x04,  /* 00000CE8    "..G....." */
-    0x00,0x02,0x79,0x00,0x14,0x0B,0x5F,0x43,  /* 00000CF0    "..y..._C" */
-    0x52,0x53,0x00,0xA4,0x43,0x52,0x53,0x5F,  /* 00000CF8    "RS..CRS_" */
-    0x5B,0x82,0x2B,0x50,0x49,0x43,0x5F,0x08,  /* 00000D00    "[.+PIC_." */
-    0x5F,0x48,0x49,0x44,0x0B,0x41,0xD0,0x08,  /* 00000D08    "_HID.A.." */
-    0x5F,0x43,0x52,0x53,0x11,0x18,0x0A,0x15,  /* 00000D10    "_CRS...." */
-    0x47,0x01,0x20,0x00,0x20,0x00,0x01,0x02,  /* 00000D18    "G. . ..." */
-    0x47,0x01,0xA0,0x00,0xA0,0x00,0x01,0x02,  /* 00000D20    "G......." */
-    0x22,0x04,0x00,0x79,0x00,0x5B,0x82,0x47,  /* 00000D28    ""..y.[.G" */
-    0x05,0x44,0x4D,0x41,0x30,0x08,0x5F,0x48,  /* 00000D30    ".DMA0._H" */
-    0x49,0x44,0x0C,0x41,0xD0,0x02,0x00,0x08,  /* 00000D38    "ID.A...." */
-    0x5F,0x43,0x52,0x53,0x11,0x41,0x04,0x0A,  /* 00000D40    "_CRS.A.." */
-    0x3D,0x2A,0x10,0x04,0x47,0x01,0x00,0x00,  /* 00000D48    "=*..G..." */
-    0x00,0x00,0x00,0x10,0x47,0x01,0x81,0x00,  /* 00000D50    "....G..." */
-    0x81,0x00,0x00,0x03,0x47,0x01,0x87,0x00,  /* 00000D58    "....G..." */
-    0x87,0x00,0x00,0x01,0x47,0x01,0x89,0x00,  /* 00000D60    "....G..." */
-    0x89,0x00,0x00,0x03,0x47,0x01,0x8F,0x00,  /* 00000D68    "....G..." */
-    0x8F,0x00,0x00,0x01,0x47,0x01,0xC0,0x00,  /* 00000D70    "....G..." */
-    0xC0,0x00,0x00,0x20,0x47,0x01,0x80,0x04,  /* 00000D78    "... G..." */
-    0x80,0x04,0x00,0x10,0x79,0x00,0x5B,0x82,  /* 00000D80    "....y.[." */
-    0x25,0x54,0x4D,0x52,0x5F,0x08,0x5F,0x48,  /* 00000D88    "%TMR_._H" */
-    0x49,0x44,0x0C,0x41,0xD0,0x01,0x00,0x08,  /* 00000D90    "ID.A...." */
-    0x5F,0x43,0x52,0x53,0x11,0x10,0x0A,0x0D,  /* 00000D98    "_CRS...." */
-    0x47,0x01,0x40,0x00,0x40,0x00,0x00,0x04,  /* 00000DA0    "G.@.@..." */
-    0x22,0x01,0x00,0x79,0x00,0x5B,0x82,0x25,  /* 00000DA8    ""..y.[.%" */
-    0x52,0x54,0x43,0x5F,0x08,0x5F,0x48,0x49,  /* 00000DB0    "RTC_._HI" */
-    0x44,0x0C,0x41,0xD0,0x0B,0x00,0x08,0x5F,  /* 00000DB8    "D.A...._" */
-    0x43,0x52,0x53,0x11,0x10,0x0A,0x0D,0x47,  /* 00000DC0    "CRS....G" */
-    0x01,0x70,0x00,0x70,0x00,0x00,0x02,0x22,  /* 00000DC8    ".p.p..."" */
-    0x00,0x01,0x79,0x00,0x5B,0x82,0x22,0x53,  /* 00000DD0    "..y.[."S" */
-    0x50,0x4B,0x52,0x08,0x5F,0x48,0x49,0x44,  /* 00000DD8    "PKR._HID" */
-    0x0C,0x41,0xD0,0x08,0x00,0x08,0x5F,0x43,  /* 00000DE0    ".A...._C" */
-    0x52,0x53,0x11,0x0D,0x0A,0x0A,0x47,0x01,  /* 00000DE8    "RS....G." */
-    0x61,0x00,0x61,0x00,0x00,0x01,0x79,0x00,  /* 00000DF0    "a.a...y." */
-    0x5B,0x82,0x31,0x50,0x53,0x32,0x4D,0x08,  /* 00000DF8    "[.1PS2M." */
-    0x5F,0x48,0x49,0x44,0x0C,0x41,0xD0,0x0F,  /* 00000E00    "_HID.A.." */
-    0x13,0x08,0x5F,0x43,0x49,0x44,0x0C,0x41,  /* 00000E08    ".._CID.A" */
-    0xD0,0x0F,0x13,0x14,0x09,0x5F,0x53,0x54,  /* 00000E10    "....._ST" */
-    0x41,0x00,0xA4,0x0A,0x0F,0x08,0x5F,0x43,  /* 00000E18    "A....._C" */
-    0x52,0x53,0x11,0x08,0x0A,0x05,0x22,0x00,  /* 00000E20    "RS...."." */
-    0x10,0x79,0x00,0x5B,0x82,0x42,0x04,0x50,  /* 00000E28    ".y.[.B.P" */
-    0x53,0x32,0x4B,0x08,0x5F,0x48,0x49,0x44,  /* 00000E30    "S2K._HID" */
-    0x0C,0x41,0xD0,0x03,0x03,0x08,0x5F,0x43,  /* 00000E38    ".A...._C" */
-    0x49,0x44,0x0C,0x41,0xD0,0x03,0x0B,0x14,  /* 00000E40    "ID.A...." */
-    0x09,0x5F,0x53,0x54,0x41,0x00,0xA4,0x0A,  /* 00000E48    "._STA..." */
-    0x0F,0x08,0x5F,0x43,0x52,0x53,0x11,0x18,  /* 00000E50    ".._CRS.." */
-    0x0A,0x15,0x47,0x01,0x60,0x00,0x60,0x00,  /* 00000E58    "..G.`.`." */
-    0x00,0x01,0x47,0x01,0x64,0x00,0x64,0x00,  /* 00000E60    "..G.d.d." */
-    0x00,0x01,0x22,0x02,0x00,0x79,0x00,0x5B,  /* 00000E68    ".."..y.[" */
-    0x82,0x3A,0x46,0x44,0x43,0x30,0x08,0x5F,  /* 00000E70    ".:FDC0._" */
-    0x48,0x49,0x44,0x0C,0x41,0xD0,0x07,0x00,  /* 00000E78    "HID.A..." */
-    0x14,0x09,0x5F,0x53,0x54,0x41,0x00,0xA4,  /* 00000E80    ".._STA.." */
-    0x0A,0x0F,0x08,0x5F,0x43,0x52,0x53,0x11,  /* 00000E88    "..._CRS." */
-    0x1B,0x0A,0x18,0x47,0x01,0xF0,0x03,0xF0,  /* 00000E90    "...G...." */
-    0x03,0x01,0x06,0x47,0x01,0xF7,0x03,0xF7,  /* 00000E98    "...G...." */
-    0x03,0x01,0x01,0x22,0x40,0x00,0x2A,0x04,  /* 00000EA0    "..."@.*." */
-    0x00,0x79,0x00,0x5B,0x82,0x46,0x04,0x55,  /* 00000EA8    ".y.[.F.U" */
-    0x41,0x52,0x31,0x08,0x5F,0x48,0x49,0x44,  /* 00000EB0    "AR1._HID" */
-    0x0C,0x41,0xD0,0x05,0x01,0x08,0x5F,0x55,  /* 00000EB8    ".A...._U" */
-    0x49,0x44,0x01,0x14,0x19,0x5F,0x53,0x54,  /* 00000EC0    "ID..._ST" */
-    0x41,0x00,0xA0,0x0D,0x93,0x5E,0x5E,0x5E,  /* 00000EC8    "A....^^^" */
-    0x5E,0x55,0x41,0x52,0x31,0x00,0xA4,0x00,  /* 00000ED0    "^UAR1..." */
-    0xA1,0x04,0xA4,0x0A,0x0F,0x08,0x5F,0x43,  /* 00000ED8    "......_C" */
-    0x52,0x53,0x11,0x10,0x0A,0x0D,0x47,0x01,  /* 00000EE0    "RS....G." */
-    0xF8,0x03,0xF8,0x03,0x08,0x08,0x22,0x10,  /* 00000EE8    "......"." */
-    0x00,0x79,0x00,0x5B,0x82,0x47,0x04,0x55,  /* 00000EF0    ".y.[.G.U" */
-    0x41,0x52,0x32,0x08,0x5F,0x48,0x49,0x44,  /* 00000EF8    "AR2._HID" */
-    0x0C,0x41,0xD0,0x05,0x01,0x08,0x5F,0x55,  /* 00000F00    ".A...._U" */
-    0x49,0x44,0x0A,0x02,0x14,0x19,0x5F,0x53,  /* 00000F08    "ID...._S" */
-    0x54,0x41,0x00,0xA0,0x0D,0x93,0x5E,0x5E,  /* 00000F10    "TA....^^" */
-    0x5E,0x5E,0x55,0x41,0x52,0x32,0x00,0xA4,  /* 00000F18    "^^UAR2.." */
-    0x00,0xA1,0x04,0xA4,0x0A,0x0F,0x08,0x5F,  /* 00000F20    "......._" */
-    0x43,0x52,0x53,0x11,0x10,0x0A,0x0D,0x47,  /* 00000F28    "CRS....G" */
-    0x01,0xF8,0x02,0xF8,0x02,0x08,0x08,0x22,  /* 00000F30    "......."" */
-    0x08,0x00,0x79,0x00,0x5B,0x82,0x36,0x4C,  /* 00000F38    "..y.[.6L" */
-    0x54,0x50,0x31,0x08,0x5F,0x48,0x49,0x44,  /* 00000F40    "TP1._HID" */
-    0x0C,0x41,0xD0,0x04,0x00,0x08,0x5F,0x55,  /* 00000F48    ".A...._U" */
-    0x49,0x44,0x0A,0x02,0x14,0x09,0x5F,0x53,  /* 00000F50    "ID...._S" */
-    0x54,0x41,0x00,0xA4,0x0A,0x0F,0x08,0x5F,  /* 00000F58    "TA....._" */
-    0x43,0x52,0x53,0x11,0x10,0x0A,0x0D,0x47,  /* 00000F60    "CRS....G" */
-    0x01,0x78,0x03,0x78,0x03,0x08,0x08,0x22,  /* 00000F68    ".x.x..."" */
-    0x80,0x00,0x79,0x00,0x5B,0x82,0x4D,0x07,  /* 00000F70    "..y.[.M." */
-    0x53,0x31,0x46,0x30,0x08,0x5F,0x41,0x44,  /* 00000F78    "S1F0._AD" */
-    0x52,0x0C,0x00,0x00,0x06,0x00,0x08,0x5F,  /* 00000F80    "R......_" */
-    0x53,0x55,0x4E,0x01,0x14,0x13,0x5F,0x50,  /* 00000F88    "SUN..._P" */
-    0x53,0x30,0x00,0x70,0x0A,0x80,0x5C,0x2E,  /* 00000F90    "S0.p..\." */
-    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00000F98    "_GPEDPT2" */
-    0x14,0x13,0x5F,0x50,0x53,0x33,0x00,0x70,  /* 00000FA0    ".._PS3.p" */
-    0x0A,0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00000FA8    "..\._GPE" */
-    0x44,0x50,0x54,0x32,0x14,0x1F,0x5F,0x45,  /* 00000FB0    "DPT2.._E" */
-    0x4A,0x30,0x01,0x70,0x0A,0x88,0x5C,0x2E,  /* 00000FB8    "J0.p..\." */
-    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00000FC0    "_GPEDPT2" */
-    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00000FC8    "p.\._GPE" */
-    0x50,0x48,0x50,0x31,0x14,0x1E,0x5F,0x53,  /* 00000FD0    "PHP1.._S" */
-    0x54,0x41,0x00,0x70,0x0A,0x89,0x5C,0x2E,  /* 00000FD8    "TA.p..\." */
-    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00000FE0    "_GPEDPT2" */
-    0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x50,  /* 00000FE8    ".\._GPEP" */
-    0x48,0x50,0x31,0x5B,0x82,0x4E,0x07,0x53,  /* 00000FF0    "HP1[.N.S" */
-    0x32,0x46,0x30,0x08,0x5F,0x41,0x44,0x52,  /* 00000FF8    "2F0._ADR" */
-    0x0C,0x00,0x00,0x07,0x00,0x08,0x5F,0x53,  /* 00001000    "......_S" */
-    0x55,0x4E,0x0A,0x02,0x14,0x13,0x5F,0x50,  /* 00001008    "UN...._P" */
-    0x53,0x30,0x00,0x70,0x0A,0x90,0x5C,0x2E,  /* 00001010    "S0.p..\." */
-    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00001018    "_GPEDPT2" */
-    0x14,0x13,0x5F,0x50,0x53,0x33,0x00,0x70,  /* 00001020    ".._PS3.p" */
-    0x0A,0x93,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001028    "..\._GPE" */
-    0x44,0x50,0x54,0x32,0x14,0x1F,0x5F,0x45,  /* 00001030    "DPT2.._E" */
-    0x4A,0x30,0x01,0x70,0x0A,0x98,0x5C,0x2E,  /* 00001038    "J0.p..\." */
-    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00001040    "_GPEDPT2" */
-    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001048    "p.\._GPE" */
-    0x50,0x48,0x50,0x32,0x14,0x1E,0x5F,0x53,  /* 00001050    "PHP2.._S" */
-    0x54,0x41,0x00,0x70,0x0A,0x99,0x5C,0x2E,  /* 00001058    "TA.p..\." */
-    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00001060    "_GPEDPT2" */
-    0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x50,  /* 00001068    ".\._GPEP" */
-    0x48,0x50,0x32,0x10,0x4E,0x0B,0x5F,0x47,  /* 00001070    "HP2.N._G" */
-    0x50,0x45,0x5B,0x80,0x50,0x48,0x50,0x5F,  /* 00001078    "PE[.PHP_" */
-    0x01,0x0B,0xC0,0x10,0x0A,0x03,0x5B,0x81,  /* 00001080    "......[." */
-    0x15,0x50,0x48,0x50,0x5F,0x01,0x50,0x53,  /* 00001088    ".PHP_.PS" */
-    0x54,0x41,0x08,0x50,0x48,0x50,0x31,0x08,  /* 00001090    "TA.PHP1." */
-    0x50,0x48,0x50,0x32,0x08,0x5B,0x80,0x44,  /* 00001098    "PHP2.[.D" */
-    0x47,0x31,0x5F,0x01,0x0B,0x44,0xB0,0x0A,  /* 000010A0    "G1_..D.." */
-    0x04,0x5B,0x81,0x10,0x44,0x47,0x31,0x5F,  /* 000010A8    ".[..DG1_" */
-    0x01,0x44,0x50,0x54,0x31,0x08,0x44,0x50,  /* 000010B0    ".DPT1.DP" */
-    0x54,0x32,0x08,0x14,0x46,0x07,0x5F,0x4C,  /* 000010B8    "T2..F._L" */
-    0x30,0x33,0x00,0x08,0x53,0x4C,0x54,0x5F,  /* 000010C0    "03..SLT_" */
-    0x00,0x08,0x45,0x56,0x54,0x5F,0x00,0x70,  /* 000010C8    "..EVT_.p" */
-    0x50,0x53,0x54,0x41,0x61,0x7A,0x61,0x0A,  /* 000010D0    "PSTAaza." */
-    0x04,0x53,0x4C,0x54,0x5F,0x7B,0x61,0x0A,  /* 000010D8    ".SLT_{a." */
-    0x0F,0x45,0x56,0x54,0x5F,0x70,0x53,0x4C,  /* 000010E0    ".EVT_pSL" */
-    0x54,0x5F,0x44,0x50,0x54,0x31,0x70,0x45,  /* 000010E8    "T_DPT1pE" */
-    0x56,0x54,0x5F,0x44,0x50,0x54,0x32,0xA0,  /* 000010F0    "VT_DPT2." */
-    0x1B,0x93,0x53,0x4C,0x54,0x5F,0x01,0x86,  /* 000010F8    "..SLT_.." */
-    0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,0x50,  /* 00001100    "\/._SB_P" */
-    0x43,0x49,0x30,0x53,0x31,0x46,0x30,0x45,  /* 00001108    "CI0S1F0E" */
-    0x56,0x54,0x5F,0xA1,0x1E,0xA0,0x1C,0x93,  /* 00001110    "VT_....." */
-    0x53,0x4C,0x54,0x5F,0x0A,0x02,0x86,0x5C,  /* 00001118    "SLT_...\" */
-    0x2F,0x03,0x5F,0x53,0x42,0x5F,0x50,0x43,  /* 00001120    "/._SB_PC" */
-    0x49,0x30,0x53,0x32,0x46,0x30,0x45,0x56,  /* 00001128    "I0S2F0EV" */
+    0x00,0x00,0x00,0x00,0x0A,0x00,0x00,0x00,  /* 000001D8    "........" */
+    0x00,0x00,0x79,0x00,0x5B,0x82,0x8E,0xAE,  /* 000001E0    "..y.[..." */
+    0x02,0x50,0x43,0x49,0x30,0x08,0x5F,0x48,  /* 000001E8    ".PCI0._H" */
+    0x49,0x44,0x0C,0x41,0xD0,0x0A,0x03,0x08,  /* 000001F0    "ID.A...." */
+    0x5F,0x55,0x49,0x44,0x00,0x08,0x5F,0x41,  /* 000001F8    "_UID.._A" */
+    0x44,0x52,0x00,0x08,0x5F,0x42,0x42,0x4E,  /* 00000200    "DR.._BBN" */
+    0x00,0x5B,0x82,0x2A,0x48,0x50,0x30,0x5F,  /* 00000208    ".[.*HP0_" */
+    0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,0xD0,  /* 00000210    "._HID.A." */
+    0x0C,0x02,0x08,0x5F,0x43,0x52,0x53,0x11,  /* 00000218    "..._CRS." */
+    0x15,0x0A,0x12,0x47,0x01,0xC0,0x10,0xC0,  /* 00000220    "...G...." */
+    0x10,0x00,0x03,0x47,0x01,0x44,0xB0,0x44,  /* 00000228    "...G.D.D" */
+    0xB0,0x00,0x04,0x79,0x00,0x14,0x4E,0x0C,  /* 00000230    "...y..N." */
+    0x5F,0x43,0x52,0x53,0x00,0x08,0x50,0x52,  /* 00000238    "_CRS..PR" */
+    0x54,0x30,0x11,0x42,0x07,0x0A,0x6E,0x88,  /* 00000240    "T0.B..n." */
+    0x0D,0x00,0x02,0x0E,0x00,0x00,0x00,0x00,  /* 00000248    "........" */
+    0x00,0xFF,0x00,0x00,0x00,0x00,0x01,0x47,  /* 00000250    ".......G" */
+    0x01,0xF8,0x0C,0xF8,0x0C,0x01,0x08,0x88,  /* 00000258    "........" */
+    0x0D,0x00,0x01,0x0C,0x03,0x00,0x00,0x00,  /* 00000260    "........" */
+    0x00,0xF7,0x0C,0x00,0x00,0xF8,0x0C,0x88,  /* 00000268    "........" */
+    0x0D,0x00,0x01,0x0C,0x03,0x00,0x00,0x00,  /* 00000270    "........" */
+    0x0D,0xFF,0xFF,0x00,0x00,0x00,0xF3,0x87,  /* 00000278    "........" */
+    0x17,0x00,0x00,0x0C,0x03,0x00,0x00,0x00,  /* 00000280    "........" */
+    0x00,0x00,0x00,0x0A,0x00,0xFF,0xFF,0x0B,  /* 00000288    "........" */
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x02,  /* 00000290    "........" */
+    0x00,0x87,0x17,0x00,0x00,0x0C,0x03,0x00,  /* 00000298    "........" */
+    0x00,0x00,0x00,0x00,0x00,0x00,0xF0,0xFF,  /* 000002A0    "........" */
+    0xFF,0xFF,0xF4,0x00,0x00,0x00,0x00,0x00,  /* 000002A8    "........" */
+    0x00,0x00,0x05,0x79,0x00,0x8A,0x50,0x52,  /* 000002B0    "...y..PR" */
+    0x54,0x30,0x0A,0x5C,0x4D,0x4D,0x49,0x4E,  /* 000002B8    "T0.\MMIN" */
+    0x8A,0x50,0x52,0x54,0x30,0x0A,0x60,0x4D,  /* 000002C0    ".PRT0.`M" */
+    0x4D,0x41,0x58,0x8A,0x50,0x52,0x54,0x30,  /* 000002C8    "MAX.PRT0" */
+    0x0A,0x68,0x4D,0x4C,0x45,0x4E,0x70,0x50,  /* 000002D0    ".hMLENpP" */
+    0x4D,0x49,0x4E,0x4D,0x4D,0x49,0x4E,0x70,  /* 000002D8    "MINMMINp" */
+    0x50,0x4C,0x45,0x4E,0x4D,0x4C,0x45,0x4E,  /* 000002E0    "PLENMLEN" */
+    0x72,0x4D,0x4D,0x49,0x4E,0x4D,0x4C,0x45,  /* 000002E8    "rMMINMLE" */
+    0x4E,0x4D,0x4D,0x41,0x58,0x74,0x4D,0x4D,  /* 000002F0    "NMMAXtMM" */
+    0x41,0x58,0x01,0x4D,0x4D,0x41,0x58,0xA4,  /* 000002F8    "AX.MMAX." */
+    0x50,0x52,0x54,0x30,0x08,0x42,0x55,0x46,  /* 00000300    "PRT0.BUF" */
+    0x41,0x11,0x09,0x0A,0x06,0x23,0x20,0x0C,  /* 00000308    "A....# ." */
+    0x18,0x79,0x00,0x08,0x42,0x55,0x46,0x42,  /* 00000310    ".y..BUFB" */
+    0x11,0x09,0x0A,0x06,0x23,0x00,0x00,0x18,  /* 00000318    "....#..." */
+    0x79,0x00,0x8B,0x42,0x55,0x46,0x42,0x01,  /* 00000320    "y..BUFB." */
+    0x49,0x52,0x51,0x56,0x5B,0x82,0x48,0x08,  /* 00000328    "IRQV[.H." */
+    0x4C,0x4E,0x4B,0x41,0x08,0x5F,0x48,0x49,  /* 00000330    "LNKA._HI" */
+    0x44,0x0C,0x41,0xD0,0x0C,0x0F,0x08,0x5F,  /* 00000338    "D.A...._" */
+    0x55,0x49,0x44,0x01,0x14,0x1C,0x5F,0x53,  /* 00000340    "UID..._S" */
+    0x54,0x41,0x00,0x7B,0x50,0x49,0x52,0x41,  /* 00000348    "TA.{PIRA" */
+    0x0A,0x80,0x60,0xA0,0x08,0x93,0x60,0x0A,  /* 00000350    "..`...`." */
+    0x80,0xA4,0x0A,0x09,0xA1,0x04,0xA4,0x0A,  /* 00000358    "........" */
+    0x0B,0x14,0x0B,0x5F,0x50,0x52,0x53,0x00,  /* 00000360    "..._PRS." */
+    0xA4,0x42,0x55,0x46,0x41,0x14,0x11,0x5F,  /* 00000368    ".BUFA.._" */
+    0x44,0x49,0x53,0x00,0x7D,0x50,0x49,0x52,  /* 00000370    "DIS.}PIR" */
+    0x41,0x0A,0x80,0x50,0x49,0x52,0x41,0x14,  /* 00000378    "A..PIRA." */
+    0x1A,0x5F,0x43,0x52,0x53,0x00,0x7B,0x50,  /* 00000380    "._CRS.{P" */
+    0x49,0x52,0x41,0x0A,0x0F,0x60,0x79,0x01,  /* 00000388    "IRA..`y." */
+    0x60,0x49,0x52,0x51,0x56,0xA4,0x42,0x55,  /* 00000390    "`IRQV.BU" */
+    0x46,0x42,0x14,0x1B,0x5F,0x53,0x52,0x53,  /* 00000398    "FB.._SRS" */
+    0x01,0x8B,0x68,0x01,0x49,0x52,0x51,0x31,  /* 000003A0    "..h.IRQ1" */
+    0x82,0x49,0x52,0x51,0x31,0x60,0x76,0x60,  /* 000003A8    ".IRQ1`v`" */
+    0x70,0x60,0x50,0x49,0x52,0x41,0x5B,0x82,  /* 000003B0    "p`PIRA[." */
+    0x49,0x08,0x4C,0x4E,0x4B,0x42,0x08,0x5F,  /* 000003B8    "I.LNKB._" */
+    0x48,0x49,0x44,0x0C,0x41,0xD0,0x0C,0x0F,  /* 000003C0    "HID.A..." */
+    0x08,0x5F,0x55,0x49,0x44,0x0A,0x02,0x14,  /* 000003C8    "._UID..." */
+    0x1C,0x5F,0x53,0x54,0x41,0x00,0x7B,0x50,  /* 000003D0    "._STA.{P" */
+    0x49,0x52,0x42,0x0A,0x80,0x60,0xA0,0x08,  /* 000003D8    "IRB..`.." */
+    0x93,0x60,0x0A,0x80,0xA4,0x0A,0x09,0xA1,  /* 000003E0    ".`......" */
+    0x04,0xA4,0x0A,0x0B,0x14,0x0B,0x5F,0x50,  /* 000003E8    "......_P" */
+    0x52,0x53,0x00,0xA4,0x42,0x55,0x46,0x41,  /* 000003F0    "RS..BUFA" */
+    0x14,0x11,0x5F,0x44,0x49,0x53,0x00,0x7D,  /* 000003F8    ".._DIS.}" */
+    0x50,0x49,0x52,0x42,0x0A,0x80,0x50,0x49,  /* 00000400    "PIRB..PI" */
+    0x52,0x42,0x14,0x1A,0x5F,0x43,0x52,0x53,  /* 00000408    "RB.._CRS" */
+    0x00,0x7B,0x50,0x49,0x52,0x42,0x0A,0x0F,  /* 00000410    ".{PIRB.." */
+    0x60,0x79,0x01,0x60,0x49,0x52,0x51,0x56,  /* 00000418    "`y.`IRQV" */
+    0xA4,0x42,0x55,0x46,0x42,0x14,0x1B,0x5F,  /* 00000420    ".BUFB.._" */
+    0x53,0x52,0x53,0x01,0x8B,0x68,0x01,0x49,  /* 00000428    "SRS..h.I" */
+    0x52,0x51,0x31,0x82,0x49,0x52,0x51,0x31,  /* 00000430    "RQ1.IRQ1" */
+    0x60,0x76,0x60,0x70,0x60,0x50,0x49,0x52,  /* 00000438    "`v`p`PIR" */
+    0x42,0x5B,0x82,0x49,0x08,0x4C,0x4E,0x4B,  /* 00000440    "B[.I.LNK" */
+    0x43,0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,  /* 00000448    "C._HID.A" */
+    0xD0,0x0C,0x0F,0x08,0x5F,0x55,0x49,0x44,  /* 00000450    "...._UID" */
+    0x0A,0x03,0x14,0x1C,0x5F,0x53,0x54,0x41,  /* 00000458    "...._STA" */
+    0x00,0x7B,0x50,0x49,0x52,0x43,0x0A,0x80,  /* 00000460    ".{PIRC.." */
+    0x60,0xA0,0x08,0x93,0x60,0x0A,0x80,0xA4,  /* 00000468    "`...`..." */
+    0x0A,0x09,0xA1,0x04,0xA4,0x0A,0x0B,0x14,  /* 00000470    "........" */
+    0x0B,0x5F,0x50,0x52,0x53,0x00,0xA4,0x42,  /* 00000478    "._PRS..B" */
+    0x55,0x46,0x41,0x14,0x11,0x5F,0x44,0x49,  /* 00000480    "UFA.._DI" */
+    0x53,0x00,0x7D,0x50,0x49,0x52,0x43,0x0A,  /* 00000488    "S.}PIRC." */
+    0x80,0x50,0x49,0x52,0x43,0x14,0x1A,0x5F,  /* 00000490    ".PIRC.._" */
+    0x43,0x52,0x53,0x00,0x7B,0x50,0x49,0x52,  /* 00000498    "CRS.{PIR" */
+    0x43,0x0A,0x0F,0x60,0x79,0x01,0x60,0x49,  /* 000004A0    "C..`y.`I" */
+    0x52,0x51,0x56,0xA4,0x42,0x55,0x46,0x42,  /* 000004A8    "RQV.BUFB" */
+    0x14,0x1B,0x5F,0x53,0x52,0x53,0x01,0x8B,  /* 000004B0    ".._SRS.." */
+    0x68,0x01,0x49,0x52,0x51,0x31,0x82,0x49,  /* 000004B8    "h.IRQ1.I" */
+    0x52,0x51,0x31,0x60,0x76,0x60,0x70,0x60,  /* 000004C0    "RQ1`v`p`" */
+    0x50,0x49,0x52,0x43,0x5B,0x82,0x49,0x08,  /* 000004C8    "PIRC[.I." */
+    0x4C,0x4E,0x4B,0x44,0x08,0x5F,0x48,0x49,  /* 000004D0    "LNKD._HI" */
+    0x44,0x0C,0x41,0xD0,0x0C,0x0F,0x08,0x5F,  /* 000004D8    "D.A...._" */
+    0x55,0x49,0x44,0x0A,0x04,0x14,0x1C,0x5F,  /* 000004E0    "UID...._" */
+    0x53,0x54,0x41,0x00,0x7B,0x50,0x49,0x52,  /* 000004E8    "STA.{PIR" */
+    0x44,0x0A,0x80,0x60,0xA0,0x08,0x93,0x60,  /* 000004F0    "D..`...`" */
+    0x0A,0x80,0xA4,0x0A,0x09,0xA1,0x04,0xA4,  /* 000004F8    "........" */
+    0x0A,0x0B,0x14,0x0B,0x5F,0x50,0x52,0x53,  /* 00000500    "...._PRS" */
+    0x00,0xA4,0x42,0x55,0x46,0x41,0x14,0x11,  /* 00000508    "..BUFA.." */
+    0x5F,0x44,0x49,0x53,0x00,0x7D,0x50,0x49,  /* 00000510    "_DIS.}PI" */
+    0x52,0x44,0x0A,0x80,0x50,0x49,0x52,0x44,  /* 00000518    "RD..PIRD" */
+    0x14,0x1A,0x5F,0x43,0x52,0x53,0x00,0x7B,  /* 00000520    ".._CRS.{" */
+    0x50,0x49,0x52,0x44,0x0A,0x0F,0x60,0x79,  /* 00000528    "PIRD..`y" */
+    0x01,0x60,0x49,0x52,0x51,0x56,0xA4,0x42,  /* 00000530    ".`IRQV.B" */
+    0x55,0x46,0x42,0x14,0x1B,0x5F,0x53,0x52,  /* 00000538    "UFB.._SR" */
+    0x53,0x01,0x8B,0x68,0x01,0x49,0x52,0x51,  /* 00000540    "S..h.IRQ" */
+    0x31,0x82,0x49,0x52,0x51,0x31,0x60,0x76,  /* 00000548    "1.IRQ1`v" */
+    0x60,0x70,0x60,0x50,0x49,0x52,0x44,0x5B,  /* 00000550    "`p`PIRD[" */
+    0x82,0x44,0x05,0x48,0x50,0x45,0x54,0x08,  /* 00000558    ".D.HPET." */
+    0x5F,0x48,0x49,0x44,0x0C,0x41,0xD0,0x01,  /* 00000560    "_HID.A.." */
+    0x03,0x08,0x5F,0x55,0x49,0x44,0x00,0x14,  /* 00000568    ".._UID.." */
+    0x18,0x5F,0x53,0x54,0x41,0x00,0xA0,0x0C,  /* 00000570    "._STA..." */
+    0x93,0x5E,0x5E,0x5E,0x48,0x50,0x45,0x54,  /* 00000578    ".^^^HPET" */
+    0x00,0xA4,0x00,0xA1,0x04,0xA4,0x0A,0x0F,  /* 00000580    "........" */
+    0x08,0x5F,0x43,0x52,0x53,0x11,0x1F,0x0A,  /* 00000588    "._CRS..." */
+    0x1C,0x87,0x17,0x00,0x00,0x0D,0x01,0x00,  /* 00000590    "........" */
+    0x00,0x00,0x00,0x00,0x00,0xD0,0xFE,0xFF,  /* 00000598    "........" */
+    0x03,0xD0,0xFE,0x00,0x00,0x00,0x00,0x00,  /* 000005A0    "........" */
+    0x04,0x00,0x00,0x79,0x00,0x14,0x16,0x5F,  /* 000005A8    "...y..._" */
+    0x50,0x52,0x54,0x00,0xA0,0x0A,0x50,0x49,  /* 000005B0    "PRT...PI" */
+    0x43,0x44,0xA4,0x50,0x52,0x54,0x41,0xA4,  /* 000005B8    "CD.PRTA." */
+    0x50,0x52,0x54,0x50,0x08,0x50,0x52,0x54,  /* 000005C0    "PRTP.PRT" */
+    0x50,0x12,0x49,0x70,0x7C,0x12,0x0D,0x04,  /* 000005C8    "P.Ip|..." */
+    0x0C,0xFF,0xFF,0x01,0x00,0x00,0x4C,0x4E,  /* 000005D0    "......LN" */
+    0x4B,0x42,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 000005D8    "KB......" */
+    0xFF,0x01,0x00,0x01,0x4C,0x4E,0x4B,0x43,  /* 000005E0    "....LNKC" */
+    0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x01,  /* 000005E8    "........" */
+    0x00,0x0A,0x02,0x4C,0x4E,0x4B,0x44,0x00,  /* 000005F0    "...LNKD." */
+    0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x01,0x00,  /* 000005F8    "........" */
+    0x0A,0x03,0x4C,0x4E,0x4B,0x41,0x00,0x12,  /* 00000600    "..LNKA.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x02,0x00,0x00,  /* 00000608    "........" */
+    0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0D,0x04,  /* 00000610    "LNKC...." */
+    0x0C,0xFF,0xFF,0x02,0x00,0x01,0x4C,0x4E,  /* 00000618    "......LN" */
+    0x4B,0x44,0x00,0x12,0x0E,0x04,0x0C,0xFF,  /* 00000620    "KD......" */
+    0xFF,0x02,0x00,0x0A,0x02,0x4C,0x4E,0x4B,  /* 00000628    ".....LNK" */
+    0x41,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,  /* 00000630    "A......." */
+    0x02,0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x42,  /* 00000638    "....LNKB" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x03,  /* 00000640    "........" */
+    0x00,0x00,0x4C,0x4E,0x4B,0x44,0x00,0x12,  /* 00000648    "..LNKD.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x03,0x00,0x01,  /* 00000650    "........" */
+    0x4C,0x4E,0x4B,0x41,0x00,0x12,0x0E,0x04,  /* 00000658    "LNKA...." */
+    0x0C,0xFF,0xFF,0x03,0x00,0x0A,0x02,0x4C,  /* 00000660    ".......L" */
+    0x4E,0x4B,0x42,0x00,0x12,0x0E,0x04,0x0C,  /* 00000668    "NKB....." */
+    0xFF,0xFF,0x03,0x00,0x0A,0x03,0x4C,0x4E,  /* 00000670    "......LN" */
+    0x4B,0x43,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000678    "KC......" */
+    0xFF,0x04,0x00,0x00,0x4C,0x4E,0x4B,0x41,  /* 00000680    "....LNKA" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x04,  /* 00000688    "........" */
+    0x00,0x01,0x4C,0x4E,0x4B,0x42,0x00,0x12,  /* 00000690    "..LNKB.." */
+    0x0E,0x04,0x0C,0xFF,0xFF,0x04,0x00,0x0A,  /* 00000698    "........" */
+    0x02,0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0E,  /* 000006A0    ".LNKC..." */
+    0x04,0x0C,0xFF,0xFF,0x04,0x00,0x0A,0x03,  /* 000006A8    "........" */
+    0x4C,0x4E,0x4B,0x44,0x00,0x12,0x0D,0x04,  /* 000006B0    "LNKD...." */
+    0x0C,0xFF,0xFF,0x05,0x00,0x00,0x4C,0x4E,  /* 000006B8    "......LN" */
+    0x4B,0x42,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 000006C0    "KB......" */
+    0xFF,0x05,0x00,0x01,0x4C,0x4E,0x4B,0x43,  /* 000006C8    "....LNKC" */
+    0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x05,  /* 000006D0    "........" */
+    0x00,0x0A,0x02,0x4C,0x4E,0x4B,0x44,0x00,  /* 000006D8    "...LNKD." */
+    0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x05,0x00,  /* 000006E0    "........" */
+    0x0A,0x03,0x4C,0x4E,0x4B,0x41,0x00,0x12,  /* 000006E8    "..LNKA.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x06,0x00,0x00,  /* 000006F0    "........" */
+    0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0D,0x04,  /* 000006F8    "LNKC...." */
+    0x0C,0xFF,0xFF,0x06,0x00,0x01,0x4C,0x4E,  /* 00000700    "......LN" */
+    0x4B,0x44,0x00,0x12,0x0E,0x04,0x0C,0xFF,  /* 00000708    "KD......" */
+    0xFF,0x06,0x00,0x0A,0x02,0x4C,0x4E,0x4B,  /* 00000710    ".....LNK" */
+    0x41,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,  /* 00000718    "A......." */
+    0x06,0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x42,  /* 00000720    "....LNKB" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x07,  /* 00000728    "........" */
+    0x00,0x00,0x4C,0x4E,0x4B,0x44,0x00,0x12,  /* 00000730    "..LNKD.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x07,0x00,0x01,  /* 00000738    "........" */
+    0x4C,0x4E,0x4B,0x41,0x00,0x12,0x0E,0x04,  /* 00000740    "LNKA...." */
+    0x0C,0xFF,0xFF,0x07,0x00,0x0A,0x02,0x4C,  /* 00000748    ".......L" */
+    0x4E,0x4B,0x42,0x00,0x12,0x0E,0x04,0x0C,  /* 00000750    "NKB....." */
+    0xFF,0xFF,0x07,0x00,0x0A,0x03,0x4C,0x4E,  /* 00000758    "......LN" */
+    0x4B,0x43,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000760    "KC......" */
+    0xFF,0x08,0x00,0x00,0x4C,0x4E,0x4B,0x41,  /* 00000768    "....LNKA" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x08,  /* 00000770    "........" */
+    0x00,0x01,0x4C,0x4E,0x4B,0x42,0x00,0x12,  /* 00000778    "..LNKB.." */
+    0x0E,0x04,0x0C,0xFF,0xFF,0x08,0x00,0x0A,  /* 00000780    "........" */
+    0x02,0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0E,  /* 00000788    ".LNKC..." */
+    0x04,0x0C,0xFF,0xFF,0x08,0x00,0x0A,0x03,  /* 00000790    "........" */
+    0x4C,0x4E,0x4B,0x44,0x00,0x12,0x0D,0x04,  /* 00000798    "LNKD...." */
+    0x0C,0xFF,0xFF,0x09,0x00,0x00,0x4C,0x4E,  /* 000007A0    "......LN" */
+    0x4B,0x42,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 000007A8    "KB......" */
+    0xFF,0x09,0x00,0x01,0x4C,0x4E,0x4B,0x43,  /* 000007B0    "....LNKC" */
+    0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x09,  /* 000007B8    "........" */
+    0x00,0x0A,0x02,0x4C,0x4E,0x4B,0x44,0x00,  /* 000007C0    "...LNKD." */
+    0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x09,0x00,  /* 000007C8    "........" */
+    0x0A,0x03,0x4C,0x4E,0x4B,0x41,0x00,0x12,  /* 000007D0    "..LNKA.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x0A,0x00,0x00,  /* 000007D8    "........" */
+    0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0D,0x04,  /* 000007E0    "LNKC...." */
+    0x0C,0xFF,0xFF,0x0A,0x00,0x01,0x4C,0x4E,  /* 000007E8    "......LN" */
+    0x4B,0x44,0x00,0x12,0x0E,0x04,0x0C,0xFF,  /* 000007F0    "KD......" */
+    0xFF,0x0A,0x00,0x0A,0x02,0x4C,0x4E,0x4B,  /* 000007F8    ".....LNK" */
+    0x41,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,  /* 00000800    "A......." */
+    0x0A,0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x42,  /* 00000808    "....LNKB" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x0B,  /* 00000810    "........" */
+    0x00,0x00,0x4C,0x4E,0x4B,0x44,0x00,0x12,  /* 00000818    "..LNKD.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x0B,0x00,0x01,  /* 00000820    "........" */
+    0x4C,0x4E,0x4B,0x41,0x00,0x12,0x0E,0x04,  /* 00000828    "LNKA...." */
+    0x0C,0xFF,0xFF,0x0B,0x00,0x0A,0x02,0x4C,  /* 00000830    ".......L" */
+    0x4E,0x4B,0x42,0x00,0x12,0x0E,0x04,0x0C,  /* 00000838    "NKB....." */
+    0xFF,0xFF,0x0B,0x00,0x0A,0x03,0x4C,0x4E,  /* 00000840    "......LN" */
+    0x4B,0x43,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000848    "KC......" */
+    0xFF,0x0C,0x00,0x00,0x4C,0x4E,0x4B,0x41,  /* 00000850    "....LNKA" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x0C,  /* 00000858    "........" */
+    0x00,0x01,0x4C,0x4E,0x4B,0x42,0x00,0x12,  /* 00000860    "..LNKB.." */
+    0x0E,0x04,0x0C,0xFF,0xFF,0x0C,0x00,0x0A,  /* 00000868    "........" */
+    0x02,0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0E,  /* 00000870    ".LNKC..." */
+    0x04,0x0C,0xFF,0xFF,0x0C,0x00,0x0A,0x03,  /* 00000878    "........" */
+    0x4C,0x4E,0x4B,0x44,0x00,0x12,0x0D,0x04,  /* 00000880    "LNKD...." */
+    0x0C,0xFF,0xFF,0x0D,0x00,0x00,0x4C,0x4E,  /* 00000888    "......LN" */
+    0x4B,0x42,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000890    "KB......" */
+    0xFF,0x0D,0x00,0x01,0x4C,0x4E,0x4B,0x43,  /* 00000898    "....LNKC" */
+    0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x0D,  /* 000008A0    "........" */
+    0x00,0x0A,0x02,0x4C,0x4E,0x4B,0x44,0x00,  /* 000008A8    "...LNKD." */
+    0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x0D,0x00,  /* 000008B0    "........" */
+    0x0A,0x03,0x4C,0x4E,0x4B,0x41,0x00,0x12,  /* 000008B8    "..LNKA.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x0E,0x00,0x00,  /* 000008C0    "........" */
+    0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0D,0x04,  /* 000008C8    "LNKC...." */
+    0x0C,0xFF,0xFF,0x0E,0x00,0x01,0x4C,0x4E,  /* 000008D0    "......LN" */
+    0x4B,0x44,0x00,0x12,0x0E,0x04,0x0C,0xFF,  /* 000008D8    "KD......" */
+    0xFF,0x0E,0x00,0x0A,0x02,0x4C,0x4E,0x4B,  /* 000008E0    ".....LNK" */
+    0x41,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,  /* 000008E8    "A......." */
+    0x0E,0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x42,  /* 000008F0    "....LNKB" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x0F,  /* 000008F8    "........" */
+    0x00,0x00,0x4C,0x4E,0x4B,0x44,0x00,0x12,  /* 00000900    "..LNKD.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x0F,0x00,0x01,  /* 00000908    "........" */
+    0x4C,0x4E,0x4B,0x41,0x00,0x12,0x0E,0x04,  /* 00000910    "LNKA...." */
+    0x0C,0xFF,0xFF,0x0F,0x00,0x0A,0x02,0x4C,  /* 00000918    ".......L" */
+    0x4E,0x4B,0x42,0x00,0x12,0x0E,0x04,0x0C,  /* 00000920    "NKB....." */
+    0xFF,0xFF,0x0F,0x00,0x0A,0x03,0x4C,0x4E,  /* 00000928    "......LN" */
+    0x4B,0x43,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000930    "KC......" */
+    0xFF,0x10,0x00,0x00,0x4C,0x4E,0x4B,0x41,  /* 00000938    "....LNKA" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x10,  /* 00000940    "........" */
+    0x00,0x01,0x4C,0x4E,0x4B,0x42,0x00,0x12,  /* 00000948    "..LNKB.." */
+    0x0E,0x04,0x0C,0xFF,0xFF,0x10,0x00,0x0A,  /* 00000950    "........" */
+    0x02,0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0E,  /* 00000958    ".LNKC..." */
+    0x04,0x0C,0xFF,0xFF,0x10,0x00,0x0A,0x03,  /* 00000960    "........" */
+    0x4C,0x4E,0x4B,0x44,0x00,0x12,0x0D,0x04,  /* 00000968    "LNKD...." */
+    0x0C,0xFF,0xFF,0x11,0x00,0x00,0x4C,0x4E,  /* 00000970    "......LN" */
+    0x4B,0x42,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000978    "KB......" */
+    0xFF,0x11,0x00,0x01,0x4C,0x4E,0x4B,0x43,  /* 00000980    "....LNKC" */
+    0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x11,  /* 00000988    "........" */
+    0x00,0x0A,0x02,0x4C,0x4E,0x4B,0x44,0x00,  /* 00000990    "...LNKD." */
+    0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x11,0x00,  /* 00000998    "........" */
+    0x0A,0x03,0x4C,0x4E,0x4B,0x41,0x00,0x12,  /* 000009A0    "..LNKA.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x12,0x00,0x00,  /* 000009A8    "........" */
+    0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0D,0x04,  /* 000009B0    "LNKC...." */
+    0x0C,0xFF,0xFF,0x12,0x00,0x01,0x4C,0x4E,  /* 000009B8    "......LN" */
+    0x4B,0x44,0x00,0x12,0x0E,0x04,0x0C,0xFF,  /* 000009C0    "KD......" */
+    0xFF,0x12,0x00,0x0A,0x02,0x4C,0x4E,0x4B,  /* 000009C8    ".....LNK" */
+    0x41,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,  /* 000009D0    "A......." */
+    0x12,0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x42,  /* 000009D8    "....LNKB" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x13,  /* 000009E0    "........" */
+    0x00,0x00,0x4C,0x4E,0x4B,0x44,0x00,0x12,  /* 000009E8    "..LNKD.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x13,0x00,0x01,  /* 000009F0    "........" */
+    0x4C,0x4E,0x4B,0x41,0x00,0x12,0x0E,0x04,  /* 000009F8    "LNKA...." */
+    0x0C,0xFF,0xFF,0x13,0x00,0x0A,0x02,0x4C,  /* 00000A00    ".......L" */
+    0x4E,0x4B,0x42,0x00,0x12,0x0E,0x04,0x0C,  /* 00000A08    "NKB....." */
+    0xFF,0xFF,0x13,0x00,0x0A,0x03,0x4C,0x4E,  /* 00000A10    "......LN" */
+    0x4B,0x43,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000A18    "KC......" */
+    0xFF,0x14,0x00,0x00,0x4C,0x4E,0x4B,0x41,  /* 00000A20    "....LNKA" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x14,  /* 00000A28    "........" */
+    0x00,0x01,0x4C,0x4E,0x4B,0x42,0x00,0x12,  /* 00000A30    "..LNKB.." */
+    0x0E,0x04,0x0C,0xFF,0xFF,0x14,0x00,0x0A,  /* 00000A38    "........" */
+    0x02,0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0E,  /* 00000A40    ".LNKC..." */
+    0x04,0x0C,0xFF,0xFF,0x14,0x00,0x0A,0x03,  /* 00000A48    "........" */
+    0x4C,0x4E,0x4B,0x44,0x00,0x12,0x0D,0x04,  /* 00000A50    "LNKD...." */
+    0x0C,0xFF,0xFF,0x15,0x00,0x00,0x4C,0x4E,  /* 00000A58    "......LN" */
+    0x4B,0x42,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000A60    "KB......" */
+    0xFF,0x15,0x00,0x01,0x4C,0x4E,0x4B,0x43,  /* 00000A68    "....LNKC" */
+    0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x15,  /* 00000A70    "........" */
+    0x00,0x0A,0x02,0x4C,0x4E,0x4B,0x44,0x00,  /* 00000A78    "...LNKD." */
+    0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x15,0x00,  /* 00000A80    "........" */
+    0x0A,0x03,0x4C,0x4E,0x4B,0x41,0x00,0x12,  /* 00000A88    "..LNKA.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x16,0x00,0x00,  /* 00000A90    "........" */
+    0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0D,0x04,  /* 00000A98    "LNKC...." */
+    0x0C,0xFF,0xFF,0x16,0x00,0x01,0x4C,0x4E,  /* 00000AA0    "......LN" */
+    0x4B,0x44,0x00,0x12,0x0E,0x04,0x0C,0xFF,  /* 00000AA8    "KD......" */
+    0xFF,0x16,0x00,0x0A,0x02,0x4C,0x4E,0x4B,  /* 00000AB0    ".....LNK" */
+    0x41,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,  /* 00000AB8    "A......." */
+    0x16,0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x42,  /* 00000AC0    "....LNKB" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x17,  /* 00000AC8    "........" */
+    0x00,0x00,0x4C,0x4E,0x4B,0x44,0x00,0x12,  /* 00000AD0    "..LNKD.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x17,0x00,0x01,  /* 00000AD8    "........" */
+    0x4C,0x4E,0x4B,0x41,0x00,0x12,0x0E,0x04,  /* 00000AE0    "LNKA...." */
+    0x0C,0xFF,0xFF,0x17,0x00,0x0A,0x02,0x4C,  /* 00000AE8    ".......L" */
+    0x4E,0x4B,0x42,0x00,0x12,0x0E,0x04,0x0C,  /* 00000AF0    "NKB....." */
+    0xFF,0xFF,0x17,0x00,0x0A,0x03,0x4C,0x4E,  /* 00000AF8    "......LN" */
+    0x4B,0x43,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000B00    "KC......" */
+    0xFF,0x18,0x00,0x00,0x4C,0x4E,0x4B,0x41,  /* 00000B08    "....LNKA" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x18,  /* 00000B10    "........" */
+    0x00,0x01,0x4C,0x4E,0x4B,0x42,0x00,0x12,  /* 00000B18    "..LNKB.." */
+    0x0E,0x04,0x0C,0xFF,0xFF,0x18,0x00,0x0A,  /* 00000B20    "........" */
+    0x02,0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0E,  /* 00000B28    ".LNKC..." */
+    0x04,0x0C,0xFF,0xFF,0x18,0x00,0x0A,0x03,  /* 00000B30    "........" */
+    0x4C,0x4E,0x4B,0x44,0x00,0x12,0x0D,0x04,  /* 00000B38    "LNKD...." */
+    0x0C,0xFF,0xFF,0x19,0x00,0x00,0x4C,0x4E,  /* 00000B40    "......LN" */
+    0x4B,0x42,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000B48    "KB......" */
+    0xFF,0x19,0x00,0x01,0x4C,0x4E,0x4B,0x43,  /* 00000B50    "....LNKC" */
+    0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x19,  /* 00000B58    "........" */
+    0x00,0x0A,0x02,0x4C,0x4E,0x4B,0x44,0x00,  /* 00000B60    "...LNKD." */
+    0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x19,0x00,  /* 00000B68    "........" */
+    0x0A,0x03,0x4C,0x4E,0x4B,0x41,0x00,0x12,  /* 00000B70    "..LNKA.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x1A,0x00,0x00,  /* 00000B78    "........" */
+    0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0D,0x04,  /* 00000B80    "LNKC...." */
+    0x0C,0xFF,0xFF,0x1A,0x00,0x01,0x4C,0x4E,  /* 00000B88    "......LN" */
+    0x4B,0x44,0x00,0x12,0x0E,0x04,0x0C,0xFF,  /* 00000B90    "KD......" */
+    0xFF,0x1A,0x00,0x0A,0x02,0x4C,0x4E,0x4B,  /* 00000B98    ".....LNK" */
+    0x41,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,  /* 00000BA0    "A......." */
+    0x1A,0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x42,  /* 00000BA8    "....LNKB" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x1B,  /* 00000BB0    "........" */
+    0x00,0x00,0x4C,0x4E,0x4B,0x44,0x00,0x12,  /* 00000BB8    "..LNKD.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x1B,0x00,0x01,  /* 00000BC0    "........" */
+    0x4C,0x4E,0x4B,0x41,0x00,0x12,0x0E,0x04,  /* 00000BC8    "LNKA...." */
+    0x0C,0xFF,0xFF,0x1B,0x00,0x0A,0x02,0x4C,  /* 00000BD0    ".......L" */
+    0x4E,0x4B,0x42,0x00,0x12,0x0E,0x04,0x0C,  /* 00000BD8    "NKB....." */
+    0xFF,0xFF,0x1B,0x00,0x0A,0x03,0x4C,0x4E,  /* 00000BE0    "......LN" */
+    0x4B,0x43,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000BE8    "KC......" */
+    0xFF,0x1C,0x00,0x00,0x4C,0x4E,0x4B,0x41,  /* 00000BF0    "....LNKA" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x1C,  /* 00000BF8    "........" */
+    0x00,0x01,0x4C,0x4E,0x4B,0x42,0x00,0x12,  /* 00000C00    "..LNKB.." */
+    0x0E,0x04,0x0C,0xFF,0xFF,0x1C,0x00,0x0A,  /* 00000C08    "........" */
+    0x02,0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0E,  /* 00000C10    ".LNKC..." */
+    0x04,0x0C,0xFF,0xFF,0x1C,0x00,0x0A,0x03,  /* 00000C18    "........" */
+    0x4C,0x4E,0x4B,0x44,0x00,0x12,0x0D,0x04,  /* 00000C20    "LNKD...." */
+    0x0C,0xFF,0xFF,0x1D,0x00,0x00,0x4C,0x4E,  /* 00000C28    "......LN" */
+    0x4B,0x42,0x00,0x12,0x0D,0x04,0x0C,0xFF,  /* 00000C30    "KB......" */
+    0xFF,0x1D,0x00,0x01,0x4C,0x4E,0x4B,0x43,  /* 00000C38    "....LNKC" */
+    0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x1D,  /* 00000C40    "........" */
+    0x00,0x0A,0x02,0x4C,0x4E,0x4B,0x44,0x00,  /* 00000C48    "...LNKD." */
+    0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x1D,0x00,  /* 00000C50    "........" */
+    0x0A,0x03,0x4C,0x4E,0x4B,0x41,0x00,0x12,  /* 00000C58    "..LNKA.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x1E,0x00,0x00,  /* 00000C60    "........" */
+    0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0D,0x04,  /* 00000C68    "LNKC...." */
+    0x0C,0xFF,0xFF,0x1E,0x00,0x01,0x4C,0x4E,  /* 00000C70    "......LN" */
+    0x4B,0x44,0x00,0x12,0x0E,0x04,0x0C,0xFF,  /* 00000C78    "KD......" */
+    0xFF,0x1E,0x00,0x0A,0x02,0x4C,0x4E,0x4B,  /* 00000C80    ".....LNK" */
+    0x41,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,  /* 00000C88    "A......." */
+    0x1E,0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x42,  /* 00000C90    "....LNKB" */
+    0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x1F,  /* 00000C98    "........" */
+    0x00,0x00,0x4C,0x4E,0x4B,0x44,0x00,0x12,  /* 00000CA0    "..LNKD.." */
+    0x0D,0x04,0x0C,0xFF,0xFF,0x1F,0x00,0x01,  /* 00000CA8    "........" */
+    0x4C,0x4E,0x4B,0x41,0x00,0x12,0x0E,0x04,  /* 00000CB0    "LNKA...." */
+    0x0C,0xFF,0xFF,0x1F,0x00,0x0A,0x02,0x4C,  /* 00000CB8    ".......L" */
+    0x4E,0x4B,0x42,0x00,0x12,0x0E,0x04,0x0C,  /* 00000CC0    "NKB....." */
+    0xFF,0xFF,0x1F,0x00,0x0A,0x03,0x4C,0x4E,  /* 00000CC8    "......LN" */
+    0x4B,0x43,0x00,0x08,0x50,0x52,0x54,0x41,  /* 00000CD0    "KC..PRTA" */
+    0x12,0x41,0x61,0x7C,0x12,0x0B,0x04,0x0C,  /* 00000CD8    ".Aa|...." */
+    0xFF,0xFF,0x01,0x00,0x00,0x00,0x0A,0x14,  /* 00000CE0    "........" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x01,0x00,  /* 00000CE8    "........" */
+    0x01,0x00,0x0A,0x15,0x12,0x0C,0x04,0x0C,  /* 00000CF0    "........" */
+    0xFF,0xFF,0x01,0x00,0x0A,0x02,0x00,0x0A,  /* 00000CF8    "........" */
+    0x16,0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x01,  /* 00000D00    "........" */
+    0x00,0x0A,0x03,0x00,0x0A,0x17,0x12,0x0B,  /* 00000D08    "........" */
+    0x04,0x0C,0xFF,0xFF,0x02,0x00,0x00,0x00,  /* 00000D10    "........" */
+    0x0A,0x18,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000D18    "........" */
+    0x02,0x00,0x01,0x00,0x0A,0x19,0x12,0x0C,  /* 00000D20    "........" */
+    0x04,0x0C,0xFF,0xFF,0x02,0x00,0x0A,0x02,  /* 00000D28    "........" */
+    0x00,0x0A,0x1A,0x12,0x0C,0x04,0x0C,0xFF,  /* 00000D30    "........" */
+    0xFF,0x02,0x00,0x0A,0x03,0x00,0x0A,0x1B,  /* 00000D38    "........" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x03,0x00,  /* 00000D40    "........" */
+    0x00,0x00,0x0A,0x1C,0x12,0x0B,0x04,0x0C,  /* 00000D48    "........" */
+    0xFF,0xFF,0x03,0x00,0x01,0x00,0x0A,0x1D,  /* 00000D50    "........" */
+    0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x03,0x00,  /* 00000D58    "........" */
+    0x0A,0x02,0x00,0x0A,0x1E,0x12,0x0C,0x04,  /* 00000D60    "........" */
+    0x0C,0xFF,0xFF,0x03,0x00,0x0A,0x03,0x00,  /* 00000D68    "........" */
+    0x0A,0x1F,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000D70    "........" */
+    0x04,0x00,0x00,0x00,0x0A,0x20,0x12,0x0B,  /* 00000D78    "..... .." */
+    0x04,0x0C,0xFF,0xFF,0x04,0x00,0x01,0x00,  /* 00000D80    "........" */
+    0x0A,0x21,0x12,0x0C,0x04,0x0C,0xFF,0xFF,  /* 00000D88    ".!......" */
+    0x04,0x00,0x0A,0x02,0x00,0x0A,0x22,0x12,  /* 00000D90    "......"." */
+    0x0C,0x04,0x0C,0xFF,0xFF,0x04,0x00,0x0A,  /* 00000D98    "........" */
+    0x03,0x00,0x0A,0x23,0x12,0x0B,0x04,0x0C,  /* 00000DA0    "...#...." */
+    0xFF,0xFF,0x05,0x00,0x00,0x00,0x0A,0x24,  /* 00000DA8    ".......$" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x05,0x00,  /* 00000DB0    "........" */
+    0x01,0x00,0x0A,0x25,0x12,0x0C,0x04,0x0C,  /* 00000DB8    "...%...." */
+    0xFF,0xFF,0x05,0x00,0x0A,0x02,0x00,0x0A,  /* 00000DC0    "........" */
+    0x26,0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x05,  /* 00000DC8    "&......." */
+    0x00,0x0A,0x03,0x00,0x0A,0x27,0x12,0x0B,  /* 00000DD0    ".....'.." */
+    0x04,0x0C,0xFF,0xFF,0x06,0x00,0x00,0x00,  /* 00000DD8    "........" */
+    0x0A,0x28,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000DE0    ".(......" */
+    0x06,0x00,0x01,0x00,0x0A,0x29,0x12,0x0C,  /* 00000DE8    ".....).." */
+    0x04,0x0C,0xFF,0xFF,0x06,0x00,0x0A,0x02,  /* 00000DF0    "........" */
+    0x00,0x0A,0x2A,0x12,0x0C,0x04,0x0C,0xFF,  /* 00000DF8    "..*....." */
+    0xFF,0x06,0x00,0x0A,0x03,0x00,0x0A,0x2B,  /* 00000E00    ".......+" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x07,0x00,  /* 00000E08    "........" */
+    0x00,0x00,0x0A,0x2C,0x12,0x0B,0x04,0x0C,  /* 00000E10    "...,...." */
+    0xFF,0xFF,0x07,0x00,0x01,0x00,0x0A,0x2D,  /* 00000E18    ".......-" */
+    0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x07,0x00,  /* 00000E20    "........" */
+    0x0A,0x02,0x00,0x0A,0x2E,0x12,0x0C,0x04,  /* 00000E28    "........" */
+    0x0C,0xFF,0xFF,0x07,0x00,0x0A,0x03,0x00,  /* 00000E30    "........" */
+    0x0A,0x2F,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000E38    "./......" */
+    0x08,0x00,0x00,0x00,0x0A,0x11,0x12,0x0B,  /* 00000E40    "........" */
+    0x04,0x0C,0xFF,0xFF,0x08,0x00,0x01,0x00,  /* 00000E48    "........" */
+    0x0A,0x12,0x12,0x0C,0x04,0x0C,0xFF,0xFF,  /* 00000E50    "........" */
+    0x08,0x00,0x0A,0x02,0x00,0x0A,0x13,0x12,  /* 00000E58    "........" */
+    0x0C,0x04,0x0C,0xFF,0xFF,0x08,0x00,0x0A,  /* 00000E60    "........" */
+    0x03,0x00,0x0A,0x14,0x12,0x0B,0x04,0x0C,  /* 00000E68    "........" */
+    0xFF,0xFF,0x09,0x00,0x00,0x00,0x0A,0x15,  /* 00000E70    "........" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x09,0x00,  /* 00000E78    "........" */
+    0x01,0x00,0x0A,0x16,0x12,0x0C,0x04,0x0C,  /* 00000E80    "........" */
+    0xFF,0xFF,0x09,0x00,0x0A,0x02,0x00,0x0A,  /* 00000E88    "........" */
+    0x17,0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x09,  /* 00000E90    "........" */
+    0x00,0x0A,0x03,0x00,0x0A,0x18,0x12,0x0B,  /* 00000E98    "........" */
+    0x04,0x0C,0xFF,0xFF,0x0A,0x00,0x00,0x00,  /* 00000EA0    "........" */
+    0x0A,0x19,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000EA8    "........" */
+    0x0A,0x00,0x01,0x00,0x0A,0x1A,0x12,0x0C,  /* 00000EB0    "........" */
+    0x04,0x0C,0xFF,0xFF,0x0A,0x00,0x0A,0x02,  /* 00000EB8    "........" */
+    0x00,0x0A,0x1B,0x12,0x0C,0x04,0x0C,0xFF,  /* 00000EC0    "........" */
+    0xFF,0x0A,0x00,0x0A,0x03,0x00,0x0A,0x1C,  /* 00000EC8    "........" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x0B,0x00,  /* 00000ED0    "........" */
+    0x00,0x00,0x0A,0x1D,0x12,0x0B,0x04,0x0C,  /* 00000ED8    "........" */
+    0xFF,0xFF,0x0B,0x00,0x01,0x00,0x0A,0x1E,  /* 00000EE0    "........" */
+    0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x0B,0x00,  /* 00000EE8    "........" */
+    0x0A,0x02,0x00,0x0A,0x1F,0x12,0x0C,0x04,  /* 00000EF0    "........" */
+    0x0C,0xFF,0xFF,0x0B,0x00,0x0A,0x03,0x00,  /* 00000EF8    "........" */
+    0x0A,0x20,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000F00    ". ......" */
+    0x0C,0x00,0x00,0x00,0x0A,0x21,0x12,0x0B,  /* 00000F08    ".....!.." */
+    0x04,0x0C,0xFF,0xFF,0x0C,0x00,0x01,0x00,  /* 00000F10    "........" */
+    0x0A,0x22,0x12,0x0C,0x04,0x0C,0xFF,0xFF,  /* 00000F18    "."......" */
+    0x0C,0x00,0x0A,0x02,0x00,0x0A,0x23,0x12,  /* 00000F20    "......#." */
+    0x0C,0x04,0x0C,0xFF,0xFF,0x0C,0x00,0x0A,  /* 00000F28    "........" */
+    0x03,0x00,0x0A,0x24,0x12,0x0B,0x04,0x0C,  /* 00000F30    "...$...." */
+    0xFF,0xFF,0x0D,0x00,0x00,0x00,0x0A,0x25,  /* 00000F38    ".......%" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x0D,0x00,  /* 00000F40    "........" */
+    0x01,0x00,0x0A,0x26,0x12,0x0C,0x04,0x0C,  /* 00000F48    "...&...." */
+    0xFF,0xFF,0x0D,0x00,0x0A,0x02,0x00,0x0A,  /* 00000F50    "........" */
+    0x27,0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x0D,  /* 00000F58    "'......." */
+    0x00,0x0A,0x03,0x00,0x0A,0x28,0x12,0x0B,  /* 00000F60    ".....(.." */
+    0x04,0x0C,0xFF,0xFF,0x0E,0x00,0x00,0x00,  /* 00000F68    "........" */
+    0x0A,0x29,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000F70    ".)......" */
+    0x0E,0x00,0x01,0x00,0x0A,0x2A,0x12,0x0C,  /* 00000F78    ".....*.." */
+    0x04,0x0C,0xFF,0xFF,0x0E,0x00,0x0A,0x02,  /* 00000F80    "........" */
+    0x00,0x0A,0x2B,0x12,0x0C,0x04,0x0C,0xFF,  /* 00000F88    "..+....." */
+    0xFF,0x0E,0x00,0x0A,0x03,0x00,0x0A,0x2C,  /* 00000F90    ".......," */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x0F,0x00,  /* 00000F98    "........" */
+    0x00,0x00,0x0A,0x2D,0x12,0x0B,0x04,0x0C,  /* 00000FA0    "...-...." */
+    0xFF,0xFF,0x0F,0x00,0x01,0x00,0x0A,0x2E,  /* 00000FA8    "........" */
+    0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x0F,0x00,  /* 00000FB0    "........" */
+    0x0A,0x02,0x00,0x0A,0x2F,0x12,0x0C,0x04,  /* 00000FB8    "..../..." */
+    0x0C,0xFF,0xFF,0x0F,0x00,0x0A,0x03,0x00,  /* 00000FC0    "........" */
+    0x0A,0x10,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00000FC8    "........" */
+    0x10,0x00,0x00,0x00,0x0A,0x12,0x12,0x0B,  /* 00000FD0    "........" */
+    0x04,0x0C,0xFF,0xFF,0x10,0x00,0x01,0x00,  /* 00000FD8    "........" */
+    0x0A,0x13,0x12,0x0C,0x04,0x0C,0xFF,0xFF,  /* 00000FE0    "........" */
+    0x10,0x00,0x0A,0x02,0x00,0x0A,0x14,0x12,  /* 00000FE8    "........" */
+    0x0C,0x04,0x0C,0xFF,0xFF,0x10,0x00,0x0A,  /* 00000FF0    "........" */
+    0x03,0x00,0x0A,0x15,0x12,0x0B,0x04,0x0C,  /* 00000FF8    "........" */
+    0xFF,0xFF,0x11,0x00,0x00,0x00,0x0A,0x16,  /* 00001000    "........" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x11,0x00,  /* 00001008    "........" */
+    0x01,0x00,0x0A,0x17,0x12,0x0C,0x04,0x0C,  /* 00001010    "........" */
+    0xFF,0xFF,0x11,0x00,0x0A,0x02,0x00,0x0A,  /* 00001018    "........" */
+    0x18,0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x11,  /* 00001020    "........" */
+    0x00,0x0A,0x03,0x00,0x0A,0x19,0x12,0x0B,  /* 00001028    "........" */
+    0x04,0x0C,0xFF,0xFF,0x12,0x00,0x00,0x00,  /* 00001030    "........" */
+    0x0A,0x1A,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00001038    "........" */
+    0x12,0x00,0x01,0x00,0x0A,0x1B,0x12,0x0C,  /* 00001040    "........" */
+    0x04,0x0C,0xFF,0xFF,0x12,0x00,0x0A,0x02,  /* 00001048    "........" */
+    0x00,0x0A,0x1C,0x12,0x0C,0x04,0x0C,0xFF,  /* 00001050    "........" */
+    0xFF,0x12,0x00,0x0A,0x03,0x00,0x0A,0x1D,  /* 00001058    "........" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x13,0x00,  /* 00001060    "........" */
+    0x00,0x00,0x0A,0x1E,0x12,0x0B,0x04,0x0C,  /* 00001068    "........" */
+    0xFF,0xFF,0x13,0x00,0x01,0x00,0x0A,0x1F,  /* 00001070    "........" */
+    0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x13,0x00,  /* 00001078    "........" */
+    0x0A,0x02,0x00,0x0A,0x20,0x12,0x0C,0x04,  /* 00001080    ".... ..." */
+    0x0C,0xFF,0xFF,0x13,0x00,0x0A,0x03,0x00,  /* 00001088    "........" */
+    0x0A,0x21,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00001090    ".!......" */
+    0x14,0x00,0x00,0x00,0x0A,0x22,0x12,0x0B,  /* 00001098    ".....".." */
+    0x04,0x0C,0xFF,0xFF,0x14,0x00,0x01,0x00,  /* 000010A0    "........" */
+    0x0A,0x23,0x12,0x0C,0x04,0x0C,0xFF,0xFF,  /* 000010A8    ".#......" */
+    0x14,0x00,0x0A,0x02,0x00,0x0A,0x24,0x12,  /* 000010B0    "......$." */
+    0x0C,0x04,0x0C,0xFF,0xFF,0x14,0x00,0x0A,  /* 000010B8    "........" */
+    0x03,0x00,0x0A,0x25,0x12,0x0B,0x04,0x0C,  /* 000010C0    "...%...." */
+    0xFF,0xFF,0x15,0x00,0x00,0x00,0x0A,0x26,  /* 000010C8    ".......&" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x15,0x00,  /* 000010D0    "........" */
+    0x01,0x00,0x0A,0x27,0x12,0x0C,0x04,0x0C,  /* 000010D8    "...'...." */
+    0xFF,0xFF,0x15,0x00,0x0A,0x02,0x00,0x0A,  /* 000010E0    "........" */
+    0x28,0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x15,  /* 000010E8    "(......." */
+    0x00,0x0A,0x03,0x00,0x0A,0x29,0x12,0x0B,  /* 000010F0    ".....).." */
+    0x04,0x0C,0xFF,0xFF,0x16,0x00,0x00,0x00,  /* 000010F8    "........" */
+    0x0A,0x2A,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00001100    ".*......" */
+    0x16,0x00,0x01,0x00,0x0A,0x2B,0x12,0x0C,  /* 00001108    ".....+.." */
+    0x04,0x0C,0xFF,0xFF,0x16,0x00,0x0A,0x02,  /* 00001110    "........" */
+    0x00,0x0A,0x2C,0x12,0x0C,0x04,0x0C,0xFF,  /* 00001118    "..,....." */
+    0xFF,0x16,0x00,0x0A,0x03,0x00,0x0A,0x2D,  /* 00001120    ".......-" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x17,0x00,  /* 00001128    "........" */
+    0x00,0x00,0x0A,0x2E,0x12,0x0B,0x04,0x0C,  /* 00001130    "........" */
+    0xFF,0xFF,0x17,0x00,0x01,0x00,0x0A,0x2F,  /* 00001138    "......./" */
+    0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x17,0x00,  /* 00001140    "........" */
+    0x0A,0x02,0x00,0x0A,0x10,0x12,0x0C,0x04,  /* 00001148    "........" */
+    0x0C,0xFF,0xFF,0x17,0x00,0x0A,0x03,0x00,  /* 00001150    "........" */
+    0x0A,0x11,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00001158    "........" */
+    0x18,0x00,0x00,0x00,0x0A,0x13,0x12,0x0B,  /* 00001160    "........" */
+    0x04,0x0C,0xFF,0xFF,0x18,0x00,0x01,0x00,  /* 00001168    "........" */
+    0x0A,0x14,0x12,0x0C,0x04,0x0C,0xFF,0xFF,  /* 00001170    "........" */
+    0x18,0x00,0x0A,0x02,0x00,0x0A,0x15,0x12,  /* 00001178    "........" */
+    0x0C,0x04,0x0C,0xFF,0xFF,0x18,0x00,0x0A,  /* 00001180    "........" */
+    0x03,0x00,0x0A,0x16,0x12,0x0B,0x04,0x0C,  /* 00001188    "........" */
+    0xFF,0xFF,0x19,0x00,0x00,0x00,0x0A,0x17,  /* 00001190    "........" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x19,0x00,  /* 00001198    "........" */
+    0x01,0x00,0x0A,0x18,0x12,0x0C,0x04,0x0C,  /* 000011A0    "........" */
+    0xFF,0xFF,0x19,0x00,0x0A,0x02,0x00,0x0A,  /* 000011A8    "........" */
+    0x19,0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x19,  /* 000011B0    "........" */
+    0x00,0x0A,0x03,0x00,0x0A,0x1A,0x12,0x0B,  /* 000011B8    "........" */
+    0x04,0x0C,0xFF,0xFF,0x1A,0x00,0x00,0x00,  /* 000011C0    "........" */
+    0x0A,0x1B,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 000011C8    "........" */
+    0x1A,0x00,0x01,0x00,0x0A,0x1C,0x12,0x0C,  /* 000011D0    "........" */
+    0x04,0x0C,0xFF,0xFF,0x1A,0x00,0x0A,0x02,  /* 000011D8    "........" */
+    0x00,0x0A,0x1D,0x12,0x0C,0x04,0x0C,0xFF,  /* 000011E0    "........" */
+    0xFF,0x1A,0x00,0x0A,0x03,0x00,0x0A,0x1E,  /* 000011E8    "........" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x1B,0x00,  /* 000011F0    "........" */
+    0x00,0x00,0x0A,0x1F,0x12,0x0B,0x04,0x0C,  /* 000011F8    "........" */
+    0xFF,0xFF,0x1B,0x00,0x01,0x00,0x0A,0x20,  /* 00001200    "....... " */
+    0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x1B,0x00,  /* 00001208    "........" */
+    0x0A,0x02,0x00,0x0A,0x21,0x12,0x0C,0x04,  /* 00001210    "....!..." */
+    0x0C,0xFF,0xFF,0x1B,0x00,0x0A,0x03,0x00,  /* 00001218    "........" */
+    0x0A,0x22,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00001220    "."......" */
+    0x1C,0x00,0x00,0x00,0x0A,0x23,0x12,0x0B,  /* 00001228    ".....#.." */
+    0x04,0x0C,0xFF,0xFF,0x1C,0x00,0x01,0x00,  /* 00001230    "........" */
+    0x0A,0x24,0x12,0x0C,0x04,0x0C,0xFF,0xFF,  /* 00001238    ".$......" */
+    0x1C,0x00,0x0A,0x02,0x00,0x0A,0x25,0x12,  /* 00001240    "......%." */
+    0x0C,0x04,0x0C,0xFF,0xFF,0x1C,0x00,0x0A,  /* 00001248    "........" */
+    0x03,0x00,0x0A,0x26,0x12,0x0B,0x04,0x0C,  /* 00001250    "...&...." */
+    0xFF,0xFF,0x1D,0x00,0x00,0x00,0x0A,0x27,  /* 00001258    ".......'" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x1D,0x00,  /* 00001260    "........" */
+    0x01,0x00,0x0A,0x28,0x12,0x0C,0x04,0x0C,  /* 00001268    "...(...." */
+    0xFF,0xFF,0x1D,0x00,0x0A,0x02,0x00,0x0A,  /* 00001270    "........" */
+    0x29,0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x1D,  /* 00001278    ")......." */
+    0x00,0x0A,0x03,0x00,0x0A,0x2A,0x12,0x0B,  /* 00001280    ".....*.." */
+    0x04,0x0C,0xFF,0xFF,0x1E,0x00,0x00,0x00,  /* 00001288    "........" */
+    0x0A,0x2B,0x12,0x0B,0x04,0x0C,0xFF,0xFF,  /* 00001290    ".+......" */
+    0x1E,0x00,0x01,0x00,0x0A,0x2C,0x12,0x0C,  /* 00001298    ".....,.." */
+    0x04,0x0C,0xFF,0xFF,0x1E,0x00,0x0A,0x02,  /* 000012A0    "........" */
+    0x00,0x0A,0x2D,0x12,0x0C,0x04,0x0C,0xFF,  /* 000012A8    "..-....." */
+    0xFF,0x1E,0x00,0x0A,0x03,0x00,0x0A,0x2E,  /* 000012B0    "........" */
+    0x12,0x0B,0x04,0x0C,0xFF,0xFF,0x1F,0x00,  /* 000012B8    "........" */
+    0x00,0x00,0x0A,0x2F,0x12,0x0B,0x04,0x0C,  /* 000012C0    ".../...." */
+    0xFF,0xFF,0x1F,0x00,0x01,0x00,0x0A,0x10,  /* 000012C8    "........" */
+    0x12,0x0C,0x04,0x0C,0xFF,0xFF,0x1F,0x00,  /* 000012D0    "........" */
+    0x0A,0x02,0x00,0x0A,0x11,0x12,0x0C,0x04,  /* 000012D8    "........" */
+    0x0C,0xFF,0xFF,0x1F,0x00,0x0A,0x03,0x00,  /* 000012E0    "........" */
+    0x0A,0x12,0x5B,0x82,0x46,0x37,0x49,0x53,  /* 000012E8    "..[.F7IS" */
+    0x41,0x5F,0x08,0x5F,0x41,0x44,0x52,0x0C,  /* 000012F0    "A_._ADR." */
+    0x00,0x00,0x01,0x00,0x5B,0x80,0x50,0x49,  /* 000012F8    "....[.PI" */
+    0x52,0x51,0x02,0x0A,0x60,0x0A,0x04,0x10,  /* 00001300    "RQ..`..." */
+    0x2E,0x5C,0x00,0x5B,0x81,0x29,0x5C,0x2F,  /* 00001308    ".\.[.)\/" */
+    0x04,0x5F,0x53,0x42,0x5F,0x50,0x43,0x49,  /* 00001310    "._SB_PCI" */
+    0x30,0x49,0x53,0x41,0x5F,0x50,0x49,0x52,  /* 00001318    "0ISA_PIR" */
+    0x51,0x01,0x50,0x49,0x52,0x41,0x08,0x50,  /* 00001320    "Q.PIRA.P" */
+    0x49,0x52,0x42,0x08,0x50,0x49,0x52,0x43,  /* 00001328    "IRB.PIRC" */
+    0x08,0x50,0x49,0x52,0x44,0x08,0x5B,0x82,  /* 00001330    ".PIRD.[." */
+    0x46,0x0B,0x53,0x59,0x53,0x52,0x08,0x5F,  /* 00001338    "F.SYSR._" */
+    0x48,0x49,0x44,0x0C,0x41,0xD0,0x0C,0x02,  /* 00001340    "HID.A..." */
+    0x08,0x5F,0x55,0x49,0x44,0x01,0x08,0x43,  /* 00001348    "._UID..C" */
+    0x52,0x53,0x5F,0x11,0x4E,0x08,0x0A,0x8A,  /* 00001350    "RS_.N..." */
+    0x47,0x01,0x10,0x00,0x10,0x00,0x00,0x10,  /* 00001358    "G......." */
+    0x47,0x01,0x22,0x00,0x22,0x00,0x00,0x0C,  /* 00001360    "G."."..." */
+    0x47,0x01,0x30,0x00,0x30,0x00,0x00,0x10,  /* 00001368    "G.0.0..." */
+    0x47,0x01,0x44,0x00,0x44,0x00,0x00,0x1C,  /* 00001370    "G.D.D..." */
+    0x47,0x01,0x62,0x00,0x62,0x00,0x00,0x02,  /* 00001378    "G.b.b..." */
+    0x47,0x01,0x65,0x00,0x65,0x00,0x00,0x0B,  /* 00001380    "G.e.e..." */
+    0x47,0x01,0x72,0x00,0x72,0x00,0x00,0x0E,  /* 00001388    "G.r.r..." */
+    0x47,0x01,0x80,0x00,0x80,0x00,0x00,0x01,  /* 00001390    "G......." */
+    0x47,0x01,0x84,0x00,0x84,0x00,0x00,0x03,  /* 00001398    "G......." */
+    0x47,0x01,0x88,0x00,0x88,0x00,0x00,0x01,  /* 000013A0    "G......." */
+    0x47,0x01,0x8C,0x00,0x8C,0x00,0x00,0x03,  /* 000013A8    "G......." */
+    0x47,0x01,0x90,0x00,0x90,0x00,0x00,0x10,  /* 000013B0    "G......." */
+    0x47,0x01,0xA2,0x00,0xA2,0x00,0x00,0x1C,  /* 000013B8    "G......." */
+    0x47,0x01,0xE0,0x00,0xE0,0x00,0x00,0x10,  /* 000013C0    "G......." */
+    0x47,0x01,0xA0,0x08,0xA0,0x08,0x00,0x04,  /* 000013C8    "G......." */
+    0x47,0x01,0xC0,0x0C,0xC0,0x0C,0x00,0x10,  /* 000013D0    "G......." */
+    0x47,0x01,0xD0,0x04,0xD0,0x04,0x00,0x02,  /* 000013D8    "G......." */
+    0x79,0x00,0x14,0x0B,0x5F,0x43,0x52,0x53,  /* 000013E0    "y..._CRS" */
+    0x00,0xA4,0x43,0x52,0x53,0x5F,0x5B,0x82,  /* 000013E8    "..CRS_[." */
+    0x2B,0x50,0x49,0x43,0x5F,0x08,0x5F,0x48,  /* 000013F0    "+PIC_._H" */
+    0x49,0x44,0x0B,0x41,0xD0,0x08,0x5F,0x43,  /* 000013F8    "ID.A.._C" */
+    0x52,0x53,0x11,0x18,0x0A,0x15,0x47,0x01,  /* 00001400    "RS....G." */
+    0x20,0x00,0x20,0x00,0x01,0x02,0x47,0x01,  /* 00001408    " . ...G." */
+    0xA0,0x00,0xA0,0x00,0x01,0x02,0x22,0x04,  /* 00001410    "......"." */
+    0x00,0x79,0x00,0x5B,0x82,0x47,0x05,0x44,  /* 00001418    ".y.[.G.D" */
+    0x4D,0x41,0x30,0x08,0x5F,0x48,0x49,0x44,  /* 00001420    "MA0._HID" */
+    0x0C,0x41,0xD0,0x02,0x00,0x08,0x5F,0x43,  /* 00001428    ".A...._C" */
+    0x52,0x53,0x11,0x41,0x04,0x0A,0x3D,0x2A,  /* 00001430    "RS.A..=*" */
+    0x10,0x04,0x47,0x01,0x00,0x00,0x00,0x00,  /* 00001438    "..G....." */
+    0x00,0x10,0x47,0x01,0x81,0x00,0x81,0x00,  /* 00001440    "..G....." */
+    0x00,0x03,0x47,0x01,0x87,0x00,0x87,0x00,  /* 00001448    "..G....." */
+    0x00,0x01,0x47,0x01,0x89,0x00,0x89,0x00,  /* 00001450    "..G....." */
+    0x00,0x03,0x47,0x01,0x8F,0x00,0x8F,0x00,  /* 00001458    "..G....." */
+    0x00,0x01,0x47,0x01,0xC0,0x00,0xC0,0x00,  /* 00001460    "..G....." */
+    0x00,0x20,0x47,0x01,0x80,0x04,0x80,0x04,  /* 00001468    ". G....." */
+    0x00,0x10,0x79,0x00,0x5B,0x82,0x25,0x54,  /* 00001470    "..y.[.%T" */
+    0x4D,0x52,0x5F,0x08,0x5F,0x48,0x49,0x44,  /* 00001478    "MR_._HID" */
+    0x0C,0x41,0xD0,0x01,0x00,0x08,0x5F,0x43,  /* 00001480    ".A...._C" */
+    0x52,0x53,0x11,0x10,0x0A,0x0D,0x47,0x01,  /* 00001488    "RS....G." */
+    0x40,0x00,0x40,0x00,0x00,0x04,0x22,0x01,  /* 00001490    "@.@..."." */
+    0x00,0x79,0x00,0x5B,0x82,0x25,0x52,0x54,  /* 00001498    ".y.[.%RT" */
+    0x43,0x5F,0x08,0x5F,0x48,0x49,0x44,0x0C,  /* 000014A0    "C_._HID." */
+    0x41,0xD0,0x0B,0x00,0x08,0x5F,0x43,0x52,  /* 000014A8    "A...._CR" */
+    0x53,0x11,0x10,0x0A,0x0D,0x47,0x01,0x70,  /* 000014B0    "S....G.p" */
+    0x00,0x70,0x00,0x00,0x02,0x22,0x00,0x01,  /* 000014B8    ".p...".." */
+    0x79,0x00,0x5B,0x82,0x22,0x53,0x50,0x4B,  /* 000014C0    "y.[."SPK" */
+    0x52,0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,  /* 000014C8    "R._HID.A" */
+    0xD0,0x08,0x00,0x08,0x5F,0x43,0x52,0x53,  /* 000014D0    "...._CRS" */
+    0x11,0x0D,0x0A,0x0A,0x47,0x01,0x61,0x00,  /* 000014D8    "....G.a." */
+    0x61,0x00,0x00,0x01,0x79,0x00,0x5B,0x82,  /* 000014E0    "a...y.[." */
+    0x31,0x50,0x53,0x32,0x4D,0x08,0x5F,0x48,  /* 000014E8    "1PS2M._H" */
+    0x49,0x44,0x0C,0x41,0xD0,0x0F,0x13,0x08,  /* 000014F0    "ID.A...." */
+    0x5F,0x43,0x49,0x44,0x0C,0x41,0xD0,0x0F,  /* 000014F8    "_CID.A.." */
+    0x13,0x14,0x09,0x5F,0x53,0x54,0x41,0x00,  /* 00001500    "..._STA." */
+    0xA4,0x0A,0x0F,0x08,0x5F,0x43,0x52,0x53,  /* 00001508    "...._CRS" */
+    0x11,0x08,0x0A,0x05,0x22,0x00,0x10,0x79,  /* 00001510    "...."..y" */
+    0x00,0x5B,0x82,0x42,0x04,0x50,0x53,0x32,  /* 00001518    ".[.B.PS2" */
+    0x4B,0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,  /* 00001520    "K._HID.A" */
+    0xD0,0x03,0x03,0x08,0x5F,0x43,0x49,0x44,  /* 00001528    "...._CID" */
+    0x0C,0x41,0xD0,0x03,0x0B,0x14,0x09,0x5F,  /* 00001530    ".A....._" */
+    0x53,0x54,0x41,0x00,0xA4,0x0A,0x0F,0x08,  /* 00001538    "STA....." */
+    0x5F,0x43,0x52,0x53,0x11,0x18,0x0A,0x15,  /* 00001540    "_CRS...." */
+    0x47,0x01,0x60,0x00,0x60,0x00,0x00,0x01,  /* 00001548    "G.`.`..." */
+    0x47,0x01,0x64,0x00,0x64,0x00,0x00,0x01,  /* 00001550    "G.d.d..." */
+    0x22,0x02,0x00,0x79,0x00,0x5B,0x82,0x3A,  /* 00001558    ""..y.[.:" */
+    0x46,0x44,0x43,0x30,0x08,0x5F,0x48,0x49,  /* 00001560    "FDC0._HI" */
+    0x44,0x0C,0x41,0xD0,0x07,0x00,0x14,0x09,  /* 00001568    "D.A....." */
+    0x5F,0x53,0x54,0x41,0x00,0xA4,0x0A,0x0F,  /* 00001570    "_STA...." */
+    0x08,0x5F,0x43,0x52,0x53,0x11,0x1B,0x0A,  /* 00001578    "._CRS..." */
+    0x18,0x47,0x01,0xF0,0x03,0xF0,0x03,0x01,  /* 00001580    ".G......" */
+    0x06,0x47,0x01,0xF7,0x03,0xF7,0x03,0x01,  /* 00001588    ".G......" */
+    0x01,0x22,0x40,0x00,0x2A,0x04,0x00,0x79,  /* 00001590    "."@.*..y" */
+    0x00,0x5B,0x82,0x46,0x04,0x55,0x41,0x52,  /* 00001598    ".[.F.UAR" */
+    0x31,0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,  /* 000015A0    "1._HID.A" */
+    0xD0,0x05,0x01,0x08,0x5F,0x55,0x49,0x44,  /* 000015A8    "...._UID" */
+    0x01,0x14,0x19,0x5F,0x53,0x54,0x41,0x00,  /* 000015B0    "..._STA." */
+    0xA0,0x0D,0x93,0x5E,0x5E,0x5E,0x5E,0x55,  /* 000015B8    "...^^^^U" */
+    0x41,0x52,0x31,0x00,0xA4,0x00,0xA1,0x04,  /* 000015C0    "AR1....." */
+    0xA4,0x0A,0x0F,0x08,0x5F,0x43,0x52,0x53,  /* 000015C8    "...._CRS" */
+    0x11,0x10,0x0A,0x0D,0x47,0x01,0xF8,0x03,  /* 000015D0    "....G..." */
+    0xF8,0x03,0x08,0x08,0x22,0x10,0x00,0x79,  /* 000015D8    "...."..y" */
+    0x00,0x5B,0x82,0x47,0x04,0x55,0x41,0x52,  /* 000015E0    ".[.G.UAR" */
+    0x32,0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,  /* 000015E8    "2._HID.A" */
+    0xD0,0x05,0x01,0x08,0x5F,0x55,0x49,0x44,  /* 000015F0    "...._UID" */
+    0x0A,0x02,0x14,0x19,0x5F,0x53,0x54,0x41,  /* 000015F8    "...._STA" */
+    0x00,0xA0,0x0D,0x93,0x5E,0x5E,0x5E,0x5E,  /* 00001600    "....^^^^" */
+    0x55,0x41,0x52,0x32,0x00,0xA4,0x00,0xA1,  /* 00001608    "UAR2...." */
+    0x04,0xA4,0x0A,0x0F,0x08,0x5F,0x43,0x52,  /* 00001610    "....._CR" */
+    0x53,0x11,0x10,0x0A,0x0D,0x47,0x01,0xF8,  /* 00001618    "S....G.." */
+    0x02,0xF8,0x02,0x08,0x08,0x22,0x08,0x00,  /* 00001620    ".....".." */
+    0x79,0x00,0x5B,0x82,0x36,0x4C,0x54,0x50,  /* 00001628    "y.[.6LTP" */
+    0x31,0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,  /* 00001630    "1._HID.A" */
+    0xD0,0x04,0x00,0x08,0x5F,0x55,0x49,0x44,  /* 00001638    "...._UID" */
+    0x0A,0x02,0x14,0x09,0x5F,0x53,0x54,0x41,  /* 00001640    "...._STA" */
+    0x00,0xA4,0x0A,0x0F,0x08,0x5F,0x43,0x52,  /* 00001648    "....._CR" */
+    0x53,0x11,0x10,0x0A,0x0D,0x47,0x01,0x78,  /* 00001650    "S....G.x" */
+    0x03,0x78,0x03,0x08,0x08,0x22,0x80,0x00,  /* 00001658    ".x...".." */
+    0x79,0x00,0x5B,0x82,0x49,0x0A,0x53,0x30,  /* 00001660    "y.[.I.S0" */
+    0x30,0x5F,0x08,0x5F,0x41,0x44,0x52,0x00,  /* 00001668    "0_._ADR." */
+    0x08,0x5F,0x53,0x55,0x4E,0x00,0x14,0x1F,  /* 00001670    "._SUN..." */
+    0x5F,0x50,0x53,0x30,0x00,0x70,0x00,0x5C,  /* 00001678    "_PS0.p.\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00001680    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 00001688    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x1F,  /* 00001690    "PEDPT2.." */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x00,0x5C,  /* 00001698    "_PS3.p.\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 000016A0    "._GPEDPT" */
+    0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,0x47,  /* 000016A8    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x2B,  /* 000016B0    "PEDPT2.+" */
+    0x5F,0x45,0x4A,0x30,0x01,0x70,0x00,0x5C,  /* 000016B8    "_EJ0.p.\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 000016C0    "._GPEDPT" */
+    0x31,0x70,0x0A,0x88,0x5C,0x2E,0x5F,0x47,  /* 000016C8    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x70,0x01,  /* 000016D0    "PEDPT2p." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x50,0x48,  /* 000016D8    "\._GPEPH" */
+    0x30,0x30,0x14,0x2A,0x5F,0x53,0x54,0x41,  /* 000016E0    "00.*_STA" */
+    0x00,0x70,0x00,0x5C,0x2E,0x5F,0x47,0x50,  /* 000016E8    ".p.\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x89,  /* 000016F0    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 000016F8    "\._GPEDP" */
+    0x54,0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,  /* 00001700    "T2.\._GP" */
+    0x45,0x50,0x48,0x30,0x30,0x5B,0x82,0x4D,  /* 00001708    "EPH00[.M" */
+    0x0A,0x53,0x30,0x31,0x5F,0x08,0x5F,0x41,  /* 00001710    ".S01_._A" */
+    0x44,0x52,0x0C,0x00,0x00,0x01,0x00,0x08,  /* 00001718    "DR......" */
+    0x5F,0x53,0x55,0x4E,0x01,0x14,0x1F,0x5F,  /* 00001720    "_SUN..._" */
+    0x50,0x53,0x30,0x00,0x70,0x01,0x5C,0x2E,  /* 00001728    "PS0.p.\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 00001730    "_GPEDPT1" */
+    0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,0x50,  /* 00001738    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0x14,0x1F,0x5F,  /* 00001740    "EDPT2.._" */
+    0x50,0x53,0x33,0x00,0x70,0x01,0x5C,0x2E,  /* 00001748    "PS3.p.\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 00001750    "_GPEDPT1" */
+    0x70,0x0A,0x83,0x5C,0x2E,0x5F,0x47,0x50,  /* 00001758    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0x14,0x2B,0x5F,  /* 00001760    "EDPT2.+_" */
+    0x45,0x4A,0x30,0x01,0x70,0x01,0x5C,0x2E,  /* 00001768    "EJ0.p.\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 00001770    "_GPEDPT1" */
+    0x70,0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,  /* 00001778    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0x70,0x01,0x5C,  /* 00001780    "EDPT2p.\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x50,0x48,0x30,  /* 00001788    "._GPEPH0" */
+    0x31,0x14,0x2A,0x5F,0x53,0x54,0x41,0x00,  /* 00001790    "1.*_STA." */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001798    "p.\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 000017A0    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 000017A8    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000017B0    "2.\._GPE" */
+    0x50,0x48,0x30,0x31,0x5B,0x82,0x42,0x0B,  /* 000017B8    "PH01[.B." */
+    0x53,0x30,0x32,0x5F,0x08,0x5F,0x41,0x44,  /* 000017C0    "S02_._AD" */
+    0x52,0x0C,0x00,0x00,0x02,0x00,0x08,0x5F,  /* 000017C8    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x02,0x14,0x20,0x5F,  /* 000017D0    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x02,0x5C,  /* 000017D8    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 000017E0    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 000017E8    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 000017F0    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x02,  /* 000017F8    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00001800    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 00001808    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 00001810    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 00001818    ",_EJ0.p." */
+    0x02,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00001820    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 00001828    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00001830    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001838    "p.\._GPE" */
+    0x50,0x48,0x30,0x32,0x14,0x2B,0x5F,0x53,  /* 00001840    "PH02.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x02,0x5C,0x2E,  /* 00001848    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 00001850    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 00001858    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 00001860    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x30,0x32,  /* 00001868    "_GPEPH02" */
+    0x5B,0x82,0x42,0x0B,0x53,0x30,0x33,0x5F,  /* 00001870    "[.B.S03_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 00001878    "._ADR..." */
+    0x03,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 00001880    "..._SUN." */
+    0x03,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 00001888    ".. _PS0." */
+    0x70,0x0A,0x03,0x5C,0x2E,0x5F,0x47,0x50,  /* 00001890    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 00001898    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 000018A0    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 000018A8    "T2. _PS3" */
+    0x00,0x70,0x0A,0x03,0x5C,0x2E,0x5F,0x47,  /* 000018B0    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 000018B8    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 000018C0    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 000018C8    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x03,0x5C,0x2E,0x5F,  /* 000018D0    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 000018D8    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000018E0    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 000018E8    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x30,0x33,  /* 000018F0    "_GPEPH03" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 000018F8    ".+_STA.p" */
+    0x0A,0x03,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001900    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 00001908    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00001910    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001918    "2.\._GPE" */
+    0x50,0x48,0x30,0x33,0x5B,0x82,0x42,0x0B,  /* 00001920    "PH03[.B." */
+    0x53,0x30,0x34,0x5F,0x08,0x5F,0x41,0x44,  /* 00001928    "S04_._AD" */
+    0x52,0x0C,0x00,0x00,0x04,0x00,0x08,0x5F,  /* 00001930    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x04,0x14,0x20,0x5F,  /* 00001938    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x04,0x5C,  /* 00001940    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00001948    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 00001950    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 00001958    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x04,  /* 00001960    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00001968    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 00001970    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 00001978    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 00001980    ",_EJ0.p." */
+    0x04,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00001988    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 00001990    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00001998    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000019A0    "p.\._GPE" */
+    0x50,0x48,0x30,0x34,0x14,0x2B,0x5F,0x53,  /* 000019A8    "PH04.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x04,0x5C,0x2E,  /* 000019B0    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 000019B8    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 000019C0    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 000019C8    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x30,0x34,  /* 000019D0    "_GPEPH04" */
+    0x5B,0x82,0x42,0x0B,0x53,0x30,0x35,0x5F,  /* 000019D8    "[.B.S05_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 000019E0    "._ADR..." */
+    0x05,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 000019E8    "..._SUN." */
+    0x05,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 000019F0    ".. _PS0." */
+    0x70,0x0A,0x05,0x5C,0x2E,0x5F,0x47,0x50,  /* 000019F8    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 00001A00    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00001A08    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 00001A10    "T2. _PS3" */
+    0x00,0x70,0x0A,0x05,0x5C,0x2E,0x5F,0x47,  /* 00001A18    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 00001A20    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00001A28    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 00001A30    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x05,0x5C,0x2E,0x5F,  /* 00001A38    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 00001A40    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001A48    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 00001A50    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x30,0x35,  /* 00001A58    "_GPEPH05" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 00001A60    ".+_STA.p" */
+    0x0A,0x05,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001A68    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 00001A70    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00001A78    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001A80    "2.\._GPE" */
+    0x50,0x48,0x30,0x35,0x5B,0x82,0x42,0x0B,  /* 00001A88    "PH05[.B." */
+    0x53,0x30,0x36,0x5F,0x08,0x5F,0x41,0x44,  /* 00001A90    "S06_._AD" */
+    0x52,0x0C,0x00,0x00,0x06,0x00,0x08,0x5F,  /* 00001A98    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x06,0x14,0x20,0x5F,  /* 00001AA0    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x06,0x5C,  /* 00001AA8    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00001AB0    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 00001AB8    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 00001AC0    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x06,  /* 00001AC8    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00001AD0    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 00001AD8    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 00001AE0    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 00001AE8    ",_EJ0.p." */
+    0x06,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00001AF0    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 00001AF8    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00001B00    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001B08    "p.\._GPE" */
+    0x50,0x48,0x30,0x36,0x14,0x2B,0x5F,0x53,  /* 00001B10    "PH06.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x06,0x5C,0x2E,  /* 00001B18    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 00001B20    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 00001B28    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 00001B30    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x30,0x36,  /* 00001B38    "_GPEPH06" */
+    0x5B,0x82,0x42,0x0B,0x53,0x30,0x37,0x5F,  /* 00001B40    "[.B.S07_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 00001B48    "._ADR..." */
+    0x07,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 00001B50    "..._SUN." */
+    0x07,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 00001B58    ".. _PS0." */
+    0x70,0x0A,0x07,0x5C,0x2E,0x5F,0x47,0x50,  /* 00001B60    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 00001B68    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00001B70    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 00001B78    "T2. _PS3" */
+    0x00,0x70,0x0A,0x07,0x5C,0x2E,0x5F,0x47,  /* 00001B80    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 00001B88    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00001B90    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 00001B98    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x07,0x5C,0x2E,0x5F,  /* 00001BA0    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 00001BA8    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001BB0    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 00001BB8    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x30,0x37,  /* 00001BC0    "_GPEPH07" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 00001BC8    ".+_STA.p" */
+    0x0A,0x07,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001BD0    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 00001BD8    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00001BE0    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001BE8    "2.\._GPE" */
+    0x50,0x48,0x30,0x37,0x5B,0x82,0x42,0x0B,  /* 00001BF0    "PH07[.B." */
+    0x53,0x30,0x38,0x5F,0x08,0x5F,0x41,0x44,  /* 00001BF8    "S08_._AD" */
+    0x52,0x0C,0x00,0x00,0x08,0x00,0x08,0x5F,  /* 00001C00    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x08,0x14,0x20,0x5F,  /* 00001C08    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x08,0x5C,  /* 00001C10    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00001C18    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 00001C20    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 00001C28    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x08,  /* 00001C30    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00001C38    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 00001C40    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 00001C48    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 00001C50    ",_EJ0.p." */
+    0x08,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00001C58    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 00001C60    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00001C68    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001C70    "p.\._GPE" */
+    0x50,0x48,0x30,0x38,0x14,0x2B,0x5F,0x53,  /* 00001C78    "PH08.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x08,0x5C,0x2E,  /* 00001C80    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 00001C88    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 00001C90    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 00001C98    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x30,0x38,  /* 00001CA0    "_GPEPH08" */
+    0x5B,0x82,0x42,0x0B,0x53,0x30,0x39,0x5F,  /* 00001CA8    "[.B.S09_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 00001CB0    "._ADR..." */
+    0x09,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 00001CB8    "..._SUN." */
+    0x09,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 00001CC0    ".. _PS0." */
+    0x70,0x0A,0x09,0x5C,0x2E,0x5F,0x47,0x50,  /* 00001CC8    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 00001CD0    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00001CD8    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 00001CE0    "T2. _PS3" */
+    0x00,0x70,0x0A,0x09,0x5C,0x2E,0x5F,0x47,  /* 00001CE8    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 00001CF0    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00001CF8    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 00001D00    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x09,0x5C,0x2E,0x5F,  /* 00001D08    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 00001D10    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001D18    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 00001D20    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x30,0x39,  /* 00001D28    "_GPEPH09" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 00001D30    ".+_STA.p" */
+    0x0A,0x09,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001D38    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 00001D40    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00001D48    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001D50    "2.\._GPE" */
+    0x50,0x48,0x30,0x39,0x5B,0x82,0x42,0x0B,  /* 00001D58    "PH09[.B." */
+    0x53,0x30,0x41,0x5F,0x08,0x5F,0x41,0x44,  /* 00001D60    "S0A_._AD" */
+    0x52,0x0C,0x00,0x00,0x0A,0x00,0x08,0x5F,  /* 00001D68    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x0A,0x14,0x20,0x5F,  /* 00001D70    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x0A,0x5C,  /* 00001D78    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00001D80    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 00001D88    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 00001D90    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x0A,  /* 00001D98    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00001DA0    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 00001DA8    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 00001DB0    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 00001DB8    ",_EJ0.p." */
+    0x0A,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00001DC0    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 00001DC8    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00001DD0    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001DD8    "p.\._GPE" */
+    0x50,0x48,0x30,0x41,0x14,0x2B,0x5F,0x53,  /* 00001DE0    "PH0A.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x0A,0x5C,0x2E,  /* 00001DE8    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 00001DF0    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 00001DF8    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 00001E00    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x30,0x41,  /* 00001E08    "_GPEPH0A" */
+    0x5B,0x82,0x42,0x0B,0x53,0x30,0x42,0x5F,  /* 00001E10    "[.B.S0B_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 00001E18    "._ADR..." */
+    0x0B,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 00001E20    "..._SUN." */
+    0x0B,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 00001E28    ".. _PS0." */
+    0x70,0x0A,0x0B,0x5C,0x2E,0x5F,0x47,0x50,  /* 00001E30    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 00001E38    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00001E40    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 00001E48    "T2. _PS3" */
+    0x00,0x70,0x0A,0x0B,0x5C,0x2E,0x5F,0x47,  /* 00001E50    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 00001E58    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00001E60    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 00001E68    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x0B,0x5C,0x2E,0x5F,  /* 00001E70    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 00001E78    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001E80    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 00001E88    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x30,0x42,  /* 00001E90    "_GPEPH0B" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 00001E98    ".+_STA.p" */
+    0x0A,0x0B,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001EA0    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 00001EA8    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00001EB0    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001EB8    "2.\._GPE" */
+    0x50,0x48,0x30,0x42,0x5B,0x82,0x42,0x0B,  /* 00001EC0    "PH0B[.B." */
+    0x53,0x30,0x43,0x5F,0x08,0x5F,0x41,0x44,  /* 00001EC8    "S0C_._AD" */
+    0x52,0x0C,0x00,0x00,0x0C,0x00,0x08,0x5F,  /* 00001ED0    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x0C,0x14,0x20,0x5F,  /* 00001ED8    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x0C,0x5C,  /* 00001EE0    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00001EE8    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 00001EF0    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 00001EF8    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x0C,  /* 00001F00    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00001F08    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 00001F10    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 00001F18    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 00001F20    ",_EJ0.p." */
+    0x0C,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00001F28    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 00001F30    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00001F38    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001F40    "p.\._GPE" */
+    0x50,0x48,0x30,0x43,0x14,0x2B,0x5F,0x53,  /* 00001F48    "PH0C.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x0C,0x5C,0x2E,  /* 00001F50    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 00001F58    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 00001F60    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 00001F68    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x30,0x43,  /* 00001F70    "_GPEPH0C" */
+    0x5B,0x82,0x42,0x0B,0x53,0x30,0x44,0x5F,  /* 00001F78    "[.B.S0D_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 00001F80    "._ADR..." */
+    0x0D,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 00001F88    "..._SUN." */
+    0x0D,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 00001F90    ".. _PS0." */
+    0x70,0x0A,0x0D,0x5C,0x2E,0x5F,0x47,0x50,  /* 00001F98    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 00001FA0    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00001FA8    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 00001FB0    "T2. _PS3" */
+    0x00,0x70,0x0A,0x0D,0x5C,0x2E,0x5F,0x47,  /* 00001FB8    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 00001FC0    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00001FC8    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 00001FD0    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x0D,0x5C,0x2E,0x5F,  /* 00001FD8    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 00001FE0    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00001FE8    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 00001FF0    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x30,0x44,  /* 00001FF8    "_GPEPH0D" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 00002000    ".+_STA.p" */
+    0x0A,0x0D,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002008    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 00002010    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00002018    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002020    "2.\._GPE" */
+    0x50,0x48,0x30,0x44,0x5B,0x82,0x42,0x0B,  /* 00002028    "PH0D[.B." */
+    0x53,0x30,0x45,0x5F,0x08,0x5F,0x41,0x44,  /* 00002030    "S0E_._AD" */
+    0x52,0x0C,0x00,0x00,0x0E,0x00,0x08,0x5F,  /* 00002038    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x0E,0x14,0x20,0x5F,  /* 00002040    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x0E,0x5C,  /* 00002048    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00002050    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 00002058    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 00002060    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x0E,  /* 00002068    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00002070    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 00002078    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 00002080    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 00002088    ",_EJ0.p." */
+    0x0E,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00002090    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 00002098    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 000020A0    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000020A8    "p.\._GPE" */
+    0x50,0x48,0x30,0x45,0x14,0x2B,0x5F,0x53,  /* 000020B0    "PH0E.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x0E,0x5C,0x2E,  /* 000020B8    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 000020C0    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 000020C8    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 000020D0    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x30,0x45,  /* 000020D8    "_GPEPH0E" */
+    0x5B,0x82,0x42,0x0B,0x53,0x30,0x46,0x5F,  /* 000020E0    "[.B.S0F_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 000020E8    "._ADR..." */
+    0x0F,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 000020F0    "..._SUN." */
+    0x0F,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 000020F8    ".. _PS0." */
+    0x70,0x0A,0x0F,0x5C,0x2E,0x5F,0x47,0x50,  /* 00002100    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 00002108    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00002110    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 00002118    "T2. _PS3" */
+    0x00,0x70,0x0A,0x0F,0x5C,0x2E,0x5F,0x47,  /* 00002120    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 00002128    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00002130    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 00002138    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x0F,0x5C,0x2E,0x5F,  /* 00002140    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 00002148    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002150    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 00002158    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x30,0x46,  /* 00002160    "_GPEPH0F" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 00002168    ".+_STA.p" */
+    0x0A,0x0F,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002170    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 00002178    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00002180    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002188    "2.\._GPE" */
+    0x50,0x48,0x30,0x46,0x5B,0x82,0x42,0x0B,  /* 00002190    "PH0F[.B." */
+    0x53,0x31,0x30,0x5F,0x08,0x5F,0x41,0x44,  /* 00002198    "S10_._AD" */
+    0x52,0x0C,0x00,0x00,0x10,0x00,0x08,0x5F,  /* 000021A0    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x10,0x14,0x20,0x5F,  /* 000021A8    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x10,0x5C,  /* 000021B0    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 000021B8    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 000021C0    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 000021C8    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x10,  /* 000021D0    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 000021D8    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 000021E0    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 000021E8    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 000021F0    ",_EJ0.p." */
+    0x10,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 000021F8    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 00002200    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00002208    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002210    "p.\._GPE" */
+    0x50,0x48,0x31,0x30,0x14,0x2B,0x5F,0x53,  /* 00002218    "PH10.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x10,0x5C,0x2E,  /* 00002220    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 00002228    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 00002230    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 00002238    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x30,  /* 00002240    "_GPEPH10" */
+    0x5B,0x82,0x42,0x0B,0x53,0x31,0x31,0x5F,  /* 00002248    "[.B.S11_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 00002250    "._ADR..." */
+    0x11,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 00002258    "..._SUN." */
+    0x11,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 00002260    ".. _PS0." */
+    0x70,0x0A,0x11,0x5C,0x2E,0x5F,0x47,0x50,  /* 00002268    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 00002270    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00002278    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 00002280    "T2. _PS3" */
+    0x00,0x70,0x0A,0x11,0x5C,0x2E,0x5F,0x47,  /* 00002288    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 00002290    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00002298    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 000022A0    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x11,0x5C,0x2E,0x5F,  /* 000022A8    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 000022B0    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000022B8    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 000022C0    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x31,  /* 000022C8    "_GPEPH11" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 000022D0    ".+_STA.p" */
+    0x0A,0x11,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000022D8    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 000022E0    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 000022E8    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000022F0    "2.\._GPE" */
+    0x50,0x48,0x31,0x31,0x5B,0x82,0x42,0x0B,  /* 000022F8    "PH11[.B." */
+    0x53,0x31,0x32,0x5F,0x08,0x5F,0x41,0x44,  /* 00002300    "S12_._AD" */
+    0x52,0x0C,0x00,0x00,0x12,0x00,0x08,0x5F,  /* 00002308    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x12,0x14,0x20,0x5F,  /* 00002310    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x12,0x5C,  /* 00002318    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00002320    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 00002328    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 00002330    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x12,  /* 00002338    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00002340    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 00002348    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 00002350    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 00002358    ",_EJ0.p." */
+    0x12,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00002360    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 00002368    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00002370    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002378    "p.\._GPE" */
+    0x50,0x48,0x31,0x32,0x14,0x2B,0x5F,0x53,  /* 00002380    "PH12.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x12,0x5C,0x2E,  /* 00002388    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 00002390    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 00002398    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 000023A0    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x32,  /* 000023A8    "_GPEPH12" */
+    0x5B,0x82,0x42,0x0B,0x53,0x31,0x33,0x5F,  /* 000023B0    "[.B.S13_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 000023B8    "._ADR..." */
+    0x13,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 000023C0    "..._SUN." */
+    0x13,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 000023C8    ".. _PS0." */
+    0x70,0x0A,0x13,0x5C,0x2E,0x5F,0x47,0x50,  /* 000023D0    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 000023D8    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 000023E0    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 000023E8    "T2. _PS3" */
+    0x00,0x70,0x0A,0x13,0x5C,0x2E,0x5F,0x47,  /* 000023F0    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 000023F8    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00002400    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 00002408    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x13,0x5C,0x2E,0x5F,  /* 00002410    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 00002418    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002420    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 00002428    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x33,  /* 00002430    "_GPEPH13" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 00002438    ".+_STA.p" */
+    0x0A,0x13,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002440    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 00002448    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00002450    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002458    "2.\._GPE" */
+    0x50,0x48,0x31,0x33,0x5B,0x82,0x42,0x0B,  /* 00002460    "PH13[.B." */
+    0x53,0x31,0x34,0x5F,0x08,0x5F,0x41,0x44,  /* 00002468    "S14_._AD" */
+    0x52,0x0C,0x00,0x00,0x14,0x00,0x08,0x5F,  /* 00002470    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x14,0x14,0x20,0x5F,  /* 00002478    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x14,0x5C,  /* 00002480    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00002488    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 00002490    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 00002498    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x14,  /* 000024A0    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 000024A8    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 000024B0    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 000024B8    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 000024C0    ",_EJ0.p." */
+    0x14,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 000024C8    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 000024D0    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 000024D8    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000024E0    "p.\._GPE" */
+    0x50,0x48,0x31,0x34,0x14,0x2B,0x5F,0x53,  /* 000024E8    "PH14.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x14,0x5C,0x2E,  /* 000024F0    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 000024F8    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 00002500    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 00002508    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x34,  /* 00002510    "_GPEPH14" */
+    0x5B,0x82,0x42,0x0B,0x53,0x31,0x35,0x5F,  /* 00002518    "[.B.S15_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 00002520    "._ADR..." */
+    0x15,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 00002528    "..._SUN." */
+    0x15,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 00002530    ".. _PS0." */
+    0x70,0x0A,0x15,0x5C,0x2E,0x5F,0x47,0x50,  /* 00002538    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 00002540    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00002548    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 00002550    "T2. _PS3" */
+    0x00,0x70,0x0A,0x15,0x5C,0x2E,0x5F,0x47,  /* 00002558    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 00002560    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00002568    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 00002570    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x15,0x5C,0x2E,0x5F,  /* 00002578    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 00002580    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002588    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 00002590    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x35,  /* 00002598    "_GPEPH15" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 000025A0    ".+_STA.p" */
+    0x0A,0x15,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000025A8    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 000025B0    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 000025B8    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000025C0    "2.\._GPE" */
+    0x50,0x48,0x31,0x35,0x5B,0x82,0x42,0x0B,  /* 000025C8    "PH15[.B." */
+    0x53,0x31,0x36,0x5F,0x08,0x5F,0x41,0x44,  /* 000025D0    "S16_._AD" */
+    0x52,0x0C,0x00,0x00,0x16,0x00,0x08,0x5F,  /* 000025D8    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x16,0x14,0x20,0x5F,  /* 000025E0    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x16,0x5C,  /* 000025E8    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 000025F0    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 000025F8    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 00002600    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x16,  /* 00002608    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00002610    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 00002618    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 00002620    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 00002628    ",_EJ0.p." */
+    0x16,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00002630    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 00002638    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00002640    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002648    "p.\._GPE" */
+    0x50,0x48,0x31,0x36,0x14,0x2B,0x5F,0x53,  /* 00002650    "PH16.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x16,0x5C,0x2E,  /* 00002658    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 00002660    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 00002668    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 00002670    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x36,  /* 00002678    "_GPEPH16" */
+    0x5B,0x82,0x42,0x0B,0x53,0x31,0x37,0x5F,  /* 00002680    "[.B.S17_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 00002688    "._ADR..." */
+    0x17,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 00002690    "..._SUN." */
+    0x17,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 00002698    ".. _PS0." */
+    0x70,0x0A,0x17,0x5C,0x2E,0x5F,0x47,0x50,  /* 000026A0    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 000026A8    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 000026B0    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 000026B8    "T2. _PS3" */
+    0x00,0x70,0x0A,0x17,0x5C,0x2E,0x5F,0x47,  /* 000026C0    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 000026C8    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 000026D0    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 000026D8    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x17,0x5C,0x2E,0x5F,  /* 000026E0    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 000026E8    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000026F0    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 000026F8    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x37,  /* 00002700    "_GPEPH17" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 00002708    ".+_STA.p" */
+    0x0A,0x17,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002710    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 00002718    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00002720    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002728    "2.\._GPE" */
+    0x50,0x48,0x31,0x37,0x5B,0x82,0x42,0x0B,  /* 00002730    "PH17[.B." */
+    0x53,0x31,0x38,0x5F,0x08,0x5F,0x41,0x44,  /* 00002738    "S18_._AD" */
+    0x52,0x0C,0x00,0x00,0x18,0x00,0x08,0x5F,  /* 00002740    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x18,0x14,0x20,0x5F,  /* 00002748    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x18,0x5C,  /* 00002750    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00002758    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 00002760    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 00002768    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x18,  /* 00002770    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00002778    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 00002780    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 00002788    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 00002790    ",_EJ0.p." */
+    0x18,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00002798    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 000027A0    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 000027A8    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000027B0    "p.\._GPE" */
+    0x50,0x48,0x31,0x38,0x14,0x2B,0x5F,0x53,  /* 000027B8    "PH18.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x18,0x5C,0x2E,  /* 000027C0    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 000027C8    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 000027D0    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 000027D8    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x38,  /* 000027E0    "_GPEPH18" */
+    0x5B,0x82,0x42,0x0B,0x53,0x31,0x39,0x5F,  /* 000027E8    "[.B.S19_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 000027F0    "._ADR..." */
+    0x19,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 000027F8    "..._SUN." */
+    0x19,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 00002800    ".. _PS0." */
+    0x70,0x0A,0x19,0x5C,0x2E,0x5F,0x47,0x50,  /* 00002808    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 00002810    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00002818    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 00002820    "T2. _PS3" */
+    0x00,0x70,0x0A,0x19,0x5C,0x2E,0x5F,0x47,  /* 00002828    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 00002830    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00002838    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 00002840    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x19,0x5C,0x2E,0x5F,  /* 00002848    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 00002850    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002858    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 00002860    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x39,  /* 00002868    "_GPEPH19" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 00002870    ".+_STA.p" */
+    0x0A,0x19,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002878    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 00002880    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00002888    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002890    "2.\._GPE" */
+    0x50,0x48,0x31,0x39,0x5B,0x82,0x42,0x0B,  /* 00002898    "PH19[.B." */
+    0x53,0x31,0x41,0x5F,0x08,0x5F,0x41,0x44,  /* 000028A0    "S1A_._AD" */
+    0x52,0x0C,0x00,0x00,0x1A,0x00,0x08,0x5F,  /* 000028A8    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x1A,0x14,0x20,0x5F,  /* 000028B0    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x1A,0x5C,  /* 000028B8    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 000028C0    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 000028C8    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 000028D0    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x1A,  /* 000028D8    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 000028E0    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 000028E8    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 000028F0    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 000028F8    ",_EJ0.p." */
+    0x1A,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00002900    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 00002908    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00002910    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002918    "p.\._GPE" */
+    0x50,0x48,0x31,0x41,0x14,0x2B,0x5F,0x53,  /* 00002920    "PH1A.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x1A,0x5C,0x2E,  /* 00002928    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 00002930    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 00002938    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 00002940    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x41,  /* 00002948    "_GPEPH1A" */
+    0x5B,0x82,0x42,0x0B,0x53,0x31,0x42,0x5F,  /* 00002950    "[.B.S1B_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 00002958    "._ADR..." */
+    0x1B,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 00002960    "..._SUN." */
+    0x1B,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 00002968    ".. _PS0." */
+    0x70,0x0A,0x1B,0x5C,0x2E,0x5F,0x47,0x50,  /* 00002970    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 00002978    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00002980    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 00002988    "T2. _PS3" */
+    0x00,0x70,0x0A,0x1B,0x5C,0x2E,0x5F,0x47,  /* 00002990    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 00002998    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 000029A0    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 000029A8    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x1B,0x5C,0x2E,0x5F,  /* 000029B0    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 000029B8    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000029C0    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 000029C8    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x42,  /* 000029D0    "_GPEPH1B" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 000029D8    ".+_STA.p" */
+    0x0A,0x1B,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000029E0    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 000029E8    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 000029F0    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 000029F8    "2.\._GPE" */
+    0x50,0x48,0x31,0x42,0x5B,0x82,0x42,0x0B,  /* 00002A00    "PH1B[.B." */
+    0x53,0x31,0x43,0x5F,0x08,0x5F,0x41,0x44,  /* 00002A08    "S1C_._AD" */
+    0x52,0x0C,0x00,0x00,0x1C,0x00,0x08,0x5F,  /* 00002A10    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x1C,0x14,0x20,0x5F,  /* 00002A18    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x1C,0x5C,  /* 00002A20    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00002A28    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 00002A30    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 00002A38    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x1C,  /* 00002A40    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00002A48    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 00002A50    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 00002A58    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 00002A60    ",_EJ0.p." */
+    0x1C,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00002A68    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 00002A70    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00002A78    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002A80    "p.\._GPE" */
+    0x50,0x48,0x31,0x43,0x14,0x2B,0x5F,0x53,  /* 00002A88    "PH1C.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x1C,0x5C,0x2E,  /* 00002A90    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 00002A98    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 00002AA0    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 00002AA8    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x43,  /* 00002AB0    "_GPEPH1C" */
+    0x5B,0x82,0x42,0x0B,0x53,0x31,0x44,0x5F,  /* 00002AB8    "[.B.S1D_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 00002AC0    "._ADR..." */
+    0x1D,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 00002AC8    "..._SUN." */
+    0x1D,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 00002AD0    ".. _PS0." */
+    0x70,0x0A,0x1D,0x5C,0x2E,0x5F,0x47,0x50,  /* 00002AD8    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 00002AE0    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00002AE8    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 00002AF0    "T2. _PS3" */
+    0x00,0x70,0x0A,0x1D,0x5C,0x2E,0x5F,0x47,  /* 00002AF8    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 00002B00    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00002B08    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 00002B10    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x1D,0x5C,0x2E,0x5F,  /* 00002B18    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 00002B20    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002B28    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 00002B30    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x44,  /* 00002B38    "_GPEPH1D" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 00002B40    ".+_STA.p" */
+    0x0A,0x1D,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002B48    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 00002B50    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00002B58    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002B60    "2.\._GPE" */
+    0x50,0x48,0x31,0x44,0x5B,0x82,0x42,0x0B,  /* 00002B68    "PH1D[.B." */
+    0x53,0x31,0x45,0x5F,0x08,0x5F,0x41,0x44,  /* 00002B70    "S1E_._AD" */
+    0x52,0x0C,0x00,0x00,0x1E,0x00,0x08,0x5F,  /* 00002B78    "R......_" */
+    0x53,0x55,0x4E,0x0A,0x1E,0x14,0x20,0x5F,  /* 00002B80    "SUN... _" */
+    0x50,0x53,0x30,0x00,0x70,0x0A,0x1E,0x5C,  /* 00002B88    "PS0.p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00002B90    "._GPEDPT" */
+    0x31,0x70,0x0A,0x80,0x5C,0x2E,0x5F,0x47,  /* 00002B98    "1p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x32,0x14,0x20,  /* 00002BA0    "PEDPT2. " */
+    0x5F,0x50,0x53,0x33,0x00,0x70,0x0A,0x1E,  /* 00002BA8    "_PS3.p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00002BB0    "\._GPEDP" */
+    0x54,0x31,0x70,0x0A,0x83,0x5C,0x2E,0x5F,  /* 00002BB8    "T1p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x32,0x14,  /* 00002BC0    "GPEDPT2." */
+    0x2C,0x5F,0x45,0x4A,0x30,0x01,0x70,0x0A,  /* 00002BC8    ",_EJ0.p." */
+    0x1E,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00002BD0    ".\._GPED" */
+    0x50,0x54,0x31,0x70,0x0A,0x88,0x5C,0x2E,  /* 00002BD8    "PT1p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x32,  /* 00002BE0    "_GPEDPT2" */
+    0x70,0x01,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002BE8    "p.\._GPE" */
+    0x50,0x48,0x31,0x45,0x14,0x2B,0x5F,0x53,  /* 00002BF0    "PH1E.+_S" */
+    0x54,0x41,0x00,0x70,0x0A,0x1E,0x5C,0x2E,  /* 00002BF8    "TA.p..\." */
+    0x5F,0x47,0x50,0x45,0x44,0x50,0x54,0x31,  /* 00002C00    "_GPEDPT1" */
+    0x70,0x0A,0x89,0x5C,0x2E,0x5F,0x47,0x50,  /* 00002C08    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x32,0xA4,0x5C,0x2E,  /* 00002C10    "EDPT2.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x45,  /* 00002C18    "_GPEPH1E" */
+    0x5B,0x82,0x42,0x0B,0x53,0x31,0x46,0x5F,  /* 00002C20    "[.B.S1F_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 00002C28    "._ADR..." */
+    0x1F,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 00002C30    "..._SUN." */
+    0x1F,0x14,0x20,0x5F,0x50,0x53,0x30,0x00,  /* 00002C38    ".. _PS0." */
+    0x70,0x0A,0x1F,0x5C,0x2E,0x5F,0x47,0x50,  /* 00002C40    "p..\._GP" */
+    0x45,0x44,0x50,0x54,0x31,0x70,0x0A,0x80,  /* 00002C48    "EDPT1p.." */
+    0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,  /* 00002C50    "\._GPEDP" */
+    0x54,0x32,0x14,0x20,0x5F,0x50,0x53,0x33,  /* 00002C58    "T2. _PS3" */
+    0x00,0x70,0x0A,0x1F,0x5C,0x2E,0x5F,0x47,  /* 00002C60    ".p..\._G" */
+    0x50,0x45,0x44,0x50,0x54,0x31,0x70,0x0A,  /* 00002C68    "PEDPT1p." */
+    0x83,0x5C,0x2E,0x5F,0x47,0x50,0x45,0x44,  /* 00002C70    ".\._GPED" */
+    0x50,0x54,0x32,0x14,0x2C,0x5F,0x45,0x4A,  /* 00002C78    "PT2.,_EJ" */
+    0x30,0x01,0x70,0x0A,0x1F,0x5C,0x2E,0x5F,  /* 00002C80    "0.p..\._" */
+    0x47,0x50,0x45,0x44,0x50,0x54,0x31,0x70,  /* 00002C88    "GPEDPT1p" */
+    0x0A,0x88,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002C90    "..\._GPE" */
+    0x44,0x50,0x54,0x32,0x70,0x01,0x5C,0x2E,  /* 00002C98    "DPT2p.\." */
+    0x5F,0x47,0x50,0x45,0x50,0x48,0x31,0x46,  /* 00002CA0    "_GPEPH1F" */
+    0x14,0x2B,0x5F,0x53,0x54,0x41,0x00,0x70,  /* 00002CA8    ".+_STA.p" */
+    0x0A,0x1F,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002CB0    "..\._GPE" */
+    0x44,0x50,0x54,0x31,0x70,0x0A,0x89,0x5C,  /* 00002CB8    "DPT1p..\" */
+    0x2E,0x5F,0x47,0x50,0x45,0x44,0x50,0x54,  /* 00002CC0    "._GPEDPT" */
+    0x32,0xA4,0x5C,0x2E,0x5F,0x47,0x50,0x45,  /* 00002CC8    "2.\._GPE" */
+    0x50,0x48,0x31,0x46,0x10,0x4D,0x52,0x5F,  /* 00002CD0    "PH1F.MR_" */
+    0x47,0x50,0x45,0x5B,0x80,0x50,0x48,0x50,  /* 00002CD8    "GPE[.PHP" */
+    0x5F,0x01,0x0B,0xC0,0x10,0x0A,0x22,0x5B,  /* 00002CE0    "_....."[" */
+    0x81,0x41,0x0B,0x50,0x48,0x50,0x5F,0x01,  /* 00002CE8    ".A.PHP_." */
+    0x50,0x53,0x54,0x41,0x08,0x50,0x53,0x54,  /* 00002CF0    "PSTA.PST" */
+    0x42,0x08,0x50,0x48,0x30,0x30,0x08,0x50,  /* 00002CF8    "B.PH00.P" */
+    0x48,0x30,0x31,0x08,0x50,0x48,0x30,0x32,  /* 00002D00    "H01.PH02" */
+    0x08,0x50,0x48,0x30,0x33,0x08,0x50,0x48,  /* 00002D08    ".PH03.PH" */
+    0x30,0x34,0x08,0x50,0x48,0x30,0x35,0x08,  /* 00002D10    "04.PH05." */
+    0x50,0x48,0x30,0x36,0x08,0x50,0x48,0x30,  /* 00002D18    "PH06.PH0" */
+    0x37,0x08,0x50,0x48,0x30,0x38,0x08,0x50,  /* 00002D20    "7.PH08.P" */
+    0x48,0x30,0x39,0x08,0x50,0x48,0x30,0x41,  /* 00002D28    "H09.PH0A" */
+    0x08,0x50,0x48,0x30,0x42,0x08,0x50,0x48,  /* 00002D30    ".PH0B.PH" */
+    0x30,0x43,0x08,0x50,0x48,0x30,0x44,0x08,  /* 00002D38    "0C.PH0D." */
+    0x50,0x48,0x30,0x45,0x08,0x50,0x48,0x30,  /* 00002D40    "PH0E.PH0" */
+    0x46,0x08,0x50,0x48,0x31,0x30,0x08,0x50,  /* 00002D48    "F.PH10.P" */
+    0x48,0x31,0x31,0x08,0x50,0x48,0x31,0x32,  /* 00002D50    "H11.PH12" */
+    0x08,0x50,0x48,0x31,0x33,0x08,0x50,0x48,  /* 00002D58    ".PH13.PH" */
+    0x31,0x34,0x08,0x50,0x48,0x31,0x35,0x08,  /* 00002D60    "14.PH15." */
+    0x50,0x48,0x31,0x36,0x08,0x50,0x48,0x31,  /* 00002D68    "PH16.PH1" */
+    0x37,0x08,0x50,0x48,0x31,0x38,0x08,0x50,  /* 00002D70    "7.PH18.P" */
+    0x48,0x31,0x39,0x08,0x50,0x48,0x31,0x41,  /* 00002D78    "H19.PH1A" */
+    0x08,0x50,0x48,0x31,0x42,0x08,0x50,0x48,  /* 00002D80    ".PH1B.PH" */
+    0x31,0x43,0x08,0x50,0x48,0x31,0x44,0x08,  /* 00002D88    "1C.PH1D." */
+    0x50,0x48,0x31,0x45,0x08,0x50,0x48,0x31,  /* 00002D90    "PH1E.PH1" */
+    0x46,0x08,0x5B,0x80,0x44,0x47,0x31,0x5F,  /* 00002D98    "F.[.DG1_" */
+    0x01,0x0B,0x44,0xB0,0x0A,0x04,0x5B,0x81,  /* 00002DA0    "..D...[." */
+    0x10,0x44,0x47,0x31,0x5F,0x01,0x44,0x50,  /* 00002DA8    ".DG1_.DP" */
+    0x54,0x31,0x08,0x44,0x50,0x54,0x32,0x08,  /* 00002DB0    "T1.DPT2." */
+    0x14,0x49,0x44,0x5F,0x4C,0x30,0x33,0x08,  /* 00002DB8    ".ID_L03." */
+    0x08,0x5F,0x54,0x5F,0x30,0x00,0x08,0x53,  /* 00002DC0    "._T_0..S" */
+    0x4C,0x54,0x5F,0x00,0x08,0x45,0x56,0x54,  /* 00002DC8    "LT_..EVT" */
+    0x5F,0x00,0x70,0x50,0x53,0x54,0x41,0x61,  /* 00002DD0    "_.pPSTAa" */
+    0x7B,0x61,0x0A,0x0F,0x45,0x56,0x54,0x5F,  /* 00002DD8    "{a..EVT_" */
+    0x70,0x50,0x53,0x54,0x42,0x61,0x7B,0x61,  /* 00002DE0    "pPSTBa{a" */
+    0x0A,0xFF,0x53,0x4C,0x54,0x5F,0x70,0x53,  /* 00002DE8    "..SLT_pS" */
+    0x4C,0x54,0x5F,0x44,0x50,0x54,0x31,0x70,  /* 00002DF0    "LT_DPT1p" */
+    0x45,0x56,0x54,0x5F,0x44,0x50,0x54,0x32,  /* 00002DF8    "EVT_DPT2" */
+    0x70,0x53,0x4C,0x54,0x5F,0x5F,0x54,0x5F,  /* 00002E00    "pSLT__T_" */
+    0x30,0xA0,0x1B,0x93,0x5F,0x54,0x5F,0x30,  /* 00002E08    "0..._T_0" */
+    0x00,0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,  /* 00002E10    "..\/._SB" */
+    0x5F,0x50,0x43,0x49,0x30,0x53,0x30,0x30,  /* 00002E18    "_PCI0S00" */
+    0x5F,0x45,0x56,0x54,0x5F,0xA1,0x4C,0x3D,  /* 00002E20    "_EVT_.L=" */
+    0xA0,0x1B,0x93,0x5F,0x54,0x5F,0x30,0x01,  /* 00002E28    "..._T_0." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002E30    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x31,0x5F,  /* 00002E38    "PCI0S01_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x3B,0xA0,  /* 00002E40    "EVT_.M;." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x02,  /* 00002E48    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002E50    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x32,0x5F,  /* 00002E58    "PCI0S02_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x39,0xA0,  /* 00002E60    "EVT_.M9." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x03,  /* 00002E68    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002E70    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x33,0x5F,  /* 00002E78    "PCI0S03_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x37,0xA0,  /* 00002E80    "EVT_.M7." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x04,  /* 00002E88    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002E90    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x34,0x5F,  /* 00002E98    "PCI0S04_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x35,0xA0,  /* 00002EA0    "EVT_.M5." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x05,  /* 00002EA8    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002EB0    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x35,0x5F,  /* 00002EB8    "PCI0S05_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x33,0xA0,  /* 00002EC0    "EVT_.M3." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x06,  /* 00002EC8    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002ED0    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x36,0x5F,  /* 00002ED8    "PCI0S06_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x31,0xA0,  /* 00002EE0    "EVT_.M1." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x07,  /* 00002EE8    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002EF0    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x37,0x5F,  /* 00002EF8    "PCI0S07_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x2F,0xA0,  /* 00002F00    "EVT_.M/." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x08,  /* 00002F08    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002F10    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x38,0x5F,  /* 00002F18    "PCI0S08_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x2D,0xA0,  /* 00002F20    "EVT_.M-." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x09,  /* 00002F28    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002F30    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x39,0x5F,  /* 00002F38    "PCI0S09_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x2B,0xA0,  /* 00002F40    "EVT_.M+." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x0A,  /* 00002F48    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002F50    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x41,0x5F,  /* 00002F58    "PCI0S0A_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x29,0xA0,  /* 00002F60    "EVT_.M)." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x0B,  /* 00002F68    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002F70    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x42,0x5F,  /* 00002F78    "PCI0S0B_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x27,0xA0,  /* 00002F80    "EVT_.M'." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x0C,  /* 00002F88    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002F90    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x43,0x5F,  /* 00002F98    "PCI0S0C_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x25,0xA0,  /* 00002FA0    "EVT_.M%." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x0D,  /* 00002FA8    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002FB0    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x44,0x5F,  /* 00002FB8    "PCI0S0D_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x23,0xA0,  /* 00002FC0    "EVT_.M#." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x0E,  /* 00002FC8    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002FD0    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x45,0x5F,  /* 00002FD8    "PCI0S0E_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x21,0xA0,  /* 00002FE0    "EVT_.M!." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x0F,  /* 00002FE8    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00002FF0    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x30,0x46,0x5F,  /* 00002FF8    "PCI0S0F_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x1F,0xA0,  /* 00003000    "EVT_.M.." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x10,  /* 00003008    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00003010    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x31,0x30,0x5F,  /* 00003018    "PCI0S10_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x1D,0xA0,  /* 00003020    "EVT_.M.." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x11,  /* 00003028    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00003030    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x31,0x31,0x5F,  /* 00003038    "PCI0S11_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x1B,0xA0,  /* 00003040    "EVT_.M.." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x12,  /* 00003048    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00003050    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x31,0x32,0x5F,  /* 00003058    "PCI0S12_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x19,0xA0,  /* 00003060    "EVT_.M.." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x13,  /* 00003068    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00003070    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x31,0x33,0x5F,  /* 00003078    "PCI0S13_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x17,0xA0,  /* 00003080    "EVT_.M.." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x14,  /* 00003088    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00003090    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x31,0x34,0x5F,  /* 00003098    "PCI0S14_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x15,0xA0,  /* 000030A0    "EVT_.M.." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x15,  /* 000030A8    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 000030B0    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x31,0x35,0x5F,  /* 000030B8    "PCI0S15_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x13,0xA0,  /* 000030C0    "EVT_.M.." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x16,  /* 000030C8    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 000030D0    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x31,0x36,0x5F,  /* 000030D8    "PCI0S16_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x11,0xA0,  /* 000030E0    "EVT_.M.." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x17,  /* 000030E8    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 000030F0    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x31,0x37,0x5F,  /* 000030F8    "PCI0S17_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x0F,0xA0,  /* 00003100    "EVT_.M.." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x18,  /* 00003108    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00003110    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x31,0x38,0x5F,  /* 00003118    "PCI0S18_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x0D,0xA0,  /* 00003120    "EVT_.M.." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x19,  /* 00003128    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00003130    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x31,0x39,0x5F,  /* 00003138    "PCI0S19_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x0B,0xA0,  /* 00003140    "EVT_.M.." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x1A,  /* 00003148    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00003150    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x31,0x41,0x5F,  /* 00003158    "PCI0S1A_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x09,0xA0,  /* 00003160    "EVT_.M.." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x1B,  /* 00003168    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00003170    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x31,0x42,0x5F,  /* 00003178    "PCI0S1B_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x07,0xA0,  /* 00003180    "EVT_.M.." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x1C,  /* 00003188    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 00003190    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x31,0x43,0x5F,  /* 00003198    "PCI0S1C_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x4D,0x05,0xA0,  /* 000031A0    "EVT_.M.." */
+    0x1C,0x93,0x5F,0x54,0x5F,0x30,0x0A,0x1D,  /* 000031A8    ".._T_0.." */
+    0x86,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 000031B0    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x53,0x31,0x44,0x5F,  /* 000031B8    "PCI0S1D_" */
+    0x45,0x56,0x54,0x5F,0xA1,0x3D,0xA0,0x1C,  /* 000031C0    "EVT_.=.." */
+    0x93,0x5F,0x54,0x5F,0x30,0x0A,0x1E,0x86,  /* 000031C8    "._T_0..." */
+    0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,0x50,  /* 000031D0    "\/._SB_P" */
+    0x43,0x49,0x30,0x53,0x31,0x45,0x5F,0x45,  /* 000031D8    "CI0S1E_E" */
+    0x56,0x54,0x5F,0xA1,0x1E,0xA0,0x1C,0x93,  /* 000031E0    "VT_....." */
+    0x5F,0x54,0x5F,0x30,0x0A,0x1F,0x86,0x5C,  /* 000031E8    "_T_0...\" */
+    0x2F,0x03,0x5F,0x53,0x42,0x5F,0x50,0x43,  /* 000031F0    "/._SB_PC" */
+    0x49,0x30,0x53,0x31,0x46,0x5F,0x45,0x56,  /* 000031F8    "I0S1F_EV" */
     0x54,0x5F,
 };
 int DsdtLen=sizeof(AmlCode);
index 8a93218c1423a5a991eebabaa2e99c5b1f87c348..ab544e15fdf3510c8920ec8fc6c224b37cba4213 100644 (file)
@@ -67,7 +67,7 @@ struct acpi_20_fadt Fadt = {
 
     .p_lvl2_lat = 0x0fff, /* >100,  means we do not support C2 state */
     .p_lvl3_lat = 0x0fff, /* >1000, means we do not support C3 state */
-    .iapc_boot_arch = ACPI_LEGACY_DEVICES | ACPI_8042,
+    .iapc_boot_arch = ACPI_8042,
     .flags = (ACPI_PROC_C1 | ACPI_SLP_BUTTON |
               ACPI_WBINVD | ACPI_PWR_BUTTON |
               ACPI_FIX_RTC | ACPI_TMR_VAL_EXT),
index 5dc2c7f8077e1b206a274897771b6f6faf92405f..ae23b3e159a5f8e371b15f25c1a67a6b9c2bbca1 100644 (file)
@@ -88,11 +88,25 @@ void cacheattr_init(void)
     nr_var_ranges = (uint8_t)mtrr_cap;
     if ( nr_var_ranges != 0 )
     {
-        /* A single UC range covering PCI space. */
-        wrmsr(MSR_MTRRphysBase(0), PCI_MEMBASE);
-        wrmsr(MSR_MTRRphysMask(0),
-              ((uint64_t)(int32_t)PCI_MEMBASE & addr_mask) | (1u << 11));
-        printf("var MTRRs ... ");
+        unsigned long base = pci_mem_start, size;
+        int i;
+
+        for ( i = 0; (base != pci_mem_end) && (i < nr_var_ranges); i++ )
+        {
+            size = PAGE_SIZE;
+            while ( !(base & size) )
+                size <<= 1;
+            while ( ((base + size) < base) || ((base + size) > pci_mem_end) )
+                size >>= 1;
+
+            wrmsr(MSR_MTRRphysBase(i), base);
+            wrmsr(MSR_MTRRphysMask(i),
+                  (~(uint64_t)(size-1) & addr_mask) | (1u << 11));
+
+            base += size;
+        }
+
+        printf("var MTRRs [%d/%d] ... ", i, nr_var_ranges);
     }
 
     wrmsr(MSR_MTRRdefType, mtrr_def);
index 32011cd5a1a1bc6aaf1473ed2d24c88cd459b14f..aa7479a8d32f7d1618f33f33f808ee481a020558 100644 (file)
@@ -1,6 +1,9 @@
 #ifndef __HVMLOADER_CONFIG_H__
 #define __HVMLOADER_CONFIG_H__
 
+#define PAGE_SHIFT 12
+#define PAGE_SIZE  (1ul << PAGE_SHIFT)
+
 #define IOAPIC_BASE_ADDRESS 0xfec00000
 #define IOAPIC_ID           0x01
 #define IOAPIC_VERSION      0x11
 #define PCI_ISA_DEVFN       0x08    /* dev 1, fn 0 */
 #define PCI_ISA_IRQ_MASK    0x0c20U /* ISA IRQs 5,10,11 are PCI connected */
 
-#define PCI_MEMBASE         0xf0000000
-#define PCI_MEMSIZE         0x0c000000
+/* MMIO hole: Hardcoded defaults, which can be dynamically expanded. */
+#define PCI_MEM_START       0xf0000000
+#define PCI_MEM_END         0xfc000000
+extern unsigned long pci_mem_start, pci_mem_end;
+
+/* We reserve 16MB for special BIOS mappings, etc. */
+#define RESERVED_MEMBASE    0xfc000000
+#define RESERVED_MEMSIZE    0x01000000
 
 #define ROMBIOS_SEG            0xF000
 #define ROMBIOS_BEGIN          0x000F0000
 #define ROMBIOS_END            (ROMBIOS_BEGIN + ROMBIOS_SIZE)
 
 /* Memory map. */
+#define SCRATCH_PHYSICAL_ADDRESS      0x00010000
 #define HYPERCALL_PHYSICAL_ADDRESS    0x00080000
 #define VGABIOS_PHYSICAL_ADDRESS      0x000C0000
-#define ETHERBOOT_PHYSICAL_ADDRESS    0x000D0000
-#define SMBIOS_PHYSICAL_ADDRESS       0x000E9000
-#define SMBIOS_MAXIMUM_SIZE           0x00001000
-#define ACPI_PHYSICAL_ADDRESS         0x000EA000
+#define OPTIONROM_PHYSICAL_ADDRESS    0x000C8000
+#define OPTIONROM_PHYSICAL_END        0x000EA000
+#define BIOS_INFO_PHYSICAL_ADDRESS    0x000EA000
+#define ACPI_PHYSICAL_ADDRESS         0x000EA020
+#define E820_PHYSICAL_ADDRESS         0x000EA100
+#define SMBIOS_PHYSICAL_ADDRESS       0x000EB000
+#define SMBIOS_MAXIMUM_SIZE           0x00005000
 #define ROMBIOS_PHYSICAL_ADDRESS      0x000F0000
-#define SCRATCH_PHYSICAL_ADDRESS      0x00010000
+
+/* Offsets from E820_PHYSICAL_ADDRESS. */
+#define E820_NR_OFFSET                0x0
+#define E820_OFFSET                   0x8
 
 /* Xen Platform Device */
+#define XEN_PF_IOBASE   0x10
 #define PFFLAG_ROM_LOCK 1 /* Sets whether ROM memory area is RW or RO */
 
+/* Located at BIOS_INFO_PHYSICAL_ADDRESS. */
 struct bios_info {
-    uint8_t  com1_present:1;
-    uint8_t  com2_present:1;
-    uint8_t  hpet_present:1;
-    uint32_t pci_min, pci_len;
-    uint16_t xen_pfiob;
+    uint8_t  com1_present:1;    /* 0[0] - System has COM1? */
+    uint8_t  com2_present:1;    /* 0[1] - System has COM2? */
+    uint8_t  hpet_present:1;    /* 0[2] - System has HPET? */
+    uint32_t pci_min, pci_len;  /* 4, 8 - PCI I/O hole boundaries */
+    uint32_t bios32_entry;      /* 12   - Entry point for 32-bit BIOS */
 };
+#define BIOSINFO_OFF_bios32_entry 12
 
 #endif /* __HVMLOADER_CONFIG_H__ */
index f59f70cbaeb58a1cf55ff0c0c28fe2f79829e58d..940c8ed3949cb69f264f3baee50d3ed9324008a0 100644 (file)
@@ -17,7 +17,7 @@ struct e820entry {
     uint32_t type;
 } __attribute__((packed));
 
-#define HVM_E820_NR ((unsigned char *)HVM_E820_PAGE + HVM_E820_NR_OFFSET)
-#define HVM_E820    ((struct e820entry *)(HVM_E820_PAGE + HVM_E820_OFFSET))
+#define E820_NR ((uint16_t *)(E820_PHYSICAL_ADDRESS + E820_NR_OFFSET))
+#define E820    ((struct e820entry *)(E820_PHYSICAL_ADDRESS + E820_OFFSET))
 
 #endif /* __HVMLOADER_E820_H__ */
index 9dff7cc08dbd7bb57a753897a6fc4da6be20328a..39aa949e1b60ddda0fe7d0596e19d09f51c5f3f4 100644 (file)
@@ -31,6 +31,7 @@
 #include "option_rom.h"
 #include <xen/version.h>
 #include <xen/hvm/params.h>
+#include <xen/memory.h>
 
 asm (
     "    .text                       \n"
@@ -99,6 +100,9 @@ asm (
     "    .text                       \n"
     );
 
+unsigned long pci_mem_start = PCI_MEM_START;
+unsigned long pci_mem_end = PCI_MEM_END;
+
 static enum { VGA_none, VGA_std, VGA_cirrus } virtual_vga = VGA_none;
 
 static void init_hypercalls(void)
@@ -148,16 +152,14 @@ static void apic_setup(void)
 
 static void pci_setup(void)
 {
-    uint32_t base, devfn, bar_reg, bar_data, bar_sz, cmd;
+    uint32_t base, devfn, bar_reg, bar_data, bar_sz, cmd, mmio_total = 0;
     uint16_t class, vendor_id, device_id;
     unsigned int bar, pin, link, isa_irq;
 
     /* Resources assignable to PCI devices via BARs. */
     struct resource {
         uint32_t base, max;
-    } *resource;
-    struct resource mem_resource = { PCI_MEMBASE, PCI_MEMBASE + PCI_MEMSIZE };
-    struct resource io_resource  = { 0xc000, 0x10000 };
+    } *resource, mem_resource, io_resource;
 
     /* Create a list of device BARs in descending order of size. */
     struct bars {
@@ -248,6 +250,10 @@ static void pci_setup(void)
             bars[i].bar_reg = bar_reg;
             bars[i].bar_sz  = bar_sz;
 
+            if ( (bar_data & PCI_BASE_ADDRESS_SPACE) ==
+                 PCI_BASE_ADDRESS_SPACE_MEMORY )
+                mmio_total += bar_sz;
+
             nr_bars++;
 
             /* Skip the upper-half of the address for a 64-bit BAR. */
@@ -269,8 +275,35 @@ static void pci_setup(void)
             printf("pci dev %02x:%x INT%c->IRQ%u\n",
                    devfn>>3, devfn&7, 'A'+pin-1, isa_irq);
         }
+
+        /* Enable bus mastering. */
+        cmd = pci_readw(devfn, PCI_COMMAND);
+        cmd |= PCI_COMMAND_MASTER;
+        pci_writew(devfn, PCI_COMMAND, cmd);
     }
 
+    while ( (mmio_total > (pci_mem_end - pci_mem_start)) &&
+            ((pci_mem_start << 1) != 0) )
+        pci_mem_start <<= 1;
+
+    while ( (pci_mem_start >> PAGE_SHIFT) < hvm_info->low_mem_pgend )
+    {
+        struct xen_add_to_physmap xatp;
+        if ( hvm_info->high_mem_pgend == 0 )
+            hvm_info->high_mem_pgend = 1ull << (32 - PAGE_SHIFT);
+        xatp.domid = DOMID_SELF;
+        xatp.space = XENMAPSPACE_gmfn;
+        xatp.idx   = --hvm_info->low_mem_pgend;
+        xatp.gpfn  = hvm_info->high_mem_pgend++;
+        if ( hypercall_memory_op(XENMEM_add_to_physmap, &xatp) != 0 )
+            BUG();
+    }
+
+    mem_resource.base = pci_mem_start;
+    mem_resource.max = pci_mem_end;
+    io_resource.base = 0xc000;
+    io_resource.max = 0x10000;
+
     /* Assign iomem and ioport resources in descending order of size. */
     for ( i = 0; i < nr_bars; i++ )
     {
@@ -322,60 +355,56 @@ static void pci_setup(void)
 }
 
 /*
- * Scan the PCI bus for the first NIC supported by etherboot, and copy
- * the corresponding rom data to *copy_rom_dest. Returns the length of the
- * selected rom, or 0 if no NIC found.
+ * Scan the list of Option ROMs at @roms for one which supports 
+ * PCI (@vendor_id, @device_id) found at slot @devfn. If one is found,
+ * copy it to @dest and return its size rounded up to a multiple of 2kB. This
+ * function will not copy ROMs beyond address OPTIONROM_PHYSICAL_END.
  */
-static int scan_etherboot_nic(void *copy_rom_dest)
+#define round_option_rom(x) (((x) + 2047) & ~2047)
+static int scan_option_rom(
+    uint8_t devfn, uint16_t vendor_id, uint16_t device_id,
+    void *roms, uint32_t dest)
 {
     struct option_rom_header *rom;
     struct option_rom_pnp_header *pnph;
     struct option_rom_pci_header *pcih;
-    uint32_t devfn;
-    uint16_t class, vendor_id, device_id;
     uint8_t csum;
     int i;
 
-    for ( devfn = 0; devfn < 128; devfn++ )
-    {
-        class     = pci_readw(devfn, PCI_CLASS_DEVICE);
-        vendor_id = pci_readw(devfn, PCI_VENDOR_ID);
-        device_id = pci_readw(devfn, PCI_DEVICE_ID);
+    static uint32_t orom_ids[64];
+    static int nr_roms;
 
-        if ( (vendor_id == 0xffff) && (device_id == 0xffff) )
-            continue;
+    /* Avoid duplicate ROMs. */
+    for ( i = 0; i < nr_roms; i++ )
+        if ( orom_ids[i] == (vendor_id | ((uint32_t)device_id << 16)) )
+            return 0;
 
-        /* We're only interested in NICs. */
-        if ( class != 0x0200 )
-            continue;
+    rom = roms;
+    for ( ; ; )
+    {
+        /* Invalid signature means we're out of option ROMs. */
+        if ( strncmp((char *)rom->signature, "\x55\xaa", 2) ||
+             (rom->rom_size == 0) )
+            break;
 
-        rom = (struct option_rom_header *)etherboot;
-        for ( ; ; )
-        {
-            /* Invalid signature means we're out of option ROMs. */
-            if ( strncmp((char *)rom->signature, "\x55\xaa", 2) ||
-                 (rom->rom_size == 0) )
-                break;
-
-            /* Invalid checksum means we're out of option ROMs. */
-            csum = 0;
-            for ( i = 0; i < (rom->rom_size * 512); i++ )
-                csum += ((uint8_t *)rom)[i];
-            if ( csum != 0 )
-                break;
-
-            /* Check the PCI PnP header (if any) for a match. */
-            pcih = (struct option_rom_pci_header *)
-                ((char *)rom + rom->pci_header_offset);
-            if ( (rom->pci_header_offset != 0) &&
-                 !strncmp((char *)pcih->signature, "PCIR", 4) &&
-                 (pcih->vendor_id == vendor_id) &&
-                 (pcih->device_id == device_id) )
-                goto found;
-
-            rom = (struct option_rom_header *)
-                ((char *)rom + rom->rom_size * 512);
-        }
+        /* Invalid checksum means we're out of option ROMs. */
+        csum = 0;
+        for ( i = 0; i < (rom->rom_size * 512); i++ )
+            csum += ((uint8_t *)rom)[i];
+        if ( csum != 0 )
+            break;
+
+        /* Check the PCI PnP header (if any) for a match. */
+        pcih = (struct option_rom_pci_header *)
+            ((char *)rom + rom->pci_header_offset);
+        if ( (rom->pci_header_offset != 0) &&
+             !strncmp((char *)pcih->signature, "PCIR", 4) &&
+             (pcih->vendor_id == vendor_id) &&
+             (pcih->device_id == device_id) )
+            goto found;
+
+        rom = (struct option_rom_header *)
+            ((char *)rom + rom->rom_size * 512);
     }
 
     return 0;
@@ -392,37 +421,109 @@ static int scan_etherboot_nic(void *copy_rom_dest)
                    ((char *)rom + pnph->next_header_offset))
                 : ((struct option_rom_pnp_header *)NULL));
 
-    printf("Loading PXE ROM ...\n");
+    printf("Loading PCI Option ROM ...\n");
     if ( (pnph != NULL) && (pnph->manufacturer_name_offset != 0) )
         printf(" - Manufacturer: %s\n",
                (char *)rom + pnph->manufacturer_name_offset);
     if ( (pnph != NULL) && (pnph->product_name_offset != 0) )
         printf(" - Product name: %s\n",
                (char *)rom + pnph->product_name_offset);
-    memcpy(copy_rom_dest, rom, rom->rom_size * 512);
-    return rom->rom_size * 512;
+
+    if ( (dest + rom->rom_size * 512 + 1) > OPTIONROM_PHYSICAL_END )
+    {
+        printf("Option ROM size %x exceeds available space\n",
+               rom->rom_size * 512);
+        return 0;
+    }
+
+    orom_ids[nr_roms++] = vendor_id | ((uint32_t)device_id << 16);
+    memcpy((void *)dest, rom, rom->rom_size * 512);
+    *(uint8_t *)(dest + rom->rom_size * 512) = devfn;
+    return round_option_rom(rom->rom_size * 512 + 1);
 }
 
-/* Replace possibly erroneous memory-size CMOS fields with correct values. */
-static void cmos_write_memory_size(void)
+/*
+ * Scan the PCI bus for the first NIC supported by etherboot, and copy
+ * the corresponding rom data to *copy_rom_dest. Returns the length of the
+ * selected rom, or 0 if no NIC found.
+ */
+static int scan_etherboot_nic(uint32_t copy_rom_dest)
 {
-    struct e820entry *map = HVM_E820;
-    int i, nr = *HVM_E820_NR;
-    uint32_t base_mem = 640, ext_mem = 0, alt_mem = 0;
+    uint8_t devfn;
+    uint16_t class, vendor_id, device_id;
 
-    for ( i = 0; i < nr; i++ )
-        if ( (map[i].addr >= 0x100000) && (map[i].type == E820_RAM) )
-            break;
+    for ( devfn = 0; devfn < 128; devfn++ )
+    {
+        class     = pci_readw(devfn, PCI_CLASS_DEVICE);
+        vendor_id = pci_readw(devfn, PCI_VENDOR_ID);
+        device_id = pci_readw(devfn, PCI_DEVICE_ID);
+
+        /* We're only interested in NICs. */
+        if ( (vendor_id != 0xffff) &&
+             (device_id != 0xffff) &&
+             (class == 0x0200) )
+            return scan_option_rom(
+                devfn, vendor_id, device_id, etherboot, copy_rom_dest);
+    }
+
+    return 0;
+}
 
-    if ( i != nr )
+/*
+ * Scan the PCI bus for the devices that have an option ROM, and copy
+ * the corresponding rom data to rom_phys_addr.
+ */
+static int pci_load_option_roms(uint32_t rom_base_addr)
+{
+    uint32_t option_rom_addr, rom_phys_addr = rom_base_addr;
+    uint16_t vendor_id, device_id;
+    uint8_t devfn, class;
+
+    for ( devfn = 0; devfn < 128; devfn++ )
     {
-        alt_mem = ext_mem = map[i].addr + map[i].size;
-        ext_mem = (ext_mem > 0x0100000) ? (ext_mem - 0x0100000) >> 10 : 0;
-        if ( ext_mem > 0xffff )
-            ext_mem = 0xffff;
-        alt_mem = (alt_mem > 0x1000000) ? (alt_mem - 0x1000000) >> 16 : 0;
+        class     = pci_readb(devfn, PCI_CLASS_DEVICE + 1);
+        vendor_id = pci_readw(devfn, PCI_VENDOR_ID);
+        device_id = pci_readw(devfn, PCI_DEVICE_ID);
+
+        if ( (vendor_id == 0xffff) && (device_id == 0xffff) )
+            continue;
+
+        /*
+         * Currently only scan options from mass storage devices and serial
+         * bus controller (Fibre Channel included).
+         */
+        if ( (class != 0x1) && (class != 0xc) )
+            continue;
+
+        option_rom_addr = pci_readl(devfn, PCI_ROM_ADDRESS);
+        if ( !option_rom_addr )
+            continue;
+
+        /* Ensure Expansion Bar is enabled before copying */
+        pci_writel(devfn, PCI_ROM_ADDRESS, option_rom_addr | 0x1);
+
+        rom_phys_addr += scan_option_rom(
+            devfn, vendor_id, device_id,
+            (void *)(option_rom_addr & ~2047), rom_phys_addr);
+
+        /* Restore the default original value of Expansion Bar */
+        pci_writel(devfn, PCI_ROM_ADDRESS, option_rom_addr);
     }
 
+    return rom_phys_addr - rom_base_addr;
+}
+
+/* Replace possibly erroneous memory-size CMOS fields with correct values. */
+static void cmos_write_memory_size(void)
+{
+    uint32_t base_mem = 640, ext_mem, alt_mem;
+
+    alt_mem = ext_mem = hvm_info->low_mem_pgend << PAGE_SHIFT;
+    ext_mem = (ext_mem > 0x0100000) ? (ext_mem - 0x0100000) >> 10 : 0;
+    if ( ext_mem > 0xffff )
+        ext_mem = 0xffff;
+    alt_mem = (alt_mem > 0x1000000) ? (alt_mem - 0x1000000) >> 16 : 0;
+
     /* All BIOSes: conventional memory (CMOS *always* reports 640kB). */
     cmos_outb(0x15, (uint8_t)(base_mem >> 0));
     cmos_outb(0x16, (uint8_t)(base_mem >> 8));
@@ -438,32 +539,96 @@ static void cmos_write_memory_size(void)
     cmos_outb(0x35, (uint8_t)( alt_mem >> 8));
 }
 
-static uint16_t init_xen_platform_io_base(void)
+/*
+ * Set up an empty TSS area for virtual 8086 mode to use. 
+ * The only important thing is that it musn't have any bits set 
+ * in the interrupt redirection bitmap, so all zeros will do.
+ */
+static void init_vm86_tss(void)
 {
-    struct bios_info *bios_info = (struct bios_info *)ACPI_PHYSICAL_ADDRESS;
-    uint32_t devfn, bar_data;
-    uint16_t vendor_id, device_id;
-
-    bios_info->xen_pfiob = 0;
+    void *tss;
+    struct xen_hvm_param p;
+
+    tss = mem_alloc(128, 128);
+    memset(tss, 0, 128);
+    p.domid = DOMID_SELF;
+    p.index = HVM_PARAM_VM86_TSS;
+    p.value = virt_to_phys(tss);
+    hypercall_hvm_op(HVMOP_set_param, &p);
+    printf("vm86 TSS at %08lx\n", virt_to_phys(tss));
+}
 
-    for ( devfn = 0; devfn < 128; devfn++ )
+/* Create an E820 table based on memory parameters provided in hvm_info. */
+static void build_e820_table(void)
+{
+    struct e820entry *e820 = E820;
+    unsigned int nr = 0;
+
+    /* 0x0-0x9FC00: Ordinary RAM. */
+    e820[nr].addr = 0x0;
+    e820[nr].size = 0x9FC00;
+    e820[nr].type = E820_RAM;
+    nr++;
+
+    /* 0x9FC00-0xA0000: Extended BIOS Data Area (EBDA). */
+    e820[nr].addr = 0x9FC00;
+    e820[nr].size = 0x400;
+    e820[nr].type = E820_RESERVED;
+    nr++;
+
+    /*
+     * Following regions are standard regions of the PC memory map.
+     * They are not covered by e820 regions. OSes will not use them as RAM.
+     * 0xA0000-0xC0000: VGA memory-mapped I/O. Not covered by E820.
+     * 0xC0000-0xE0000: 16-bit devices, expansion ROMs (inc. vgabios).
+     * TODO: free pages which turn out to be unused.
+     */
+
+    /*
+     * 0xE0000-0x0F0000: PC-specific area. We place various tables here.
+     * 0xF0000-0x100000: System BIOS.
+     * TODO: free pages which turn out to be unused.
+     */
+    e820[nr].addr = 0xE0000;
+    e820[nr].size = 0x20000;
+    e820[nr].type = E820_RESERVED;
+    nr++;
+
+    /* Low RAM goes here. Reserve space for special pages. */
+    BUG_ON((hvm_info->low_mem_pgend << PAGE_SHIFT) < (2u << 20));
+    e820[nr].addr = 0x100000;
+    e820[nr].size = (hvm_info->low_mem_pgend << PAGE_SHIFT) - e820[nr].addr;
+    e820[nr].type = E820_RAM;
+    nr++;
+
+    /*
+     * Explicitly reserve space for special pages.
+     * This space starts at RESERVED_MEMBASE and extends to cover various
+     * fixed hardware mappings (e.g., LAPIC, IOAPIC, default SVGA framebuffer).
+     */
+    e820[nr].addr = RESERVED_MEMBASE;
+    e820[nr].size = (uint32_t)-e820[nr].addr;
+    e820[nr].type = E820_RESERVED;
+    nr++;
+
+    if ( hvm_info->high_mem_pgend )
     {
-        vendor_id = pci_readw(devfn, PCI_VENDOR_ID);
-        device_id = pci_readw(devfn, PCI_DEVICE_ID);
-        if ( (vendor_id != 0x5853) || (device_id != 0x0001) )
-            continue;
-        bar_data = pci_readl(devfn, PCI_BASE_ADDRESS_0);
-        bios_info->xen_pfiob = bar_data & PCI_BASE_ADDRESS_IO_MASK;
+        e820[nr].addr = ((uint64_t)1 << 32);
+        e820[nr].size =
+            ((uint64_t)hvm_info->high_mem_pgend << PAGE_SHIFT) - e820[nr].addr;
+        e820[nr].type = E820_RAM;
+        nr++;
     }
 
-    return bios_info->xen_pfiob;
+    *E820_NR = nr;
 }
 
 int main(void)
 {
-    int vgabios_sz = 0, etherboot_sz = 0, rombios_sz, smbios_sz;
-    uint32_t vga_ram = 0;
-    uint16_t xen_pfiob;
+    int option_rom_sz = 0, vgabios_sz = 0, etherboot_sz = 0;
+    int rombios_sz, smbios_sz;
+    uint32_t etherboot_phys_addr, option_rom_phys_addr, bios32_addr;
+    struct bios_info *bios_info;
 
     printf("HVM Loader\n");
 
@@ -471,6 +636,9 @@ int main(void)
 
     printf("CPU speed is %u MHz\n", get_cpu_mhz());
 
+    apic_setup();
+    pci_setup();
+
     smp_initialise();
 
     perform_tests();
@@ -483,12 +651,9 @@ int main(void)
     if ( rombios_sz > 0x10000 )
         rombios_sz = 0x10000;
     memcpy((void *)ROMBIOS_PHYSICAL_ADDRESS, rombios, rombios_sz);
-    highbios_setup();
-
-    apic_setup();
-    pci_setup();
+    bios32_addr = highbios_setup();
 
-    if ( (get_vcpu_nr() > 1) || get_apic_mode() )
+    if ( (hvm_info->nr_vcpus > 1) || hvm_info->apic_mode )
         create_mp_tables();
 
     switch ( virtual_vga )
@@ -497,33 +662,35 @@ int main(void)
         printf("Loading Cirrus VGABIOS ...\n");
         memcpy((void *)VGABIOS_PHYSICAL_ADDRESS,
                vgabios_cirrusvga, sizeof(vgabios_cirrusvga));
-        vgabios_sz = sizeof(vgabios_cirrusvga);
+        vgabios_sz = round_option_rom(sizeof(vgabios_cirrusvga));
         break;
     case VGA_std:
         printf("Loading Standard VGABIOS ...\n");
         memcpy((void *)VGABIOS_PHYSICAL_ADDRESS,
                vgabios_stdvga, sizeof(vgabios_stdvga));
-        vgabios_sz = sizeof(vgabios_stdvga);
+        vgabios_sz = round_option_rom(sizeof(vgabios_stdvga));
         break;
     default:
         printf("No emulated VGA adaptor ...\n");
         break;
     }
 
-    if ( virtual_vga != VGA_none )
-    {
-        vga_ram = e820_malloc(8 << 20, 4096);
-        printf("VGA RAM at %08x\n", vga_ram);
-    }
+    etherboot_phys_addr = VGABIOS_PHYSICAL_ADDRESS + vgabios_sz;
+    if ( etherboot_phys_addr < OPTIONROM_PHYSICAL_ADDRESS )
+        etherboot_phys_addr = OPTIONROM_PHYSICAL_ADDRESS;
+    etherboot_sz = scan_etherboot_nic(etherboot_phys_addr);
 
-    etherboot_sz = scan_etherboot_nic((void*)ETHERBOOT_PHYSICAL_ADDRESS);
+    option_rom_phys_addr = etherboot_phys_addr + etherboot_sz;
+    option_rom_sz = pci_load_option_roms(option_rom_phys_addr);
 
-    if ( get_acpi_enabled() )
+    if ( hvm_info->acpi_enabled )
     {
         printf("Loading ACPI ...\n");
         acpi_build_tables();
     }
 
+    init_vm86_tss();
+
     cmos_write_memory_size();
 
     printf("BIOS map:\n");
@@ -533,8 +700,12 @@ int main(void)
                VGABIOS_PHYSICAL_ADDRESS + vgabios_sz - 1);
     if ( etherboot_sz )
         printf(" %05x-%05x: Etherboot ROM\n",
-               ETHERBOOT_PHYSICAL_ADDRESS,
-               ETHERBOOT_PHYSICAL_ADDRESS + etherboot_sz - 1);
+               etherboot_phys_addr,
+               etherboot_phys_addr + etherboot_sz - 1);
+    if ( option_rom_sz )
+        printf(" %05x-%05x: PCI Option ROMs\n",
+               option_rom_phys_addr,
+               option_rom_phys_addr + option_rom_sz - 1);
     if ( smbios_sz )
         printf(" %05x-%05x: SMBIOS tables\n",
                SMBIOS_PHYSICAL_ADDRESS,
@@ -544,9 +715,16 @@ int main(void)
                ROMBIOS_PHYSICAL_ADDRESS,
                ROMBIOS_PHYSICAL_ADDRESS + rombios_sz - 1);
 
-    xen_pfiob = init_xen_platform_io_base();
-    if ( xen_pfiob && vga_ram )
-        outl(xen_pfiob + 4, vga_ram);
+    build_e820_table();
+
+    bios_info = (struct bios_info *)BIOS_INFO_PHYSICAL_ADDRESS;
+    memset(bios_info, 0, sizeof(*bios_info));
+    bios_info->com1_present = uart_exists(0x3f8);
+    bios_info->com2_present = uart_exists(0x2f8);
+    bios_info->hpet_present = hpet_exists(ACPI_HPET_ADDRESS);
+    bios_info->pci_min = pci_mem_start;
+    bios_info->pci_len = pci_mem_end - pci_mem_start;
+    bios_info->bios32_entry = bios32_addr;
 
     printf("Invoking ROMBIOS ...\n");
     return 0;
index 2c42f6e4b469d4793d11c06188174396be87c272..ad09edafa0d83d5d089d8a7fbdf3b89fd992f3a2 100644 (file)
@@ -155,7 +155,7 @@ static void fill_mp_config_table(struct mp_config_table *mpct, int length)
     int vcpu_nr, i;
     uint8_t checksum;
 
-    vcpu_nr = get_vcpu_nr();
+    vcpu_nr = hvm_info->nr_vcpus;
 
     /* fill in the MP configuration table signature, "PCMP" */
     mpct->signature[0] = 'P';
@@ -317,7 +317,7 @@ void create_mp_tables(void)
     char *p;
     int vcpu_nr, i, length;
 
-    vcpu_nr = get_vcpu_nr();
+    vcpu_nr = hvm_info->nr_vcpus;
 
     printf("Creating MP tables ...\n");
 
index e1464220d0fda8ef0236a09637bba3d0b93b6d56..64fa799c0b882d9d515142951e603a7ccef16932 100644 (file)
@@ -118,8 +118,9 @@ write_smbios_tables(void *start,
     do_struct(smbios_type_16_init(p, memsize, nr_mem_devs));
     for ( i = 0; i < nr_mem_devs; i++ )
     {
-        uint32_t dev_memsize = ((i == (nr_mem_devs - 1))
-                                ? (memsize & 0x3fff) : 0x4000);
+        uint32_t dev_memsize = 0x4000; /* all but last covers 16GB */
+        if ( (i == (nr_mem_devs - 1)) && ((memsize & 0x3fff) != 0) )
+            dev_memsize = memsize & 0x3fff; /* last dev is <16GB */
         do_struct(smbios_type_17_init(p, dev_memsize, i));
         do_struct(smbios_type_19_init(p, dev_memsize, i));
         do_struct(smbios_type_20_init(p, dev_memsize, i));
@@ -143,28 +144,18 @@ write_smbios_tables(void *start,
 static uint64_t
 get_memsize(void)
 {
-    struct e820entry *map = HVM_E820;
-    uint8_t num_entries = *HVM_E820_NR;
-    uint64_t memsize = 0;
-    int i;
+    uint64_t sz;
 
-    /*
-     * Walk through e820map, ignoring any entries that aren't marked
-     * as usable or reserved.
-     */
-    for ( i = 0; i < num_entries; i++ )
-    {
-        if ( (map->type == E820_RAM) || (map->type == E820_RESERVED) )
-            memsize += map->size;
-        map++;
-    }
+    sz = (uint64_t)hvm_info->low_mem_pgend << PAGE_SHIFT;
+    if ( hvm_info->high_mem_pgend )
+        sz += (hvm_info->high_mem_pgend << PAGE_SHIFT) - (1ull << 32);
 
     /*
      * Round up to the nearest MB.  The user specifies domU pseudo-physical 
      * memory in megabytes, so not doing this could easily lead to reporting 
      * one less MB than the user specified.
      */
-    return (memsize + (1 << 20) - 1) >> 20;
+    return (sz + (1ul << 20) - 1) >> 20;
 }
 
 int
@@ -229,7 +220,7 @@ hvm_write_smbios_tables(void)
 
     /* SCRATCH_PHYSICAL_ADDRESS is a safe large memory area for scratch. */
     len = write_smbios_tables((void *)SCRATCH_PHYSICAL_ADDRESS,
-                              get_vcpu_nr(), get_memsize(),
+                              hvm_info->nr_vcpus, get_memsize(),
                               uuid, xen_version_str,
                               xen_major_version, xen_minor_version);
     if ( len > SMBIOS_MAXIMUM_SIZE )
index f64f73e3f0d6515711404c34fc2dfebba6c420fb..76d1c280403c315f09c189697074a512f3babd91 100644 (file)
@@ -121,7 +121,7 @@ static void boot_cpu(unsigned int cpu)
 
 void smp_initialise(void)
 {
-    unsigned int i, nr_cpus = get_vcpu_nr();
+    unsigned int i, nr_cpus = hvm_info->nr_vcpus;
 
     memcpy((void *)AP_BOOT_EIP, ap_boot_start, ap_boot_end - ap_boot_start);
 
index fe33b0f124069722687e7108425bc674942e9517..ccf9bf94d3f0a49302bd8f168f094af14f5b91f8 100644 (file)
@@ -25,7 +25,6 @@
 #include <stdint.h>
 #include <xen/xen.h>
 #include <xen/memory.h>
-#include <xen/hvm/hvm_info_table.h>
 
 void wrmsr(uint32_t idx, uint64_t v)
 {
@@ -304,63 +303,63 @@ uuid_to_string(char *dest, uint8_t *uuid)
     *p = '\0';
 }
 
-static void e820_collapse(void)
+void *mem_alloc(uint32_t size, uint32_t align)
 {
-    int i = 0;
-    struct e820entry *ent = (struct e820entry *)HVM_E820;
-
-    while ( i < (*HVM_E820_NR-1) )
-    {
-        if ( (ent[i].type == ent[i+1].type) &&
-             ((ent[i].addr + ent[i].size) == ent[i+1].addr) )
-        {
-            ent[i].size += ent[i+1].size;
-            memcpy(&ent[i+1], &ent[i+2], (*HVM_E820_NR-i-2) * sizeof(*ent));
-            (*HVM_E820_NR)--;
-        }
-        else
-        {
-            i++;
-        }
-    }
-}
-
-uint32_t e820_malloc(uint32_t size, uint32_t align)
-{
-    uint32_t addr;
-    int i;
-    struct e820entry *ent = (struct e820entry *)HVM_E820;
+    static uint32_t reserve = RESERVED_MEMBASE - 1;
+    static int over_allocated;
+    struct xen_add_to_physmap xatp;
+    struct xen_memory_reservation xmr;
+    xen_pfn_t mfn;
+    uint32_t s, e;
 
-    /* Align to at leats one kilobyte. */
+    /* Align to at least one kilobyte. */
     if ( align < 1024 )
         align = 1024;
 
-    for ( i = *HVM_E820_NR - 1; i >= 0; i-- )
+    s = (reserve + align) & ~(align - 1);
+    e = s + size - 1;
+
+    BUG_ON((e < s) || (e >> PAGE_SHIFT) >= hvm_info->reserved_mem_pgstart);
+
+    while ( (reserve >> PAGE_SHIFT) != (e >> PAGE_SHIFT) )
     {
-        addr = (ent[i].addr + ent[i].size - size) & ~(align-1);
-        if ( (ent[i].type != E820_RAM) || /* not ram? */
-             (addr < ent[i].addr) ||      /* too small or starts above 4gb? */
-             ((addr + size) < addr) )     /* ends above 4gb? */
-            continue;
+        reserve += PAGE_SIZE;
+        mfn = reserve >> PAGE_SHIFT;
 
-        if ( addr != ent[i].addr )
+        /* Try to allocate a brand new page in the reserved area. */
+        if ( !over_allocated )
         {
-            memmove(&ent[i+1], &ent[i], (*HVM_E820_NR-i) * sizeof(*ent));
-            (*HVM_E820_NR)++;
-            ent[i].size = addr - ent[i].addr;
-            ent[i+1].addr = addr;
-            ent[i+1].size -= ent[i].size;
-            i++;
+            xmr.domid = DOMID_SELF;
+            xmr.mem_flags = 0;
+            xmr.extent_order = 0;
+            xmr.nr_extents = 1;
+            set_xen_guest_handle(xmr.extent_start, &mfn);
+            if ( hypercall_memory_op(XENMEM_populate_physmap, &xmr) == 1 )
+                continue;
+            over_allocated = 1;
         }
 
-        ent[i].type = E820_RESERVED;
-
-        e820_collapse();
-
-        return addr;
+        /* Otherwise, relocate a page from the ordinary RAM map. */
+        if ( hvm_info->high_mem_pgend )
+        {
+            xatp.idx = --hvm_info->high_mem_pgend;
+            if ( xatp.idx == (1ull << (32 - PAGE_SHIFT)) )
+                hvm_info->high_mem_pgend = 0;
+        }
+        else
+        {
+            xatp.idx = --hvm_info->low_mem_pgend;
+        }
+        xatp.domid = DOMID_SELF;
+        xatp.space = XENMAPSPACE_gmfn;
+        xatp.gpfn  = mfn;
+        if ( hypercall_memory_op(XENMEM_add_to_physmap, &xatp) != 0 )
+            BUG();
     }
 
-    return 0;
+    reserve = e;
+
+    return (void *)(unsigned long)s;
 }
 
 uint32_t ioapic_read(uint32_t reg)
@@ -543,30 +542,35 @@ void __bug(char *file, int line)
         asm volatile ( "ud2" );
 }
 
-static int validate_hvm_info(struct hvm_info_table *t)
+static void validate_hvm_info(struct hvm_info_table *t)
 {
-    char signature[] = "HVM INFO";
     uint8_t *ptr = (uint8_t *)t;
     uint8_t sum = 0;
     int i;
 
-    /* strncmp(t->signature, "HVM INFO", 8) */
-    for ( i = 0; i < 8; i++ )
+    if ( strncmp(t->signature, "HVM INFO", 8) )
     {
-        if ( signature[i] != t->signature[i] )
-        {
-            printf("Bad hvm info signature\n");
-            return 0;
-        }
+        printf("Bad hvm info signature\n");
+        BUG();
+    }
+
+    if ( t->length < sizeof(struct hvm_info_table) )
+    {
+        printf("Bad hvm info length\n");
+        BUG();
     }
 
     for ( i = 0; i < t->length; i++ )
         sum += ptr[i];
 
-    return (sum == 0);
+    if ( sum != 0 )
+    {
+        printf("Bad hvm info checksum\n");
+        BUG();
+    }
 }
 
-static struct hvm_info_table *get_hvm_info_table(void)
+struct hvm_info_table *get_hvm_info_table(void)
 {
     static struct hvm_info_table *table;
     struct hvm_info_table *t;
@@ -576,35 +580,13 @@ static struct hvm_info_table *get_hvm_info_table(void)
 
     t = (struct hvm_info_table *)HVM_INFO_PADDR;
 
-    if ( !validate_hvm_info(t) )
-    {
-        printf("Bad hvm info table\n");
-        return NULL;
-    }
+    validate_hvm_info(t);
 
     table = t;
 
     return table;
 }
 
-int get_vcpu_nr(void)
-{
-    struct hvm_info_table *t = get_hvm_info_table();
-    return (t ? t->nr_vcpus : 1);
-}
-
-int get_acpi_enabled(void)
-{
-    struct hvm_info_table *t = get_hvm_info_table();
-    return (t ? t->acpi_enabled : 1);
-}
-
-int get_apic_mode(void)
-{
-    struct hvm_info_table *t = get_hvm_info_table();
-    return (t ? t->apic_mode : 1);
-}
-
 uint16_t get_cpu_mhz(void)
 {
     struct xen_add_to_physmap xatp;
@@ -647,6 +629,27 @@ uint16_t get_cpu_mhz(void)
     return cpu_mhz;
 }
 
+int uart_exists(uint16_t uart_base)
+{
+    uint16_t ier = uart_base + 1;
+    uint8_t a, b, c;
+
+    a = inb(ier);
+    outb(ier, 0);
+    b = inb(ier);
+    outb(ier, 0xf);
+    c = inb(ier);
+    outb(ier, a);
+
+    return ((b == 0) && (c == 0xf));
+}
+
+int hpet_exists(unsigned long hpet_base)
+{
+    uint32_t hpet_id = *(uint32_t *)hpet_base;
+    return ((hpet_id >> 16) == 0x8086);
+}
+
 /*
  * Local variables:
  * mode: C
index 81f0e4f4c44d3f77bebb0f5ce75c33dbcc29cd34..e9e0dfd0729aa0a81ba7aef117475fd0d02d5f3c 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <stdarg.h>
 #include <stdint.h>
+#include <xen/hvm/hvm_info_table.h>
 
 #undef offsetof
 #define offsetof(t, m) ((unsigned long)&((t *)0)->m)
@@ -56,6 +57,10 @@ void pci_write(uint32_t devfn, uint32_t reg, uint32_t len, uint32_t val);
 /* Get CPU speed in MHz. */
 uint16_t get_cpu_mhz(void);
 
+/* Hardware detection. */
+int uart_exists(uint16_t uart_base);
+int hpet_exists(unsigned long hpet_base);
+
 /* Do cpuid instruction, with operation 'idx' */
 void cpuid(uint32_t idx, uint32_t *eax, uint32_t *ebx,
            uint32_t *ecx, uint32_t *edx);
@@ -103,9 +108,8 @@ static inline void cpu_relax(void)
 })
 
 /* HVM-builder info. */
-int get_vcpu_nr(void);
-int get_acpi_enabled(void);
-int get_apic_mode(void);
+struct hvm_info_table *get_hvm_info_table(void);
+#define hvm_info (get_hvm_info_table())
 
 /* String and memory functions */
 int strcmp(const char *cs, const char *ct);
@@ -131,11 +135,12 @@ void uuid_to_string(char *dest, uint8_t *uuid);
 int printf(const char *fmt, ...) __attribute__ ((format (printf, 1, 2)));
 int vprintf(const char *fmt, va_list ap);
 
-/* Reserve a RAM region in the e820 table. */
-uint32_t e820_malloc(uint32_t size, uint32_t align);
+/* Allocate memory in a reserved region below 4GB. */
+void *mem_alloc(uint32_t size, uint32_t align);
+#define virt_to_phys(v) ((unsigned long)(v))
 
 /* Prepare the 32bit BIOS */
-void highbios_setup(void);
+uint32_t highbios_setup(void);
 
 /* Miscellaneous. */
 void cacheattr_init(void);
index 551a9ffdc038349b395202fdcb86117da743dc85..22f83f4cf01f646ed786c93480fcc8924a0c01e5 100644 (file)
  *
  * Author: Stefan Berger <stefanb@us.ibm.com>
  */
-#include "rombios_compat.h"
-#include "32bitprotos.h"
-
-/*
-   the jumptable that will be copied into the rombios in the 0xf000 segment
-   for every function that is to be called from the lower BIOS, make an entry
-   here.
- */
-#define TABLE_ENTRY(idx, func) [idx] = (uint32_t)func
-uint32_t jumptable[IDX_LAST+1] __attribute__((section (".biosjumptable"))) =
-{
-       TABLE_ENTRY(IDX_TCPA_ACPI_INIT, tcpa_acpi_init),
-       TABLE_ENTRY(IDX_TCPA_EXTEND_ACPI_LOG, tcpa_extend_acpi_log),
 
-       TABLE_ENTRY(IDX_TCGINTERRUPTHANDLER, TCGInterruptHandler),
-
-       TABLE_ENTRY(IDX_TCPA_CALLING_INT19H, tcpa_calling_int19h),
-       TABLE_ENTRY(IDX_TCPA_RETURNED_INT19H, tcpa_returned_int19h),
-       TABLE_ENTRY(IDX_TCPA_ADD_EVENT_SEPARATORS, tcpa_add_event_separators),
-       TABLE_ENTRY(IDX_TCPA_WAKE_EVENT, tcpa_wake_event),
-       TABLE_ENTRY(IDX_TCPA_ADD_BOOTDEVICE, tcpa_add_bootdevice),
-       TABLE_ENTRY(IDX_TCPA_START_OPTION_ROM_SCAN, tcpa_start_option_rom_scan),
-       TABLE_ENTRY(IDX_TCPA_OPTION_ROM, tcpa_option_rom),
-       TABLE_ENTRY(IDX_TCPA_IPL, tcpa_ipl),
-       TABLE_ENTRY(IDX_TCPA_MEASURE_POST, tcpa_measure_post),
-
-       TABLE_ENTRY(IDX_TCPA_INITIALIZE_TPM, tcpa_initialize_tpm),
-
-       TABLE_ENTRY(IDX_GET_S3_WAKING_VECTOR, get_s3_waking_vector),
+#include "rombios_compat.h"
 
-       TABLE_ENTRY(IDX_LAST       , 0)     /* keep last */
-};
+asm (
+    "    .text                       \n"
+    "     movzwl %bx,%eax            \n"
+    "     jmp *jumptable(,%eax,4)    \n"
+    "    .data                       \n"
+    "jumptable:                      \n"
+#define X(idx, ret, fn, args...) " .long "#fn"\n"
+#include "32bitprotos.h"
+#undef X
+    );
index cdad7561b9c338e760a1dbef3009a46c759bb5e3..407faff9563954fb8a953b92933e45b42eace190 100644 (file)
@@ -1,24 +1,24 @@
 XEN_ROOT = ../../../..
 include $(XEN_ROOT)/tools/firmware/Rules.mk
 
-SOURCES = util.c
 TARGET = 32bitbios_flat.h
 
-CFLAGS += $(CFLAGS_include) -I.. -DGCC_PROTOS
+CFLAGS += $(CFLAGS_include) -I..
 
 SUBDIRS = tcgbios
 
-MODULES = tcgbios/tcgbiosext.o
-
 .PHONY: all
 all: subdirs-all
        $(MAKE) $(TARGET)
 
 .PHONY: clean
 clean: subdirs-clean
-       rm -rf *.o $(TARGET)
+       rm -rf *.o $(TARGET) $(DEPS)
+
+$(TARGET): 32bitbios_all.o
+       sh mkhex highbios_array 32bitbios_all.o > $@
 
-$(TARGET): 32bitbios.o $(MODULES) util.o
+32bitbios_all.o: 32bitbios.o tcgbios/tcgbiosext.o util.o pmm.o
        $(LD) $(LDFLAGS_DIRECT) -s -r $^ -o 32bitbios_all.o
        @nm 32bitbios_all.o |                                \
          egrep '^ +U ' >/dev/null && {                      \
@@ -26,4 +26,5 @@ $(TARGET): 32bitbios.o $(MODULES) util.o
            nm -u 32bitbios_all.o;                           \
            exit 11;                                         \
          } || :
-       sh mkhex highbios_array 32bitbios_all.o > $@
+
+-include $(DEPS)
diff --git a/tools/firmware/rombios/32bit/pmm.c b/tools/firmware/rombios/32bit/pmm.c
new file mode 100644 (file)
index 0000000..0e5c5b8
--- /dev/null
@@ -0,0 +1,534 @@
+/*
+ *  pmm.c - POST(Power On Self Test) Memory Manager
+ *  according to the specification described in
+ *  http://www.phoenix.com/NR/rdonlyres/873A00CF-33AC-4775-B77E-08E7B9754993/0/specspmm101.pdf
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ *  Copyright (C) 2009 FUJITSU LIMITED
+ *
+ *  Author: Kouya Shimura <kouya@jp.fujitsu.com>
+ */
+
+/*
+ * Algorithm:
+ *
+ * This is not a fast storage allocator but simple one.  There is no
+ * segregated management by block size and it does nothing special for
+ * avoiding the fragmentation.
+ *
+ * The allocation algorithm is a first-fit. All memory blocks are
+ * managed by linear single linked list in order of the address.
+ * (i.e. There is no backward pointer) It searches the first available
+ * equal or larger block from the head (lowest address) of memory
+ * heap. The larger block is split into two blocks unless one side
+ * becomes too small.
+ * 
+ * For de-allocation, the specified block is just marked as available
+ * and it does nothing else. Thus, the fragmentation will occur. The
+ * collection of contiguous available blocks is done on the search
+ * phase of another block allocation.
+ *
+ * The following is an abstract of this algorithm. The actual code
+ * looks complicated on account of alignment and checking the handle.
+ *
+ *     static memblk_t *
+ *     alloc(heap_t *heap, uint32_t size)
+ *     {
+ *         static memblk_t *mb;
+ *         for_each_memblk(heap, mb) // search memory blocks
+ *             if (memblk_is_avail(mb))
+ *             {
+ *                 collect_avail_memblks(heap, mb);
+ *                 if (size <= memblk_bufsize(mb))
+ *                 {
+ *                     split_memblk(mb, size);
+ *                     set_inuse(mb);
+ *                     return mb;
+ *                 }
+ *             }
+ *         return NULL;
+ *     }
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <../hvmloader/config.h>
+#include <../hvmloader/e820.h>
+#include "util.h"
+
+#define DEBUG_PMM 0
+
+#define __stringify(a) #a
+#define stringify(a) __stringify(a)
+
+#define ASSERT(_expr, _action)                                  \
+    if (!(_expr)) {                                             \
+        printf("ASSERTION FAIL: %s %s:%d %s()\n",               \
+               stringify(_expr), __FILE__, __LINE__, __func__); \
+        _action;                                                \
+    } else
+
+#if DEBUG_PMM
+# define PMM_DEBUG(format, p...) printf("PMM " format, ##p)
+#else
+# define PMM_DEBUG(format, p...)
+#endif
+
+struct pmmAllocArgs {
+    uint16_t function;
+    uint32_t length;
+    uint32_t handle;
+    uint16_t flags;
+} __attribute__ ((packed));
+
+struct pmmFindArgs {
+    uint16_t function;
+    uint32_t handle;
+} __attribute__ ((packed));
+
+struct pmmDeallocateArgs {
+    uint16_t function;
+    uint32_t buffer;
+} __attribute__ ((packed));
+
+#define PMM_FUNCTION_ALLOCATE   0
+#define PMM_FUNCTION_FIND       1         
+#define PMM_FUNCTION_DEALLOC    2
+
+#define PARAGRAPH_LENGTH        16  // unit of length
+
+#define PMM_HANDLE_ANONYMOUS    0xffffffff
+
+#define PMM_FLAGS_MEMORY_TYPE_MASK      0x0003
+#define PMM_FLAGS_MEMORY_INVALID        0
+#define PMM_FLAGS_MEMORY_CONVENTIONAL   1  // 0 to 1MB
+#define PMM_FLAGS_MEMORY_EXTENDED       2  // 1MB to 4GB
+#define PMM_FLAGS_MEMORY_ANY            3  // whichever is available
+#define PMM_FLAGS_ALIGINMENT            0x0004
+
+/* Error code */
+#define PMM_ENOMEM      (0)     // Out of memory, duplicate handle
+#define PMM_EINVAL      (-1)    // Invalid argument
+
+#define ALIGN_UP(addr, size)    (((addr)+((size)-1))&(~((size)-1)))
+#define ALIGN_DOWN(addr, size)  ((addr)&(~((size)-1)))
+
+typedef struct memblk {
+    uint32_t magic;      // inuse or available
+    struct memblk *next; // points the very next of this memblk
+    uint32_t handle;     // identifier of this block
+    uint32_t __fill;     // for 16byte alignment, not used
+    uint8_t buffer[0];
+} memblk_t;
+
+typedef struct heap {
+    memblk_t *head;     // start address of heap
+    memblk_t *end;      // end address of heap
+} heap_t;
+
+#define HEAP_NOT_INITIALIZED    (memblk_t *)-1
+#define HEAP_ALIGNMENT          16
+
+/*
+ * PMM handles two memory heaps, the caller chooses either.
+ *
+ * - conventional memory (below 1MB)
+ *    In HVM, the area is fixed. 0x00010000-0x0007FFFF
+ *    (from SCRATCH_PHYSICAL_ADDRESS to HYPERCALL_PHYSICAL_ADDRESS)
+ *
+ * - extended memory (start at 1MB, below 4GB)
+ *    In HVM, the area starts at memory address 0x00100000.
+ *    The end address is variable. We read low RAM address from e820 table.
+ *
+ * The following struct must be located in the data segment since bss
+ * in 32bitbios is not relocated.
+ */
+static struct {
+    heap_t heap;     // conventional memory
+    heap_t ext_heap; // extended memory
+} pmm_data = { {HEAP_NOT_INITIALIZED, NULL}, {NULL, NULL} };
+
+/* These values are private use, not a spec in PMM */
+#define MEMBLK_MAGIC_INUSE   0x2A4D4D50  // 'PMM*'
+#define MEMBLK_MAGIC_AVAIL   0x5F4D4D50  // 'PMM_'
+
+#define memblk_is_inuse(_mb)  ((_mb)->magic == MEMBLK_MAGIC_INUSE)
+#define memblk_is_avail(_mb)  ((_mb)->magic == MEMBLK_MAGIC_AVAIL)
+
+static void set_inuse(memblk_t *mb, uint32_t handle)
+{
+    mb->magic = MEMBLK_MAGIC_INUSE;
+    mb->handle = handle;
+}
+
+static void set_avail(memblk_t *mb)
+{
+    mb->magic = MEMBLK_MAGIC_AVAIL;
+    mb->handle = PMM_HANDLE_ANONYMOUS;
+}
+
+#define MEMBLK_HEADER_SIZE   ((int)(&((memblk_t *)0)->buffer))
+#define MIN_MEMBLK_SIZE      (MEMBLK_HEADER_SIZE + PARAGRAPH_LENGTH)
+
+#define memblk_size(_mb)     ((void *)((_mb)->next) - (void *)(_mb))
+#define memblk_buffer(_mb)   ((uint32_t)(&(_mb)->buffer))
+#define memblk_bufsize(_mb)  (memblk_size(_mb) - MEMBLK_HEADER_SIZE)
+
+#define buffer_memblk(_buf)  (memblk_t *)((_buf) - MEMBLK_HEADER_SIZE)
+
+#define memblk_loop_mbondition(_h, _mb) \
+    (((_mb) < (_h)->end) && (/* avoid infinite loop */ (_mb) < (_mb)->next))
+
+#define for_each_memblk(_h, _mb)                \
+    for ((_mb) = (_h)->head;                    \
+         memblk_loop_mbondition(_h, _mb);       \
+         (_mb) = (_mb)->next)
+
+#define for_remain_memblk(_h, _mb)              \
+    for (;                                      \
+         memblk_loop_mbondition(_h, _mb);       \
+         (_mb) = (_mb)->next)
+
+/*
+ *                                       <-size->
+ *    +==================+======+       +========+========+======+
+ *    |      avail       |      |       | avail  | avail  |      |
+ *    |      memblk      |memblk|...    | memblk | memblk |memblk|...
+ *    +==================+======+   =>  +========+========+======+
+ *    ^ |                ^ |    ^         |      ^ |      ^ |    ^
+ *    | |next            | |next|         |next  | |next  | |next|
+ *    | \________________/ \____/         \______/ \______/ \____/
+ *    |                                          ^
+ *    |                                          |
+ *    mb                                         +- sb(return value)
+ */
+static memblk_t *
+split_memblk(memblk_t *mb, uint32_t size)
+{
+    memblk_t *sb = (void *)memblk_buffer(mb) + size;
+
+    /* Only split if the remaining fragment is big enough. */
+    if ( (memblk_bufsize(mb) - size) < MIN_MEMBLK_SIZE)
+        return mb;
+
+    sb->next = mb->next;
+    set_avail(sb);
+
+    mb->next = sb;
+    return sb;
+}
+
+/*
+ *    +======+======+======+======+       +=================+======+
+ *    |avail |avail |avail |inuse |       |      avail      |inuse |   
+ *    |memblk|memblk|memblk|memblk|...    |      memblk     |memblk|...
+ *    +======+======+======+======+   =>  +=================+======+
+ *    ^ |    ^ |    ^ |    ^ |    ^         |               ^ |    ^
+ *    | |next| |next| |next| |next|         |next           | |next|
+ *    | \____/ \____/ \____/ \____/         \_______________/ \____/
+ *    |
+ *    mb
+ */
+static void
+collect_avail_memblks(heap_t *heap, memblk_t *mb)
+{
+    memblk_t *nb = mb->next;
+
+    for_remain_memblk ( heap, nb )
+        if ( memblk_is_inuse(nb) )
+            break;
+    mb->next = nb;
+}
+
+static void
+pmm_init_heap(heap_t *heap, uint32_t from_addr, uint32_t to_addr)
+{
+    memblk_t *mb = (memblk_t *)ALIGN_UP(from_addr, HEAP_ALIGNMENT);
+
+    mb->next = (memblk_t *)ALIGN_DOWN(to_addr, HEAP_ALIGNMENT);
+    set_avail(mb);
+
+    heap->head = mb;
+    heap->end = mb->next;
+}
+
+static void
+pmm_initalize(void)
+{
+    int i, e820_nr = *E820_NR;
+    struct e820entry *e820 = E820;
+
+    /* Extended memory: RAM below 4GB, 0x100000-0xXXXXXXXX */
+    for ( i = 0; i < e820_nr; i++ )
+    {
+        if ( (e820[i].type == E820_RAM) && (e820[i].addr >= 0x00100000) )
+        {
+            pmm_init_heap(&pmm_data.ext_heap, e820[i].addr, 
+                          e820[i].addr + e820[i].size);
+            break;
+        }
+    }
+
+    /* conventional memory: RAM below 1MB, 0x10000-0x7FFFF */
+    pmm_init_heap(&pmm_data.heap, SCRATCH_PHYSICAL_ADDRESS,
+                  HYPERCALL_PHYSICAL_ADDRESS);
+}
+
+static uint32_t
+pmm_max_avail_length(heap_t *heap)
+{
+    memblk_t *mb;
+    uint32_t size, max = 0;
+
+    for_each_memblk ( heap, mb )
+    {
+        if ( !memblk_is_avail(mb) )
+            continue;
+        collect_avail_memblks(heap, mb);
+        size = memblk_bufsize(mb);
+        if ( size > max )
+            max = size;
+    }
+
+    return (max / PARAGRAPH_LENGTH);
+}
+
+static memblk_t *
+first_fit(heap_t *heap, uint32_t size, uint32_t handle, uint32_t flags)
+{
+    memblk_t *mb;
+    int32_t align = 0;
+
+    if ( flags & PMM_FLAGS_ALIGINMENT )
+        align = ((size ^ (size - 1)) >> 1) + 1;
+
+    for_each_memblk ( heap, mb )
+    {
+        if ( memblk_is_avail(mb) )
+        {
+            collect_avail_memblks(heap, mb);
+
+            if ( align )
+            {
+                uint32_t addr = memblk_buffer(mb);
+                uint32_t offset = ALIGN_UP(addr, align) - addr;
+
+                if ( offset > 0 )
+                {
+                    ASSERT(offset >= MEMBLK_HEADER_SIZE, continue);
+
+                    if ( (offset + size) > memblk_bufsize(mb) )
+                        continue;
+
+                    mb = split_memblk(mb, offset - MEMBLK_HEADER_SIZE);
+                    return mb;
+                }
+            }
+
+            if ( size <= memblk_bufsize(mb) )
+                return mb;
+        }
+        else
+        {
+            ASSERT(memblk_is_inuse(mb), return NULL);
+
+            /* Duplication check for handle. */
+            if ( (handle != PMM_HANDLE_ANONYMOUS) && (mb->handle == handle) )
+                return NULL;
+        }
+    }
+
+    return NULL;
+}
+
+static memblk_t *
+pmm_find_handle(heap_t *heap, uint32_t handle)
+{
+    memblk_t *mb;
+
+    if ( handle == PMM_HANDLE_ANONYMOUS )
+        return NULL;
+
+    for_each_memblk ( heap, mb )
+        if ( mb->handle == handle )
+            return mb;
+
+    return NULL;
+}
+
+/*
+ * allocate a memory block of the specified type and size, and returns
+ * the address of the memory block.
+ *
+ * A client-specified identifier to be associated with the allocated
+ * memory block. A handle of 0xFFFFFFFF indicates that no identifier
+ * should be associated with the block. Such a memory block is known
+ * as an "anonymous" memory block and cannot be found using the
+ * pmmFind function. If a specified handle for a requested memory
+ * block is already used in a currently allocated memory block, the
+ * error value of 0x00000000 is returned
+ *
+ * If length is 0x00000000, no memory is allocated and the value
+ * returned is the size of the largest memory block available for the
+ * memory type specified in the flags parameter. The alignment bit in
+ * the flags register is ignored when calculating the largest memory
+ * block available.
+ *
+ * If a specified handle for a requested memory block is already used
+ * in a currently allocated memory block, the error value of
+ * 0x00000000 is returned.
+ * 
+ * A return value of 0x00000000 indicates that an error occurred and
+ * no memory has been allocated. 
+ */
+static uint32_t
+pmmAllocate(uint32_t length, uint32_t handle, uint16_t flags)
+{
+    heap_t *heap;
+    memblk_t *mb;
+    uint32_t size;
+
+    switch ( flags & PMM_FLAGS_MEMORY_TYPE_MASK )
+    {
+    case PMM_FLAGS_MEMORY_CONVENTIONAL:
+        heap = &pmm_data.heap;
+        break;
+
+    case PMM_FLAGS_MEMORY_EXTENDED:
+    case PMM_FLAGS_MEMORY_ANY: /* XXX: ignore conventional memory for now */
+        heap = &pmm_data.ext_heap;
+        break;
+
+    default:
+        return PMM_EINVAL;
+    }
+
+    /* return the largest memory block available */
+    if ( length == 0 )
+        return pmm_max_avail_length(heap);
+
+    size = length * PARAGRAPH_LENGTH;
+    mb = first_fit(heap, size, handle, flags);
+
+    if ( mb == NULL )
+        return PMM_ENOMEM;
+
+    /* duplication check for handle */
+    if ( handle != PMM_HANDLE_ANONYMOUS )
+    {
+        memblk_t *nb = mb->next;
+
+        for_remain_memblk(heap, nb)
+            if (nb->handle == handle)
+                return PMM_ENOMEM;
+    }
+
+    split_memblk(mb, size);
+    set_inuse(mb, handle);
+
+    return memblk_buffer(mb);
+}
+
+/*
+ * returns the address of the memory block associated with the
+ * specified handle.  
+ *
+ * A return value of 0x00000000 indicates that the handle does not
+ * correspond to a currently allocated memory block.
+ */
+static uint32_t
+pmmFind(uint32_t handle)
+{
+    memblk_t *mb;
+
+    if ( handle == PMM_HANDLE_ANONYMOUS )
+        return 0;
+
+    mb = pmm_find_handle(&pmm_data.heap, handle);
+    if ( mb == NULL )
+        mb = pmm_find_handle(&pmm_data.ext_heap, handle);
+
+    return mb ? memblk_buffer(mb) : 0;
+}
+
+/* 
+ * frees the specified memory block that was previously allocated by
+ * pmmAllocate.
+ *
+ * If the memory block was deallocated correctly, the return value is
+ * 0x00000000. If there was an error, the return value is non-zero.
+ */
+static uint32_t
+pmmDeallocate(uint32_t buffer)
+{
+    memblk_t *mb = buffer_memblk(buffer);
+
+    if ( !memblk_is_inuse(mb) )
+        return PMM_EINVAL;
+
+    set_avail(mb);
+    return 0;
+}
+
+
+union pmm_args {
+    uint16_t function;
+    struct pmmAllocArgs alloc;
+    struct pmmFindArgs find;
+    struct pmmDeallocateArgs dealloc;
+} __attribute__ ((packed));
+
+/*
+ * entry function of all PMM services.
+ *
+ * Values returned to the caller are placed in the DX:AX register
+ * pair. The flags and all registers, other than DX and AX, are
+ * preserved across calls to PMM services.
+ */
+uint32_t
+pmm(void *argp)
+{
+    union pmm_args *ap = argp;
+    uint32_t ret = PMM_EINVAL;
+
+    if ( pmm_data.heap.head == HEAP_NOT_INITIALIZED )
+        pmm_initalize();
+
+    switch ( ap->function )
+    {
+    case PMM_FUNCTION_ALLOCATE:
+        ret = pmmAllocate(ap->alloc.length, ap->alloc.handle, ap->alloc.flags);
+        PMM_DEBUG("Alloc length=%x handle=%x flags=%x ret=%x\n", 
+                  ap->alloc.length, ap->alloc.handle, ap->alloc.flags, ret);
+        break;
+
+    case PMM_FUNCTION_FIND:
+        ret = pmmFind(ap->find.handle);
+        PMM_DEBUG("Find handle=%x ret=%x\n", ap->find.handle, ret);
+        break;
+
+    case PMM_FUNCTION_DEALLOC:
+        ret = pmmDeallocate(ap->dealloc.buffer);
+        PMM_DEBUG("Dealloc buffer=%x ret=%x\n", ap->dealloc.buffer, ret);
+        break;
+
+    default:
+        PMM_DEBUG("Invalid function:%d\n", ap->function);
+        break;
+    }
+
+    return ret;
+}
index 2198645d1d585341811c25cb45cbb801d8671e50..f33e3e783f18db08e9df01948e6808cedddf0f44 100644 (file)
@@ -89,4 +89,8 @@ static inline void write_byte(Bit16u seg, Bit16u off, Bit8u val)
        *addr = val;
 }
 
+#define X(idx, ret, fn, args...) ret fn (args);
+#include "32bitprotos.h"
+#undef X
+
 #endif
index 1b3cf2b94a3496aead65fef88adea833042e5f81..8ce1d77e3520aebd460a154d66b94009164f907a 100644 (file)
@@ -2,17 +2,17 @@ XEN_ROOT = ../../../../..
 include $(XEN_ROOT)/tools/firmware/Rules.mk
 
 TARGET  = tcgbiosext.o
-FILES   = tcgbios tpm_drivers
-OBJECTS = $(foreach f,$(FILES),$(f).o)
 
-CFLAGS += $(CFLAGS_include) -I.. -I../.. -DGCC_PROTOS
-
-.PHONY: all clean
+CFLAGS += $(CFLAGS_include) -I.. -I../..
 
+.PHONY: all
 all: $(TARGET)
 
+.PHONY: clean
 clean:
-       rm -rf *.o $(TARGET)
+       rm -rf *.o $(TARGET) $(DEPS)
 
-$(TARGET): $(OBJECTS)
+$(TARGET): tcgbios.o tpm_drivers.o
        $(LD) $(LDFLAGS_DIRECT) -r $^ -o $@
+
+-include $(DEPS)
index b06af22f00616fdce09a4ed9be40a00865325961..d5e2202df4f11f8e74cdbdf21fe2266bedd23422 100644 (file)
@@ -26,7 +26,6 @@
 
 #include "util.h"
 #include "tcgbios.h"
-#include "32bitprotos.h"
 
 /* local structure and variables */
 struct ptti_cust {
@@ -259,6 +258,10 @@ uint8_t acpi_validate_entry(struct acpi_header *hdr)
 }
 
 
+/*
+   initialize the TCPA ACPI subsystem; find the ACPI tables and determine
+   where the TCPA table is.
+ */
 void tcpa_acpi_init(void)
 {
        struct acpi_20_rsdt *rsdt;
@@ -313,6 +316,16 @@ static void tcpa_reset_acpi_log(void)
 }
 
 
+/*
+ * Extend the ACPI log with the given entry by copying the
+ * entry data into the log.
+ * Input
+ *  Pointer to the structure to be copied into the log
+ *
+ * Output:
+ *  lower 16 bits of return code contain entry number
+ *  if entry number is '0', then upper 16 bits contain error code.
+ */
 uint32_t tcpa_extend_acpi_log(uint32_t entry_ptr)
 {
        uint32_t res = 0;
@@ -622,7 +635,8 @@ void tcpa_wake_event()
 }
 
 /*
- * add the boot device to the measurement log
+ * Add a measurement regarding the boot device (CDRom, Floppy, HDD) to
+ * the list of measurements.
  */
 void tcpa_add_bootdevice(uint32_t bootcd, uint32_t bootdrv)
 {
index 9592dfbca14ea6fc2692cdb86cc92dfb105c2ee2..8b3dffa591395a96f46b8cb59a6e5180d14f1180 100644 (file)
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  *
  * Copyright (C) IBM Corporation, 2006
+ * Copyright (c) 2008, Citrix Systems, Inc.
  *
  * Author: Stefan Berger <stefanb@us.ibm.com>
+ * Author: Keir Fraser <keir.fraser@citrix.com>
  */
 
 /*
  *  (4 bytes) even for uint16_t, so casting to 32bit from bcc is a good idea.
  */
 
-#define SEGMENT_OFFSET  0xf0000
-#define REAL_MODE_CODE_SEGMENT  0xf000
+/* At most 32 bytes in argument list to a 32-bit function. */
+#define MAX_ARG_BYTES 32
 
-#define START_PM_CODE  USE32
-#define END_PM_CODE    USE16
+#define REAL_MODE_CODE_OFFSET  0xf0000
 
-/* definition of used code/data segment descriptors */
-#define PM_NORMAL_CS (gdt_entry_pm_cs       - gdt_base)
+/* Definitions of code/data segment descriptors. */
+#define PM_32BIT_CS  (gdt_entry_pm_32bit_cs - gdt_base)
 #define PM_16BIT_CS  (gdt_entry_pm_16bit_cs - gdt_base)
 #define PM_32BIT_DS  (gdt_entry_pm_32bit_ds - gdt_base)
-
-  ASM_START
-
-    ; Switch into protected mode to allow access to 32 bit addresses.
-    ; This function allows switching into protected mode.
-    ; (the specs says big real mode, but that will not work)
-    ;
-    ; preserves all registers and prepares cs, ds, es, ss for usage
-    ; in protected mode; while in prot.mode interrupts remain disabled
-switch_to_protmode:
-    cli
-
-    ; have to fix the stack for proper return address in 32 bit mode
-    push WORD #(REAL_MODE_CODE_SEGMENT>>12)    ;extended return address
-    push bp                                    ;pop@A1
-    mov bp, sp
-    push eax                                   ;pop@A2
-    mov eax, 2[bp]                             ; fix return address
-    rol eax, #16
-    mov 2[bp], eax
-
-    mov eax, esp
-    ror eax, #16                               ; hi(esp)
-
-    push bx                                    ; preserve before function call
-    push cx
-    push dx
-
-    push ax                                    ; prepare stack for
-    push es                                    ; call
-    push ds
-    push cs
-    push ss
-    call _store_segment_registers
-    add sp, #10                                        ; pop ax,es-ss
-
-    pop dx                                     ; restore after function call
-    pop cx
-    pop bx
-
-    ; calculate protected-mode esp from ss:sp
-    and esp, #0xffff
-    xor eax, eax
-    mov ax, ss
-    rol eax, #4
-    add eax, esp
-    mov esp, eax
-
-    seg cs
-    lgdt my_gdtdesc                            ; switch to own table
-
-    mov eax, cr0
-    or al, #0x1                                ; protected mode 'on'
-    mov cr0, eax
-
-    jmpf DWORD (SEGMENT_OFFSET | switch_to_protmode_goon_1), #PM_NORMAL_CS
-
-    START_PM_CODE
-
-switch_to_protmode_goon_1:
-    mov ax, #PM_32BIT_DS                       ; 32 bit segment that allows
-    mov ds, ax                                 ; to reach all 32 bit
-    mov es, ax                                 ; addresses
-    mov ss, ax
-
-    pop eax                                    ;@A2
-    pop bp                                     ;@A1
-    ret
-
-    END_PM_CODE
-
-
+#define PM_16BIT_DS  (gdt_entry_pm_16bit_ds - gdt_base)
 
     .align 16
 gdt_base:
-    ; see Intel SW Dev. Manuals section 3.4.5, Volume 3 for meaning of bits
     .word 0,0
     .byte 0,0,0,0
-
-gdt_entry_pm_cs:
-    ; 32 bit code segment for protected mode
+gdt_entry_pm_32bit_cs:
     .word 0xffff, 0x0000
-    .byte 0x00, 0x9a, 0xcf, 0x00
-
+    .byte 0x00, 0x9b, 0xcf, 0x00
 gdt_entry_pm_16bit_cs:
-    ; temp. 16 bit code segment used while in protected mode
     .word 0xffff, 0x0000
-    .byte SEGMENT_OFFSET >> 16, 0x9a, 0x0, 0x0
-
+    .byte REAL_MODE_CODE_OFFSET >> 16, 0x9b, 0x0, 0x0
 gdt_entry_pm_32bit_ds:
-    ; (32 bit) data segment (r/w) reaching all possible areas in 32bit memory
-    ; 4kb granularity
     .word 0xffff, 0x0000
-    .byte 0x0, 0x92, 0xcf, 0x0
+    .byte 0x0, 0x93, 0xcf, 0x0
+gdt_entry_pm_16bit_ds:
+    .word 0xffff, 0x0000
+    .byte 0x0, 0x93, 0x0, 0x0
 gdt_entry_end:
 
-my_gdtdesc:
+protmode_gdtdesc:
     .word (gdt_entry_end - gdt_base) - 1
-    .long gdt_base | SEGMENT_OFFSET
-
+    .long gdt_base | REAL_MODE_CODE_OFFSET
 
-realmode_gdtdesc:                              ;to be used in real mode
+realmode_gdtdesc:
     .word 0xffff
     .long 0x0
 
+Upcall:
+    ; Do an upcall into 32 bit space
+    ;
+    ; Input:
+    ; bx: index of function to call
+    ; Output:
+    ; dx, ax: 32 bit result of call (even if 'void' is expected)
+
+    ; Save caller state, stack frame offsets listed below
+#define esp_off     0
+#define ss_off      4
+#define es_off      6
+#define ds_off      8
+#define flags_off   10
+#define retaddr_off 12
+#define args_off    14
+    pushf
+    cli
+    push ds
+    push es
+    push ss
+    push esp
 
+    ; Calculate protected-mode esp from ss:sp
+    and esp, #0xffff
+    xor eax, eax
+    mov ax, ss
+    shl eax, #4
+    add esp, eax
 
-switch_to_realmode:
-    ; Implementation of switching from protected mode to real mode
-    ; prepares cs, es, ds, ss to be used in real mode
-    ; spills   eax
-    START_PM_CODE
-
-    ; need to fix up the stack to return in 16 bit mode
-    ; currently the 32 bit return address is on the stack
-    pop eax
-    push ax
-
-    push bx                                    ;pop@1
-    push si                                    ;pop@2
-
-    call _ebda_ss_offset32                     ; get the offset of the ss
-    mov bx, ax                                 ; entry within the ebda.
-
-    jmpf switch_to_realmode_goon_1, #PM_16BIT_CS
-
-    END_PM_CODE
-
-switch_to_realmode_goon_1:
+    ; Switch to protected mode
+    seg cs
+    lgdt protmode_gdtdesc
     mov eax, cr0
-    and al, #0xfe                              ; protected mode 'off'
+    or al, #0x1  ; protected mode on
     mov cr0, eax
-
-    jmpf switch_to_realmode_goon_2, #REAL_MODE_CODE_SEGMENT
-
-switch_to_realmode_goon_2:
-
-    ; get orig. 'ss' without using the stack (no 'call'!)
-    xor eax, eax                       ; clear upper 16 bits (and lower)
-    mov ax, #0x40                      ; where is the ebda located?
+    jmpf DWORD (REAL_MODE_CODE_OFFSET|upcall1), #PM_32BIT_CS
+upcall1:
+    USE32
+    mov ax, #PM_32BIT_DS
     mov ds, ax
-    mov si, #0xe
-    seg ds
-    mov ax, [si]                       ; ax = segment of ebda
-
-    mov ds, ax                         ; segment of ebda
-    seg ds
-    mov ax, [bx]                       ; stack segment - bx has been set above
+    mov es, ax
     mov ss, ax
 
-    ; from esp and ss calculate real-mode sp
-    rol eax, #4
+    ; Marshal arguments and call 32-bit function
+    mov ecx, #MAX_ARG_BYTES/4
+upcall2:
+    push MAX_ARG_BYTES-4+args_off[esp]
+    loop upcall2
+    mov eax, [BIOS_INFO_PHYSICAL_ADDRESS + BIOSINFO_OFF_bios32_entry]
+    call eax
+    add esp, #MAX_ARG_BYTES
+    mov ecx, eax  ; Result in ecx
+
+    ; Restore real-mode stack pointer
+    xor eax, eax
+    mov ax, ss_off[esp]
+    mov bx, ax    ; Real-mode ss in bx
+    shl eax, 4
     sub esp, eax
 
-    push dx                            ;preserve before call(s)
-    push cx
-    push bx
-
-    call _get_register_ds              ; get orig. 'ds'
+    ; Return to real mode
+    jmpf upcall3, #PM_16BIT_CS
+upcall3:
+    USE16
+    mov ax, #PM_16BIT_DS
     mov ds, ax
-    call _get_register_es              ; get orig. 'es'
     mov es, ax
-    call _get_register_esp_hi          ; fix the upper 16 bits of esp
-    ror esp, #16
-    mov sp, ax
-    rol esp, #16
-
-    pop bx
-    pop cx
-    pop dx
-
+    mov ss, ax
+    mov eax, cr0
+    and al, #0xfe ; protected mode off
+    mov cr0, eax
+    jmpf upcall4, #REAL_MODE_CODE_OFFSET>>4
+upcall4:
     seg cs
     lgdt realmode_gdtdesc
 
-    sti                                                ; allow interrupts
-
-    pop si                                     ;@2
-    pop bx                                     ;@1
-
+    ; Restore real-mode ss
+    mov ss, bx
+
+    ; Convert result into dx:ax format
+    mov eax, ecx
+    ror eax, #16
+    mov dx, ax
+    ror eax, #16
+
+    ; Restore caller state and return
+    pop esp
+    pop bx ; skip ss
+    pop es
+    pop ds
+    popf
     ret
 
-    ASM_END
-
-/*
- * Helper function to get the offset of the reg_ss within the ebda struct
- * Only 'C' can tell the offset.
- */
-Bit16u
-ebda_ss_offset32()
-{
-    ASM_START
-    START_PM_CODE                              // need to have this
-    ASM_END                                    // compiled for protected mode
-    return &EbdaData->upcall.reg_ss;           // 'C' knows the offset!
-    ASM_START
-    END_PM_CODE
-    ASM_END
-}
-
-/*
- * Two often-used functions
- */
-Bit16u
-read_word_from_ebda(offset)
-    Bit16u offset;
-{
-       Bit16u ebda_seg = read_word(0x0040, 0x000E);
-       return read_word(ebda_seg, offset);
-}
-
-Bit32u
-read_dword_from_ebda(offset)
-    Bit16u offset;
-{
-       Bit16u ebda_seg = read_word(0x0040, 0x000E);
-       return read_dword(ebda_seg, offset);
-}
-
-/*
- * Store registers in the EBDA; used to keep the registers'
- * content in a well-defined place during protected mode execution
- */
-  void
-store_segment_registers(ss, cs, ds, es, esp_hi)
-  Bit16u ss, cs, ds, es, esp_hi;
-{
-       Bit16u ebda_seg = read_word(0x0040, 0x000E);
-       write_word(ebda_seg, &EbdaData->upcall.reg_ss, ss);
-       write_word(ebda_seg, &EbdaData->upcall.reg_cs, cs);
-       write_word(ebda_seg, &EbdaData->upcall.reg_ds, ds);
-       write_word(ebda_seg, &EbdaData->upcall.reg_es, es);
-       write_word(ebda_seg, &EbdaData->upcall.esp_hi, esp_hi);
-}
-
-
-  void
-store_returnaddress(retaddr)
-   Bit16u retaddr;
-{
-       Bit16u ebda_seg = read_word(0x0040, 0x000E);
-       write_word(ebda_seg, &EbdaData->upcall.retaddr, retaddr);
-}
-
-Bit16u
-get_returnaddress()
-{
-       return read_word_from_ebda(&EbdaData->upcall.retaddr);
-}
-
-/*
- * get the segment register 'cs' value from the EBDA
- */
-Bit16u
-get_register_cs()
-{
-       return read_word_from_ebda(&EbdaData->upcall.reg_cs);
-}
-
-/*
- * get the segment register 'ds' value from the EBDA
- */
-Bit16u
-get_register_ds()
-{
-       return read_word_from_ebda(&EbdaData->upcall.reg_ds);
-}
-
-/*
- * get the segment register 'es' value from the EBDA
- */
-Bit16u
-get_register_es()
-{
-       return read_word_from_ebda(&EbdaData->upcall.reg_es);
-}
-
-/*
- * get the upper 16 bits of the esp from the EBDA
- */
-Bit16u
-get_register_esp_hi()
-{
-       return read_word_from_ebda(&EbdaData->upcall.esp_hi);
-}
-
-
-
-/********************************************************/
-
-
-ASM_START
-
-Upcall:
-       ; do the upcall into 32 bit space
-       ; clear the stack frame so that 32 bit space sees all the parameters
-       ; on the stack as if they were prepared for it
-       ; ---> take the 16 bit return address off the stack and remember it
-       ;
-       ; Input:
-       ; bx: index of function to call
-       ; Ouput:
-       ; dx, ax: 32 bit result of call (even if 'void' is expected)
-
-       push bp                         ;pop @1
-       mov bp, sp
-       push si                         ;pop @2
-
-       mov ax, 2[bp]                   ; 16 bit return address
-       push ax
-       call _store_returnaddress       ; store away
-       pop ax
-
-       ; XXX GDT munging requires ROM to be writable!
-       call _enable_rom_write_access
-
-       rol bx, #2
-       mov si, #jmptable
-       seg cs
-       mov eax, dword ptr [si+bx]      ; address to call from table
-
-       pop si                          ;@2
-       pop bp                          ;@1
-
-       add sp, #2                      ; remove 16bit return address from stack
-
-       call switch_to_protmode
-       START_PM_CODE
-
-       call eax                        ; call 32bit function
-       push eax                        ; preserve result
-
-       call switch_to_realmode         ; back to realmode
-       END_PM_CODE
-
-       pop eax                         ; get result
-
-       push word 0x0000                ; placeholder for 16 bit return address
-       push bp
-       mov bp,sp
-       push eax                        ; preserve work register
-
-       call _disable_rom_write_access
-
-       call _get_returnaddress
-       mov 2[bp], ax                   ; 16bit return address onto stack
-
-       pop eax
-       pop bp
-
-       ror eax, #16                    ; result into dx/ax
-       mov dx, ax                      ; hi(res) -> dx
-       ror eax, #16
-
-       ret
-
-
-/* macro for functions to declare their call into 32bit space */
 MACRO DoUpcall
-       mov bx, #?1
-       jmp Upcall
+    mov bx, #?1
+    jmp Upcall
 MEND
 
-
-ASM_END
-
+#define X(idx, ret, fn, args...) _ ## fn: DoUpcall(idx)
 #include "32bitprotos.h"
-#include "32bitgateway.h"
-
-#include "tcgbios.c"
-
-Bit32u get_s3_waking_vector()
-{
-       ASM_START
-       DoUpcall(IDX_GET_S3_WAKING_VECTOR)
-       ASM_END
-}
+#undef X
index f0c401476a98e8894673f6ba44d6d95833d6f290..13d882840c383a1b257716617676e44afd1ebf71 100644 (file)
@@ -1,47 +1,16 @@
-#ifndef PROTOS_HIGHBIOS
-#define PROTOS_HIGHBIOS
-
-/* shared include file for bcc and gcc */
-
-/* bcc does not like 'enum' */
-#define IDX_TCGINTERRUPTHANDLER            0
-#define IDX_TCPA_ACPI_INIT                 1
-#define IDX_TCPA_EXTEND_ACPI_LOG           2
-#define IDX_TCPA_CALLING_INT19H            3
-#define IDX_TCPA_RETURNED_INT19H           4
-#define IDX_TCPA_ADD_EVENT_SEPARATORS      5
-#define IDX_TCPA_WAKE_EVENT                6
-#define IDX_TCPA_ADD_BOOTDEVICE            7
-#define IDX_TCPA_START_OPTION_ROM_SCAN     8
-#define IDX_TCPA_OPTION_ROM                9
-#define IDX_TCPA_IPL                       10
-#define IDX_TCPA_INITIALIZE_TPM            11
-#define IDX_TCPA_MEASURE_POST              12
-#define IDX_GET_S3_WAKING_VECTOR           13
-#define IDX_LAST                           14 /* keep last! */
-
-#ifdef GCC_PROTOS
-  #define PARMS(x...) x
-#else
-  /* bcc doesn't want any parameter types in prototypes */
-  #define PARMS(x...)
-#endif
-
-Bit32u TCGInterruptHandler( PARMS(pushad_regs_t *regs, Bit32u esds, Bit32u flags_ptr));
-
-void tcpa_acpi_init( PARMS(void) );
-Bit32u tcpa_extend_acpi_log( PARMS(Bit32u entry_ptr) );
-void tcpa_calling_int19h( PARMS(void) );
-void tcpa_returned_int19h( PARMS(void) );
-void tcpa_add_event_separators( PARMS(void) );
-void tcpa_wake_event( PARMS(void) );
-void tcpa_add_bootdevice( PARMS(Bit32u bootcd, Bit32u bootdrv) );
-void tcpa_start_option_rom_scan( PARMS(void) );
-void tcpa_option_rom( PARMS(Bit32u seg) );
-void tcpa_ipl( PARMS(Bit32u bootcd,Bit32u seg,Bit32u off,Bit32u count) );
-void tcpa_measure_post( PARMS(Bit32u from, Bit32u to) );
-Bit32u tcpa_initialize_tpm( PARMS(Bit32u physpres) );
-
-Bit32u get_s3_waking_vector( PARMS(void) );
-
-#endif
+X(0,  Bit32u, TCGInterruptHandler,
+  pushad_regs_t *regs, Bit32u esds, Bit32u flags_ptr)
+X(1,  void,   tcpa_acpi_init, void)
+X(2,  Bit32u, tcpa_extend_acpi_log, Bit32u entry_ptr)
+X(3,  void,   tcpa_calling_int19h,void)
+X(4,  void,   tcpa_returned_int19h, void)
+X(5,  void,   tcpa_add_event_separators, void)
+X(6,  void,   tcpa_wake_event, void)
+X(7,  void,   tcpa_add_bootdevice, Bit32u bootcd, Bit32u bootdrv)
+X(8,  void,   tcpa_start_option_rom_scan, void)
+X(9,  void,   tcpa_option_rom, Bit32u seg)
+X(10, void,   tcpa_ipl, Bit32u bootcd, Bit32u seg, Bit32u off, Bit32u count)
+X(11, void,   tcpa_measure_post, Bit32u from, Bit32u to)
+X(12, Bit32u, tcpa_initialize_tpm, Bit32u physpres)
+X(13, Bit32u, get_s3_waking_vector, void)
+X(14, Bit32u, pmm, void *argp)
index 8ea8cb756025454b985024459e7c87cfe8369116..8321eadd8519bb62b7eeb31875e61b6258b3d751 100644 (file)
@@ -13,6 +13,7 @@ clean: subdirs-clean
        rm -f  as86-sym.txt ld86-sym.txt 
        rm -f  rombios*.txt rombios*.sym usage biossums
        rm -f  BIOS-bochs-*
+       rm -f  $(DEPS)
 
 BIOS-bochs-latest: rombios.c biossums 32bitgateway.c tcgbios.c
        gcc -DBX_SMP_PROCESSORS=1 -E -P $< > _rombios_.c
@@ -27,3 +28,4 @@ BIOS-bochs-latest: rombios.c biossums 32bitgateway.c tcgbios.c
 biossums: biossums.c
        gcc -o biossums biossums.c
 
+-include $(DEPS)
index 547d5cff87ce4822256d64f8c60339b7f4f82def..0aea421e17a16f9e9a527e0c6ff756d9f9519f58 100644 (file)
@@ -1,5 +1,5 @@
 /////////////////////////////////////////////////////////////////////////
-// $Id: rombios.c,v 1.138 2005/05/07 15:55:26 vruppert Exp $
+// $Id: rombios.c,v 1.221 2008/12/07 17:32:29 sshwarts Exp $
 /////////////////////////////////////////////////////////////////////////
 //
 //  Copyright (C) 2002  MandrakeSoft S.A.
@@ -22,9 +22,9 @@
 //
 //  You should have received a copy of the GNU Lesser General Public
 //  License along with this library; if not, write to the Free Software
-//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
 
-// ROM BIOS for use with Bochs/Plex x86 emulation environment
+// ROM BIOS for use with Bochs/Plex86/QEMU emulation environment
 
 #define uint8_t unsigned char
 #define uint16_t unsigned short
 //
 // NOTES for El-Torito Boot (cbbochs@free.fr)
 //   - CD-ROM booting is only available if ATA/ATAPI Driver is available
-//   - Current code is only able to boot mono-session cds 
+//   - Current code is only able to boot mono-session cds
 //   - Current code can not boot and emulate a hard-disk
 //     the bios will panic otherwise
-//   - Current code also use memory in EBDA segement. 
+//   - Current code also use memory in EBDA segement.
 //   - I used cmos byte 0x3D to store extended information on boot-device
 //   - Code has to be modified modified to handle multiple cdrom drives
 //   - Here are the cdrom boot failure codes:
 //      12 : can not read cd - boot image
 //
 //   ATA driver
-//   - EBDA segment. 
+//   - EBDA segment.
 //     I used memory starting at 0x121 in the segment
 //   - the translation policy is defined in cmos regs 0x39 & 0x3a
 //
 // TODO :
 //
-//   int74 
+//   int74
 //     - needs to be reworked.  Uses direct [bp] offsets. (?)
 //
 //   int13:
 //   - Implement remaining int13_cdemu functions (as defined by El-Torito specs)
 //   - cdrom drive is hardcoded to ide 0 device 1 in several places. see "FIXME ElTorito Hardcoded"
 //   - int13 Fix DL when emulating a cd. In that case DL is decremented before calling real int13.
-//     This is ok. But DL should be reincremented afterwards. 
+//     This is ok. But DL should be reincremented afterwards.
 //   - Fix all "FIXME ElTorito Various"
 //   - should be able to boot any cdrom instead of the first one
 //
 //   BCC Bug: find a generic way to handle the bug of #asm after an "if"  (fixed in 0.16.7)
 
-#define DEBUG_ROMBIOS      0
+#include "rombios.h"
 
 #define DEBUG_ATA          0
 #define DEBUG_INT13_HD     0
 #define BX_USE_ATADRV    1
 #define BX_ELTORITO_BOOT 1
 
-#define BX_TCGBIOS       0              /* main switch for TCG BIOS ext. */
+#define BX_TCGBIOS       0   /* main switch for TCG BIOS ext. */
+
+#define BX_PMM           1   /* POST Memory Manager */
 
 #define BX_MAX_ATA_INTERFACES   4
 #define BX_MAX_ATA_DEVICES      (BX_MAX_ATA_INTERFACES*2)
 #  define BIOS_BUILD_DATE "06/23/99"
 #endif
 
+#define E820_SEG (Bit16u)(E820_PHYSICAL_ADDRESS >> 4)
+
   // 1K of base memory used for Extended Bios Data Area (EBDA)
   // EBDA is used for PS/2 mouse support, and IDE BIOS, etc.
 #define EBDA_SEG           0x9FC0
 #define EBDA_SIZE          1              // In KiB
 #define BASE_MEM_IN_K   (640 - EBDA_SIZE)
 
-  // Define the application NAME
-#ifdef HVMASSIST
-#  define BX_APPNAME "HVMAssist"
-#elif PLEX86
-#  define BX_APPNAME "Plex86"
-#else
-#  define BX_APPNAME "Bochs"
-#endif
+/* 256 bytes at 0x9ff00 -- 0x9ffff is used for the IPL boot table. */
+#define IPL_TABLE_OFFSET     0x0300  /* offset from EBDA */
+#define IPL_TABLE_ENTRIES    8
+#define IPL_COUNT_OFFSET     0x0380  /* u16: number of valid table entries */
+#define IPL_SEQUENCE_OFFSET  0x0382  /* u16: next boot device */
+#define IPL_BOOTFIRST_OFFSET 0x0384  /* u16: user selected device */
+#define IPL_SIZE             0xff
+#define IPL_TYPE_FLOPPY      0x01
+#define IPL_TYPE_HARDDISK    0x02
+#define IPL_TYPE_CDROM       0x03
+#define IPL_TYPE_BEV         0x80
+
 
   // Sanity Checks
 #if BX_USE_ATADRV && BX_CPU<3
 #    error APM BIOS can only be used with 386+ cpu
 #endif
 
-#ifndef BX_SMP_PROCESSORS
-#define BX_SMP_PROCESSORS 1
-#    warning BX_SMP_PROCESSORS not defined, defaulting to 1
-#endif
-  
-#define PANIC_PORT  0x400
-#define PANIC_PORT2 0x401
-#define INFO_PORT   0x402
-#define DEBUG_PORT  0x403
+// define this if you want to make PCIBIOS working on a specific bridge only
+// undef enables PCIBIOS when at least one PCI device is found
+// i440FX is emulated by Bochs and QEMU
+#define PCI_FIXED_HOST_BRIDGE 0x12378086 ;; i440FX PCI bridge
 
 // #20  is dec 20
 // #$20 is hex 20 = 32
@@ -250,7 +253,7 @@ use16 286
 
 MACRO HALT
   ;; the HALT macro is called with the line number of the HALT call.
-  ;; The line number is then sent to the PANIC_PORT, causing Bochs/Plex 
+  ;; The line number is then sent to the PANIC_PORT, causing Bochs/Plex
   ;; to print a BX_PANIC message.  This will normally halt the simulation
   ;; with a message such as "BIOS panic at rombios.c, line 4091".
   ;; However, users can choose to make panics non-fatal and continue.
@@ -289,9 +292,9 @@ typedef unsigned long  Bit32u;
   void memsetb(seg,offset,value,count);
   void memcpyb(dseg,doffset,sseg,soffset,count);
   void memcpyd(dseg,doffset,sseg,soffset,count);
-  
+
   // memset of count bytes
-    void 
+    void
   memsetb(seg,offset,value,count)
     Bit16u seg;
     Bit16u offset;
@@ -301,14 +304,14 @@ typedef unsigned long  Bit32u;
   ASM_START
     push bp
     mov  bp, sp
-  
+
       push ax
       push cx
       push es
       push di
-  
+
       mov  cx, 10[bp] ; count
-      cmp  cx, #0x00
+      test cx, cx
       je   memsetb_end
       mov  ax, 4[bp] ; segment
       mov  es, ax
@@ -318,19 +321,19 @@ typedef unsigned long  Bit32u;
       cld
       rep
        stosb
-  
+
   memsetb_end:
       pop di
       pop es
       pop cx
       pop ax
-  
+
     pop bp
   ASM_END
   }
-  
+
   // memcpy of count bytes
-    void 
+    void
   memcpyb(dseg,doffset,sseg,soffset,count)
     Bit16u dseg;
     Bit16u doffset;
@@ -341,16 +344,16 @@ typedef unsigned long  Bit32u;
   ASM_START
     push bp
     mov  bp, sp
-  
+
       push ax
       push cx
       push es
       push di
       push ds
       push si
-  
+
       mov  cx, 12[bp] ; count
-      cmp  cx, #0x0000
+      test cx, cx
       je   memcpyb_end
       mov  ax, 4[bp] ; dsegment
       mov  es, ax
@@ -363,7 +366,7 @@ typedef unsigned long  Bit32u;
       cld
       rep
        movsb
-  
+
   memcpyb_end:
       pop si
       pop ds
@@ -371,14 +374,13 @@ typedef unsigned long  Bit32u;
       pop es
       pop cx
       pop ax
-  
+
     pop bp
   ASM_END
   }
 
-#if 0 
   // memcpy of count dword
-    void 
+    void
   memcpyd(dseg,doffset,sseg,soffset,count)
     Bit16u dseg;
     Bit16u doffset;
@@ -389,16 +391,16 @@ typedef unsigned long  Bit32u;
   ASM_START
     push bp
     mov  bp, sp
-  
+
       push ax
       push cx
       push es
       push di
       push ds
       push si
-  
+
       mov  cx, 12[bp] ; count
-      cmp  cx, #0x0000
+      test cx, cx
       je   memcpyd_end
       mov  ax, 4[bp] ; dsegment
       mov  es, ax
@@ -411,7 +413,7 @@ typedef unsigned long  Bit32u;
       cld
       rep
        movsd
-  
+
   memcpyd_end:
       pop si
       pop ds
@@ -419,16 +421,15 @@ typedef unsigned long  Bit32u;
       pop es
       pop cx
       pop ax
-  
+
     pop bp
   ASM_END
   }
-#endif
 
   // read_dword and write_dword functions
   static Bit32u         read_dword();
   static void           write_dword();
-  
+
     Bit32u
   read_dword(seg, offset)
     Bit16u seg;
@@ -437,25 +438,24 @@ typedef unsigned long  Bit32u;
   ASM_START
     push bp
     mov  bp, sp
-  
+
       push bx
       push ds
       mov  ax, 4[bp] ; segment
       mov  ds, ax
       mov  bx, 6[bp] ; offset
       mov  ax, [bx]
-      inc  bx
-      inc  bx
+      add  bx, #2
       mov  dx, [bx]
       ;; ax = return value (word)
       ;; dx = return value (word)
       pop  ds
       pop  bx
-  
+
     pop  bp
   ASM_END
   }
-  
+
     void
   write_dword(seg, offset, data)
     Bit16u seg;
@@ -465,7 +465,7 @@ typedef unsigned long  Bit32u;
   ASM_START
     push bp
     mov  bp, sp
-  
+
       push ax
       push bx
       push ds
@@ -474,50 +474,49 @@ typedef unsigned long  Bit32u;
       mov  bx, 6[bp] ; offset
       mov  ax, 8[bp] ; data word
       mov  [bx], ax  ; write data word
-      inc  bx
-      inc  bx
+      add  bx, #2
       mov  ax, 10[bp] ; data word
       mov  [bx], ax  ; write data word
       pop  ds
       pop  bx
       pop  ax
-  
+
     pop  bp
   ASM_END
   }
-  
+
   // Bit32u (unsigned long) and long helper functions
   ASM_START
-  
+
   ;; and function
   landl:
   landul:
-    SEG SS 
+    SEG SS
       and ax,[di]
-    SEG SS 
+    SEG SS
       and bx,2[di]
     ret
-  
+
   ;; add function
   laddl:
   laddul:
-    SEG SS 
+    SEG SS
       add ax,[di]
-    SEG SS 
+    SEG SS
       adc bx,2[di]
     ret
-  
+
   ;; cmp function
   lcmpl:
   lcmpul:
     and eax, #0x0000FFFF
     shl ebx, #16
-    add eax, ebx
+    or  eax, ebx
     shr ebx, #16
     SEG SS
       cmp eax, dword ptr [di]
     ret
-  
+
   ;; sub function
   lsubl:
   lsubul:
@@ -526,26 +525,26 @@ typedef unsigned long  Bit32u;
     SEG SS
     sbb bx,2[di]
     ret
-  
+
   ;; mul function
   lmull:
   lmulul:
     and eax, #0x0000FFFF
     shl ebx, #16
-    add eax, ebx
+    or  eax, ebx
     SEG SS
     mul eax, dword ptr [di]
     mov ebx, eax
     shr ebx, #16
     ret
-  
+
   ;; dec function
   ldecl:
   ldecul:
     SEG SS
     dec dword ptr [bx]
     ret
-  
+
   ;; or function
   lorl:
   lorul:
@@ -554,31 +553,31 @@ typedef unsigned long  Bit32u;
     SEG SS
     or  bx,2[di]
     ret
-  
+
   ;; inc function
   lincl:
   lincul:
     SEG SS
     inc dword ptr [bx]
     ret
-  
+
   ;; tst function
   ltstl:
   ltstul:
     and eax, #0x0000FFFF
     shl ebx, #16
-    add eax, ebx
+    or  eax, ebx
     shr ebx, #16
     test eax, eax
     ret
-  
+
   ;; sr function
   lsrul:
     mov  cx,di
     jcxz lsr_exit
     and  eax, #0x0000FFFF
     shl  ebx, #16
-    add  eax, ebx
+    or   eax, ebx
   lsr_loop:
     shr  eax, #1
     loop lsr_loop
@@ -586,7 +585,7 @@ typedef unsigned long  Bit32u;
     shr  ebx, #16
   lsr_exit:
     ret
-  
+
   ;; sl function
   lsll:
   lslul:
@@ -594,15 +593,15 @@ typedef unsigned long  Bit32u;
     jcxz lsl_exit
     and  eax, #0x0000FFFF
     shl  ebx, #16
-    add  eax, ebx
-  lsl_loop: 
+    or   eax, ebx
+  lsl_loop:
     shl  eax, #1
     loop lsl_loop
     mov  ebx, eax
     shr  ebx, #16
   lsl_exit:
     ret
-  
+
   idiv_:
     cwd
     idiv bx
@@ -616,7 +615,7 @@ typedef unsigned long  Bit32u;
   ldivul:
     and  eax, #0x0000FFFF
     shl  ebx, #16
-    add  eax, ebx
+    or   eax, ebx
     xor  edx, edx
     SEG SS
     mov  bx,  2[di]
@@ -665,7 +664,7 @@ typedef struct {
     Bit8u  revision;
     Bit8u  checksum;
     } dpte_t;
+
   typedef struct {
     Bit8u  iface;        // ISA or PCI
     Bit16u iobase1;      // IO Base 1
@@ -678,15 +677,15 @@ typedef struct {
     Bit8u  device;       // Detected type of attached devices (hd/cd/none)
     Bit8u  removable;    // Removable device flag
     Bit8u  lock;         // Locks for removable devices
-    // Bit8u  lba_capable;  // LBA capable flag - always yes for bochs devices
-    Bit8u  mode;         // transfert mode : PIO 16/32 bits - IRQ - ISADMA - PCIDMA
+    Bit8u  mode;         // transfer mode : PIO 16/32 bits - IRQ - ISADMA - PCIDMA
     Bit16u blksize;      // block size
 
     Bit8u  translation;  // type of translation
     chs_t  lchs;         // Logical CHS
     chs_t  pchs;         // Physical CHS
 
-    Bit32u sectors;      // Total sectors count
+    Bit32u sectors_low;  // Total sectors count
+    Bit32u sectors_high;
     } ata_device_t;
 
   typedef struct {
@@ -697,10 +696,10 @@ typedef struct {
     ata_device_t  devices[BX_MAX_ATA_DEVICES];
     //
     // map between (bios hd id - 0x80) and ata channels
-    Bit8u  hdcount, hdidmap[BX_MAX_ATA_DEVICES];                
+    Bit8u  hdcount, hdidmap[BX_MAX_ATA_DEVICES];
 
     // map between (bios cd id - 0xE0) and ata channels
-    Bit8u  cdcount, cdidmap[BX_MAX_ATA_DEVICES];                
+    Bit8u  cdcount, cdidmap[BX_MAX_ATA_DEVICES];
 
     // Buffer for DPTE table
     dpte_t dpte;
@@ -710,9 +709,9 @@ typedef struct {
     Bit32u trsfbytes;
 
     } ata_t;
-  
+
 #if BX_ELTORITO_BOOT
-  // ElTorito Device Emulation data 
+  // ElTorito Device Emulation data
   typedef struct {
     Bit8u  active;
     Bit8u  media;
@@ -723,20 +722,22 @@ typedef struct {
     Bit16u buffer_segment;
     Bit16u load_segment;
     Bit16u sector_count;
-    
+
     // Virtual device
     chs_t  vdevice;
     } cdemu_t;
 #endif // BX_ELTORITO_BOOT
-  
-#include "32bitgateway.h"
+
+#define X(idx, ret, fn, arg...) ret fn ();
+#include "32bitprotos.h"
+#undef X
 
   // for access to EBDA area
-  //     The EBDA structure should conform to 
-  //     http://www.cybertrails.com/~fys/rombios.htm document
+  //     The EBDA structure should conform to
+  //     http://www.frontiernet.net/~fys/rombios.htm document
   //     I made the ata and cdemu structs begin at 0x121 in the EBDA seg
-  // EBDA must be at most 768 bytes; it lives at 0x9fc00, and the boot 
-  // device tables are at 0x9ff00 -- 0x9ffff
+  // EBDA must be at most 768 bytes; it lives at EBDA_SEG, and the boot
+  // device tables are at EBDA_SEG:IPL_TABLE_OFFSET
   typedef struct {
     unsigned char ebda_size;
     unsigned char cmos_shutdown_status;
@@ -755,10 +756,8 @@ typedef struct {
     // El Torito Emulation data
     cdemu_t cdemu;
 #endif // BX_ELTORITO_BOOT
-
-    upcall_t upcall;
     } ebda_data_t;
-  
+
   #define EBDA_CMOS_SHUTDOWN_STATUS_OFFSET 1
   #define EbdaData ((ebda_data_t *) 0)
 
@@ -772,7 +771,7 @@ typedef struct {
     Bit32u lba1;
     Bit32u lba2;
     } int13ext_t;
+
   #define Int13Ext ((int13ext_t *) 0)
 
   // Disk Physical Table definition
@@ -798,7 +797,7 @@ typedef struct {
     Bit8u   reserved3;
     Bit8u   checksum;
     } dpt_t;
+
   #define Int13DPT ((dpt_t *) 0)
 
 #endif // BX_USE_ATADRV
@@ -828,9 +827,9 @@ typedef struct {
     } r16;
   struct {
     Bit32u filler[4];
-    Bit8u  bl, bh; 
+    Bit8u  bl, bh;
     Bit16u filler1;
-    Bit8u  dl, dh; 
+    Bit8u  dl, dh;
     Bit16u filler2;
     Bit8u  cl, ch;
     Bit16u filler3;
@@ -864,6 +863,14 @@ typedef struct {
   flags_t flags;
   } iret_addr_t;
 
+typedef struct {
+  Bit16u type;
+  Bit16u flags;
+  Bit32u vector;
+  Bit32u description;
+  Bit32u reserved;
+  } ipl_entry_t;
+
 
 
 static Bit8u          inb();
@@ -880,7 +887,6 @@ static Bit16u         read_word();
 static void           write_byte();
 static void           write_word();
 static void           bios_printf();
-static void           copy_e820_table();
 
 static Bit8u          inhibit_mouse_int_and_events();
 static void           enable_mouse_int_and_events();
@@ -903,8 +909,6 @@ static void           int1a_function();
 static void           int70_function();
 static void           int74_function();
 static Bit16u         get_CS();
-//static Bit16u         get_DS();
-//static void           set_DS();
 static Bit16u         get_SS();
 static unsigned int   enqueue_key();
 static unsigned int   dequeue_key();
@@ -923,7 +927,10 @@ static void           keyboard_init();
 static void           keyboard_panic();
 static void           shutdown_status_panic();
 static void           nmi_handler_msg();
+static void           delay_ticks();
+static void           delay_ticks_and_check_for_keystroke();
 
+static void           interactive_bootkey();
 static void           print_bios_banner();
 static void           print_boot_device();
 static void           print_boot_failure();
@@ -957,33 +964,9 @@ Bit16u cdrom_boot();
 
 #endif // BX_ELTORITO_BOOT
 
-static char bios_cvs_version_string[] = "$Revision: 1.138 $";
-static char bios_date_string[] = "$Date: 2005/05/07 15:55:26 $";
-
-static char CVSID[] = "$Id: rombios.c,v 1.138 2005/05/07 15:55:26 vruppert Exp $";
-
-/* Offset to skip the CVS $Id: prefix */ 
-#define bios_version_string  (CVSID + 4)
+static char bios_cvs_version_string[] = "$Revision: 1.221 $ $Date: 2008/12/07 17:32:29 $";
 
-#define BIOS_PRINTF_HALT     1
-#define BIOS_PRINTF_SCREEN   2
-#define BIOS_PRINTF_INFO     4
-#define BIOS_PRINTF_DEBUG    8
-#define BIOS_PRINTF_ALL      (BIOS_PRINTF_SCREEN | BIOS_PRINTF_INFO)
-#define BIOS_PRINTF_DEBHALT  (BIOS_PRINTF_SCREEN | BIOS_PRINTF_INFO | BIOS_PRINTF_HALT)
-
-#define printf(format, p...)  bios_printf(BIOS_PRINTF_SCREEN, format, ##p)
-
-// Defines the output macros. 
-// BX_DEBUG goes to INFO port until we can easily choose debug info on a 
-// per-device basis. Debug info are sent only in debug mode
-#if DEBUG_ROMBIOS
-#  define BX_DEBUG(format, p...)  bios_printf(BIOS_PRINTF_INFO, format, ##p)    
-#else
-#  define BX_DEBUG(format, p...) 
-#endif
-#define BX_INFO(format, p...)   bios_printf(BIOS_PRINTF_INFO, format, ##p)
-#define BX_PANIC(format, p...)  bios_printf(BIOS_PRINTF_DEBHALT, format, ##p)
+#define BIOS_COPYRIGHT_STRING "(c) 2002 MandrakeSoft S.A. Written by Kevin Lawton & the Bochs team."
 
 #if DEBUG_ATA
 #  define BX_DEBUG_ATA(a...) BX_DEBUG(a)
@@ -1156,9 +1139,9 @@ static struct {
       { 0x5100, 0x5133, 0x7600,   none, 0x20 }, /* 3 PgDn */
       { 0x5200, 0x5230,   none,   none, 0x20 }, /* 0 Ins */
       { 0x5300, 0x532e,   none,   none, 0x20 }, /* Del */
-      {   none,   none,   none,   none, none }, /* ??? */
-      {   none,   none,   none,   none, none }, /* ??? */
-      {   none,   none,   none,   none, none }, /* ??? */
+      {   none,   none,   none,   none, none },
+      {   none,   none,   none,   none, none },
+      { 0x565c, 0x567c,   none,   none, none }, /* \| */
       { 0x8500, 0x8700, 0x8900, 0x8b00, none }, /* F11 */
       { 0x8600, 0x8800, 0x8a00, 0x8c00, none }, /* F12 */
       };
@@ -1415,31 +1398,6 @@ ASM_START
 ASM_END
 }
 
-//  Bit16u
-//get_DS()
-//{
-//ASM_START
-//  mov  ax, ds
-//ASM_END
-//}
-//
-//  void
-//set_DS(ds_selector)
-//  Bit16u ds_selector;
-//{
-//ASM_START
-//  push bp
-//  mov  bp, sp
-//
-//    push ax
-//    mov  ax, 4[bp] ; ds_selector
-//    mov  ds, ax
-//    pop  ax
-//
-//  pop  bp
-//ASM_END
-//}
-
   Bit16u
 get_SS()
 {
@@ -1450,47 +1408,24 @@ ASM_END
 
 #ifdef HVMASSIST
 void
-copy_e820_table()
+fixup_base_mem_in_k()
 {
-  Bit8u nr_entries = read_byte(0x9000, 0x1e8);
-  Bit32u base_mem;
-  if (nr_entries > 32)
-       nr_entries = 32;
-  write_word(0xe000, 0x8, nr_entries);
-  memcpyb(0xe000, 0x10, 0x9000, 0x2d0, nr_entries * 0x14);
   /* Report the proper base memory size at address 0x0413: otherwise
    * non-e820 code will clobber things if BASE_MEM_IN_K is bigger than
    * the first e820 entry.  Get the size by reading the second 64bit 
    * field of the first e820 slot. */ 
-  base_mem = read_dword(0x9000, 0x2d0 + 8);
+  Bit32u base_mem = read_dword(E820_SEG, E820_OFFSET + 8);
   write_word(0x40, 0x13, base_mem >> 10);
 }
 
-void
-set_rom_write_access(action)
-  Bit16u action;
-{
-    Bit16u off = (Bit16u)&((struct bios_info *)0)->xen_pfiob;
-ASM_START
-    mov si,.set_rom_write_access.off[bp]
-    push ds
-    mov ax,#(ACPI_PHYSICAL_ADDRESS >> 4)
-    mov ds,ax
-    mov dx,[si]
-    pop ds
-    mov ax,.set_rom_write_access.action[bp]
-    out dx,al
-ASM_END
-}
-
 void enable_rom_write_access()
 {
-    set_rom_write_access(0);
+    outb(XEN_PF_IOBASE, 0);
 }
 
 void disable_rom_write_access()
 {
-    set_rom_write_access(PFFLAG_ROM_LOCK);
+    outb(XEN_PF_IOBASE, PFFLAG_ROM_LOCK);
 }
     
 #endif /* HVMASSIST */
@@ -1563,7 +1498,7 @@ wrch(c)
   pop  bp
   ASM_END
 }
+
   void
 send(action, c)
   Bit16u action;
@@ -1619,14 +1554,121 @@ put_uint(action, val, width, neg)
   send(action, val - (nval * 10) + '0');
 }
 
+  void
+put_luint(action, val, width, neg)
+  Bit16u action;
+  unsigned long val;
+  short width;
+  bx_bool neg;
+{
+  unsigned long nval = val / 10;
+  if (nval)
+    put_luint(action, nval, width - 1, neg);
+  else {
+    while (--width > 0) send(action, ' ');
+    if (neg) send(action, '-');
+  }
+  send(action, val - (nval * 10) + '0');
+}
+
+void put_str(action, segment, offset)
+  Bit16u action;
+  Bit16u segment;
+  Bit16u offset;
+{
+  Bit8u c;
+
+  while (c = read_byte(segment, offset)) {
+    send(action, c);
+    offset++;
+  }
+}
+
+  void
+delay_ticks(ticks)
+  Bit16u ticks;
+{
+  long ticks_to_wait, delta;
+  Bit32u prev_ticks, t;
+
+   /*
+    * The 0:046c wraps around at 'midnight' according to a 18.2Hz clock.
+    * We also have to be careful about interrupt storms.
+    */
+ASM_START
+  pushf
+  sti
+ASM_END
+  ticks_to_wait = ticks;
+  prev_ticks = read_dword(0x0, 0x46c);
+  do
+  {
+ASM_START
+    hlt
+ASM_END
+    t = read_dword(0x0, 0x46c);
+    if (t > prev_ticks)
+    {
+      delta = t - prev_ticks;     /* The temp var is required or bcc screws up. */
+      ticks_to_wait -= delta;
+    }
+    else if (t < prev_ticks)
+    {
+      ticks_to_wait -= t;         /* wrapped */
+    }
+
+    prev_ticks = t;
+  } while (ticks_to_wait > 0);
+ASM_START
+  cli
+  popf
+ASM_END
+}
+
+  Bit8u
+check_for_keystroke()
+{
+ASM_START
+  mov  ax, #0x100
+  int  #0x16
+  jz   no_key
+  mov  al, #1
+  jmp  done
+no_key:
+  xor  al, al
+done:
+ASM_END
+}
+
+  Bit8u
+get_keystroke()
+{
+ASM_START
+  mov  ax, #0x0
+  int  #0x16
+  xchg ah, al
+ASM_END
+}
+
+  void
+delay_ticks_and_check_for_keystroke(ticks, count)
+  Bit16u ticks, count;
+{
+  Bit16u i;
+  for (i = 1; i <= count; i++) {
+    delay_ticks(ticks);
+    if (check_for_keystroke())
+      break;
+  }
+}
+
 //--------------------------------------------------------------------------
 // bios_printf()
-//   A compact variable argument printf function which prints its output via
-//   an I/O port so that it can be logged by Bochs/Plex.  
-//   Currently, only %x is supported (or %02x, %04x, etc).
+//   A compact variable argument printf function.
 //
-//   Supports %[format_width][format]
-//   where format can be d,x,c,s
+//   Supports %[format_width][length]format
+//   where format can be x,X,u,d,s,S,c
+//   and the optional length modifier is l (ell)
 //--------------------------------------------------------------------------
   void
 bios_printf(action, s)
@@ -1637,7 +1679,7 @@ bios_printf(action, s)
   bx_bool  in_format;
   short i;
   Bit16u  *arg_ptr;
-  Bit16u   arg_seg, arg, nibble, shift_count, format_width;
+  Bit16u   arg_seg, arg, nibble, hibyte, shift_count, format_width, hexadd;
 
   arg_ptr = &s;
   arg_seg = get_SS();
@@ -1664,17 +1706,49 @@ bios_printf(action, s)
       else {
         arg_ptr++; // increment to next arg
         arg = read_word(arg_seg, arg_ptr);
-        if (c == 'x') {
+        if (c == 'x' || c == 'X') {
           if (format_width == 0)
             format_width = 4;
+          if (c == 'x')
+            hexadd = 'a';
+          else
+            hexadd = 'A';
           for (i=format_width-1; i>=0; i--) {
             nibble = (arg >> (4 * i)) & 0x000f;
-            send (action, (nibble<=9)? (nibble+'0') : (nibble-10+'A'));
+            send (action, (nibble<=9)? (nibble+'0') : (nibble-10+hexadd));
             }
           }
         else if (c == 'u') {
           put_uint(action, arg, format_width, 0);
           }
+        else if (c == 'l') {
+          s++;
+          c = read_byte(get_CS(), s); /* is it ld,lx,lu? */
+          arg_ptr++; /* increment to next arg */
+          hibyte = read_word(arg_seg, arg_ptr);
+          if (c == 'd') {
+            if (hibyte & 0x8000)
+              put_luint(action, 0L-(((Bit32u) hibyte << 16) | arg), format_width-1, 1);
+            else
+              put_luint(action, ((Bit32u) hibyte << 16) | arg, format_width, 0);
+           }
+          else if (c == 'u') {
+            put_luint(action, ((Bit32u) hibyte << 16) | arg, format_width, 0);
+           }
+          else if (c == 'x' || c == 'X')
+           {
+            if (format_width == 0)
+              format_width = 8;
+            if (c == 'x')
+              hexadd = 'a';
+            else
+              hexadd = 'A';
+            for (i=format_width-1; i>=0; i--) {
+              nibble = ((((Bit32u) hibyte <<16) | arg) >> (4 * i)) & 0x000f;
+              send (action, (nibble<=9)? (nibble+'0') : (nibble-10+hexadd));
+              }
+           }
+          }
         else if (c == 'd') {
           if (arg & 0x8000)
             put_int(action, -arg, format_width - 1, 1);
@@ -1682,7 +1756,13 @@ bios_printf(action, s)
             put_int(action, arg, format_width, 0);
           }
         else if (c == 's') {
-          bios_printf(action & (~BIOS_PRINTF_HALT), arg);
+          put_str(action, get_CS(), arg);
+          }
+        else if (c == 'S') {
+          hibyte = arg;
+          arg_ptr++;
+          arg = read_word(arg_seg, arg_ptr);
+          put_str(action, hibyte, arg);
           }
         else if (c == 'c') {
           send(action, arg);
@@ -1699,7 +1779,7 @@ bios_printf(action, s)
     }
 
   if (action & BIOS_PRINTF_HALT) {
-    // freeze in a busy loop.  
+    // freeze in a busy loop.
 ASM_START
     cli
  halt2_loop:
@@ -1733,8 +1813,8 @@ keyboard_init()
             max = 0x2000;
             }
         }
-  
-    // Due to timer issues, and if the IPS setting is > 15000000, 
+
+    // Due to timer issues, and if the IPS setting is > 15000000,
     // the incoming keys might not be flushed here. That will
     // cause a panic a few lines below.  See sourceforge bug report :
     // [ 642031 ] FATAL: Keyboard RESET error:993
@@ -1871,13 +1951,12 @@ keyboard_init()
 keyboard_panic(status)
   Bit16u status;
 {
-  // If you're getting a 993 keyboard panic here, 
+  // If you're getting a 993 keyboard panic here,
   // please see the comment in keyboard_init
-  
+
   BX_PANIC("Keyboard error:%u\n",status);
 }
 
-
 #define CMOS_SHUTDOWN_S3 0xFE
 //--------------------------------------------------------------------------
 // machine_reset
@@ -1932,6 +2011,11 @@ shutdown_status_panic(status)
   BX_PANIC("Unimplemented shutdown status: %02x\n",(Bit8u)status);
 }
 
+void s3_resume_panic()
+{
+  BX_PANIC("Returned from s3_resume.\n");
+}
+
 //--------------------------------------------------------------------------
 // print_bios_banner
 //   displays a the bios version
@@ -1939,108 +2023,198 @@ shutdown_status_panic(status)
 void
 print_bios_banner()
 {
-  printf(BX_APPNAME" BIOS, %d cpu%s, ", BX_SMP_PROCESSORS, BX_SMP_PROCESSORS>1?"s":"");
-  printf("%s %s\n", bios_cvs_version_string, bios_date_string);
+  printf(BX_APPNAME" BIOS - build: %s\n%s\nOptions: ",
+    BIOS_BUILD_DATE, bios_cvs_version_string);
+  printf(
+#if BX_APM
+  "apmbios "
+#endif
+#if BX_PCIBIOS
+  "pcibios "
+#endif
+#if BX_ELTORITO_BOOT
+  "eltorito "
+#endif
+#if BX_ROMBIOS32
+  "rombios32 "
+#endif
 #if BX_TCGBIOS
-  printf("TCG-enabled BIOS.\n");
+  "TCG-enabled "
 #endif
-  printf("\n");
+#if BX_PMM
+  "PMM "
+#endif
+  "\n\n");
 }
 
-
 //--------------------------------------------------------------------------
 // BIOS Boot Specification 1.0.1 compatibility
 //
-// Very basic support for the BIOS Boot Specification, which allows expansion 
-// ROMs to register themselves as boot devices, instead of just stealing the 
+// Very basic support for the BIOS Boot Specification, which allows expansion
+// ROMs to register themselves as boot devices, instead of just stealing the
 // INT 19h boot vector.
-// 
+//
 // This is a hack: to do it properly requires a proper PnP BIOS and we aren't
-// one; we just lie to the option ROMs to make them behave correctly. 
-// We also don't support letting option ROMs register as bootable disk 
-// drives (BCVs), only as bootable devices (BEVs). 
+// one; we just lie to the option ROMs to make them behave correctly.
+// We also don't support letting option ROMs register as bootable disk
+// drives (BCVs), only as bootable devices (BEVs).
 //
 // http://www.phoenix.com/en/Customer+Services/White+Papers-Specs/pc+industry+specifications.htm
 //--------------------------------------------------------------------------
 
-/* 256 bytes at 0x9ff00 -- 0x9ffff is used for the IPL boot table. */
-#define IPL_SEG              0x9ff0
-#define IPL_TABLE_OFFSET     0x0000
-#define IPL_TABLE_ENTRIES    8
-#define IPL_COUNT_OFFSET     0x0080  /* u16: number of valid table entries */
-#define IPL_SEQUENCE_OFFSET  0x0082  /* u16: next boot device */
-
-struct ipl_entry {
-  Bit16u type;
-  Bit16u flags;
-  Bit32u vector;
-  Bit32u description;
-  Bit32u reserved;
-};
+static char drivetypes[][10]={"", "Floppy","Hard Disk","CD-Rom", "Network"};
 
-static void 
-init_boot_vectors() 
+static void
+init_boot_vectors()
 {
-  struct ipl_entry e; 
+  ipl_entry_t e;
   Bit16u count = 0;
   Bit16u ss = get_SS();
+  Bit16u ebda_seg = read_word(0x0040, 0x000E);
 
   /* Clear out the IPL table. */
-  memsetb(IPL_SEG, IPL_TABLE_OFFSET, 0, 0xff);
+  memsetb(ebda_seg, IPL_TABLE_OFFSET, 0, IPL_SIZE);
+
+  /* User selected device not set */
+  write_word(ebda_seg, IPL_BOOTFIRST_OFFSET, 0xFFFF);
 
   /* Floppy drive */
-  e.type = 1; e.flags = 0; e.vector = 0; e.description = 0; e.reserved = 0;
-  memcpyb(IPL_SEG, IPL_TABLE_OFFSET + count * sizeof (e), ss, &e, sizeof (e));
+  e.type = IPL_TYPE_FLOPPY; e.flags = 0; e.vector = 0; e.description = 0; e.reserved = 0;
+  memcpyb(ebda_seg, IPL_TABLE_OFFSET + count * sizeof (e), ss, &e, sizeof (e));
   count++;
 
   /* First HDD */
-  e.type = 2; e.flags = 0; e.vector = 0; e.description = 0; e.reserved = 0;
-  memcpyb(IPL_SEG, IPL_TABLE_OFFSET + count * sizeof (e), ss, &e, sizeof (e));
+  e.type = IPL_TYPE_HARDDISK; e.flags = 0; e.vector = 0; e.description = 0; e.reserved = 0;
+  memcpyb(ebda_seg, IPL_TABLE_OFFSET + count * sizeof (e), ss, &e, sizeof (e));
   count++;
 
 #if BX_ELTORITO_BOOT
   /* CDROM */
-  e.type = 3; e.flags = 0; e.vector = 0; e.description = 0; e.reserved = 0;
-  memcpyb(IPL_SEG, IPL_TABLE_OFFSET + count * sizeof (e), ss, &e, sizeof (e));
+  e.type = IPL_TYPE_CDROM; e.flags = 0; e.vector = 0; e.description = 0; e.reserved = 0;
+  memcpyb(ebda_seg, IPL_TABLE_OFFSET + count * sizeof (e), ss, &e, sizeof (e));
   count++;
-#endif  
+#endif
 
   /* Remember how many devices we have */
-  write_word(IPL_SEG, IPL_COUNT_OFFSET, count);
+  write_word(ebda_seg, IPL_COUNT_OFFSET, count);
   /* Not tried booting anything yet */
-  write_word(IPL_SEG, IPL_SEQUENCE_OFFSET, 0xffff);
+  write_word(ebda_seg, IPL_SEQUENCE_OFFSET, 0xffff);
 }
 
 static Bit8u
 get_boot_vector(i, e)
-Bit16u i; struct ipl_entry *e; 
+Bit16u i; ipl_entry_t *e;
 {
   Bit16u count;
   Bit16u ss = get_SS();
+  Bit16u ebda_seg = read_word(0x0040, 0x000E);
   /* Get the count of boot devices, and refuse to overrun the array */
-  count = read_word(IPL_SEG, IPL_COUNT_OFFSET);
+  count = read_word(ebda_seg, IPL_COUNT_OFFSET);
   if (i >= count) return 0;
   /* OK to read this device */
-  memcpyb(ss, e, IPL_SEG, IPL_TABLE_OFFSET + i * sizeof (*e), sizeof (*e));
+  memcpyb(ss, e, ebda_seg, IPL_TABLE_OFFSET + i * sizeof (*e), sizeof (*e));
   return 1;
 }
 
+#if BX_ELTORITO_BOOT
+  void
+interactive_bootkey()
+{
+  ipl_entry_t e;
+  Bit16u count;
+  char description[33];
+  Bit8u scan_code;
+  Bit8u i;
+  Bit16u ss = get_SS();
+  Bit16u valid_choice = 0;
+  Bit16u ebda_seg = read_word(0x0040, 0x000E);
+
+  printf("\n\nPress F12 for boot menu.\n\n");
+
+  while (check_for_keystroke())
+  {
+    scan_code = get_keystroke();
+    if (scan_code != 0x86) /* F12 */
+      continue;
+
+    while (check_for_keystroke())
+      get_keystroke();
+
+    printf("Select boot device:\n\n");
+
+    count = read_word(ebda_seg, IPL_COUNT_OFFSET);
+    for (i = 0; i < count; i++)
+    {
+      memcpyb(ss, &e, ebda_seg, IPL_TABLE_OFFSET + i * sizeof (e), sizeof (e));
+      printf("%d. ", i+1);
+      switch(e.type)
+      {
+        case IPL_TYPE_FLOPPY:
+        case IPL_TYPE_HARDDISK:
+        case IPL_TYPE_CDROM:
+          printf("%s\n", drivetypes[e.type]);
+          break;
+        case IPL_TYPE_BEV:
+          printf("%s", drivetypes[4]);
+          if (e.description != 0)
+          {
+            memcpyb(ss, &description, (Bit16u)(e.description >> 16), (Bit16u)(e.description & 0xffff), 32);
+            description[32] = 0;
+            printf(" [%S]", ss, description);
+         }
+         printf("\n");
+         break;
+      }
+    }
+
+    count++;
+    while (!valid_choice) {
+      scan_code = get_keystroke();
+      if (scan_code == 0x01 || scan_code == 0x58) /* ESC or F12 */
+      {
+        valid_choice = 1;
+      }
+      else if (scan_code <= count)
+      {
+        valid_choice = 1;
+        scan_code -= 1;
+        /* Set user selected device */
+        write_word(ebda_seg, IPL_BOOTFIRST_OFFSET, scan_code);
+      }
+    }
+
+    printf("\n");
+    break;
+  }
+}
+#endif // BX_ELTORITO_BOOT
 
 //--------------------------------------------------------------------------
 // print_boot_device
 //   displays the boot device
 //--------------------------------------------------------------------------
 
-static char drivetypes[][10]={"", "Floppy","Hard Disk","CD-Rom", "Network"};
-
 void
-print_boot_device(type)
-  Bit16u type;
+print_boot_device(e)
+  ipl_entry_t *e;
 {
-  /* NIC appears as type 0x80 */ 
-  if (type == 0x80 ) type = 0x4;
-  if (type == 0 || type > 0x4) BX_PANIC("Bad drive type\n"); 
-  printf("Booting from %s...\n", drivetypes[type]);
+  Bit16u type;
+  char description[33];
+  Bit16u ss = get_SS();
+  type = e->type;
+  /* NIC appears as type 0x80 */
+  if (type == IPL_TYPE_BEV) type = 0x4;
+  if (type == 0 || type > 0x4) BX_PANIC("Bad drive type\n");
+  printf("Booting from %s", drivetypes[type]);
+  /* print product string if BEV */
+  if (type == 4 && e->description != 0) {
+    /* first 32 bytes are significant */
+    memcpyb(ss, &description, (Bit16u)(e->description >> 16), (Bit16u)(e->description & 0xffff), 32);
+    /* terminate string */
+    description[32] = 0;
+    printf(" [%S]", ss, description);
+  }
+  printf("...\n");
 }
 
 //--------------------------------------------------------------------------
@@ -2051,17 +2225,17 @@ print_boot_device(type)
 print_boot_failure(type, reason)
   Bit16u type; Bit8u reason;
 {
-  if (type == 0 || type > 0x3) BX_PANIC("Bad drive type\n"); 
+  if (type == 0 || type > 0x3) BX_PANIC("Bad drive type\n");
 
   printf("Boot from %s failed", drivetypes[type]);
   if (type < 4) {
     /* Report the reason too */
-  if (reason==0) 
-    printf(": not a bootable disk");
-  else
-    printf(": could not read the boot disk");
+    if (reason==0)
+      printf(": not a bootable disk");
+    else
+      printf(": could not read the boot disk");
   }
-  printf("\n");
+  printf("\n\n");
 }
 
 //--------------------------------------------------------------------------
@@ -2073,238 +2247,29 @@ print_cdromboot_failure( code )
   Bit16u code;
 {
   bios_printf(BIOS_PRINTF_SCREEN | BIOS_PRINTF_INFO, "CDROM boot failure code : %04x\n",code);
-  
+
   return;
 }
 
-#define WAIT_HZ 18
-/**
- * Check for keystroke.
- * @returns    True if keystroke available, False if not.
- */
-Bit8u check_for_keystroke()
+void
+nmi_handler_msg()
 {
-ASM_START
-    mov  ax, #0x100
-    int  #0x16
-    jz   no_key
-    mov  al, #1
-    jmp  done
-no_key:
-    xor  al, al
-done:
-ASM_END
+  BX_PANIC("NMI Handler called\n");
 }
 
-/**
- * Get keystroke.
- * @returns    BIOS scan code.
- */
-Bit8u get_keystroke()
+void
+int18_panic_msg()
 {
-ASM_START
-    mov  ax, #0x0
-    int  #0x16
-    xchg ah, al
-ASM_END
+  BX_PANIC("INT18: BOOT FAILURE\n");
 }
 
-/**
- * Waits (sleeps) for the given number of ticks.
- * Checks for keystroke.
- *
- * @returns BIOS scan code if available, 0 if not.
- * @param   ticks       Number of ticks to sleep.
- * @param   stop_on_key Whether to stop immediately upon keypress.
- */
-Bit8u wait(ticks, stop_on_key)
-  Bit16u ticks;
-  Bit8u stop_on_key;
+void
+log_bios_start()
 {
-    long ticks_to_wait, delta;
-    Bit32u prev_ticks, t;
-    Bit8u scan_code = 0;
-
-    /*
-     * The 0:046c wraps around at 'midnight' according to a 18.2Hz clock.
-     * We also have to be careful about interrupt storms.
-     */
-    ticks_to_wait = ticks;
-    prev_ticks = read_dword(0x0, 0x46c);
-    do
-    {
-        t = read_dword(0x0, 0x46c);
-        if (t > prev_ticks)
-        {
-            delta = t - prev_ticks;     /* The temp var is required or bcc screws up. */
-            ticks_to_wait -= delta;
-        }
-        else if (t < prev_ticks)
-            ticks_to_wait -= t;         /* wrapped */
-        prev_ticks = t;
-
-        if (check_for_keystroke())
-        {
-            scan_code = get_keystroke();
-            bios_printf(BIOS_PRINTF_DEBUG, "Key pressed: %x\n", scan_code);
-            if (stop_on_key)
-                return scan_code;
-        }
-    } while (ticks_to_wait > 0);
-    return scan_code;
-}
-
-static void clearscreen() {
-    /* Hide cursor, clear screen and move cursor to starting position */
-ASM_START
-        push bx
-        push cx
-        push dx
-
-        mov  ax, #0x100
-        mov  cx, #0x1000
-        int  #0x10
-
-        mov  ax, #0x700
-        mov  bh, #7
-        xor  cx, cx
-        mov  dx, #0x184f
-        int  #0x10
-
-        mov  ax, #0x200
-        xor  bx, bx
-        xor  dx, dx
-        int  #0x10
-
-        pop  dx
-        pop  cx
-        pop  bx
-ASM_END
-}
-
-int bootmenu(selected)
-  int selected;
-{
-    Bit8u scode;
-    int max;
-
-    /* get the number of boot devices */
-    max = read_word(IPL_SEG, IPL_COUNT_OFFSET);
-
-    for(;;) {
-        if (selected > max || selected < 1) selected = 1;
-        clearscreen();
-        bios_printf(BIOS_PRINTF_SCREEN | BIOS_PRINTF_INFO, "\n\n\n\n\n\n\n");
-        bios_printf(BIOS_PRINTF_SCREEN | BIOS_PRINTF_INFO, "          Select boot device\n\n");
-        bios_printf(BIOS_PRINTF_SCREEN | BIOS_PRINTF_INFO, "            1. Floppy\n");
-        bios_printf(BIOS_PRINTF_SCREEN | BIOS_PRINTF_INFO, "            2. Hard drive\n");
-        bios_printf(BIOS_PRINTF_SCREEN | BIOS_PRINTF_INFO, "            3. CD-ROM\n");
-        if (max == 4)
-            bios_printf(BIOS_PRINTF_SCREEN | BIOS_PRINTF_INFO, "            4. Network\n");
-        bios_printf(BIOS_PRINTF_SCREEN | BIOS_PRINTF_INFO, "\n\n          Currently selected: %d\n", selected);
-
-        do {
-            scode = wait(WAIT_HZ, 1);
-        } while (scode == 0);
-        switch(scode) {
-        case 0x02:
-        case 0x03:
-        case 0x04:
-            selected = scode - 1;
-            break;
-        case 0x05:
-            if (max == 4)
-                selected = scode -1 ;
-            else
-                scode = 0;
-            break;
-        case 0x48:
-            selected -= 1;
-            if (selected < 1)
-                selected = 1;
-            scode = 0;
-            break;
-        case 0x50:
-            selected += 1;
-            if (selected > max)
-                selected = max;
-            scode = 0;
-            break;
-        case 0x1c:
-            break;
-        default:
-            scode = 0;
-            break;
-        }
-        if (scode != 0)
-            break;
-    }
-
-    switch (selected) {
-    case 1:
-        return 0x3D;
-    case 2:
-        return 0x3E;
-    case 3:
-        return 0x3F;
-    case 4:
-        return 0x58;
-    default:
-        return 0;
-    }
-}
-
-void interactive_bootkey()
-{
-    Bit16u i;
-    Bit8u scan = 0;
-
-    bios_printf(BIOS_PRINTF_SCREEN | BIOS_PRINTF_INFO,
-                "\n\nPress F10 to select boot device.\n");
-
-    scan = wait(1, 0);
-    if (scan == 0x44)
-        scan = bootmenu(inb_cmos(0x3d) & 0x0f);
-
-    /* set the default based on the keypress or menu */
-    switch(scan) {
-    case 0x3D:
-        outb_cmos(0x3d, 0x01);
-        break;
-    case 0x3E:
-        outb_cmos(0x3d, 0x02);
-        break;
-    case 0x3F:
-        outb_cmos(0x3d, 0x03);
-        break;
-    case 0x58:
-        outb_cmos(0x3d, 0x04);
-        break;
-    default:
-        break;
-    }
-}
-
-
-void
-nmi_handler_msg()
-{
-  BX_PANIC("NMI Handler called\n");
-}
-
-void
-int18_panic_msg()
-{
-  BX_PANIC("INT18: BOOT FAILURE\n");
-}
-
-void
-log_bios_start()
-{
-#if BX_DEBUG_SERIAL
-  outb(BX_DEBUG_PORT+UART_LCR, 0x03); /* setup for serial logging: 8N1 */
-#endif
-  BX_INFO("%s\n", bios_version_string);
+#if BX_DEBUG_SERIAL
+  outb(BX_DEBUG_PORT+UART_LCR, 0x03); /* setup for serial logging: 8N1 */
+#endif
+  BX_INFO("%s\n", bios_cvs_version_string);
 }
 
   bx_bool
@@ -2339,39 +2304,35 @@ debugger_off()
   outb(0xfedc, 0x00);
 }
 
-void 
+int
 s3_resume()
 {
     Bit32u s3_wakeup_vector;
-    Bit16u s3_wakeup_ip, s3_wakeup_cs;
-    Bit8u cmos_shutdown_status;
+    Bit8u s3_resume_flag;
 
-ASM_START
-    push ds
-    push ax
-    mov ax, #EBDA_SEG
-    mov ds, ax
-    mov al, [EBDA_CMOS_SHUTDOWN_STATUS_OFFSET]
-    mov .s3_resume.cmos_shutdown_status[bp], al
-    pop ax
-    pop ds
-ASM_END
+    s3_resume_flag = read_byte(0x40, 0xb0);
+#ifdef HVMASSIST
+    s3_wakeup_vector = get_s3_waking_vector();
+#else
+    s3_wakeup_vector = read_dword(0x40, 0xb2);
+#endif
 
-    if (cmos_shutdown_status != CMOS_SHUTDOWN_S3)
-        return;
+    BX_INFO("S3 resume called %x 0x%lx\n", s3_resume_flag, s3_wakeup_vector);
+    if (s3_resume_flag != CMOS_SHUTDOWN_S3 || !s3_wakeup_vector)
+           return 0;
 
-    s3_wakeup_vector = get_s3_waking_vector();
-    if (!s3_wakeup_vector)
-        return;
+    write_byte(0x40, 0xb0, 0);
 
-    s3_wakeup_ip = s3_wakeup_vector & 0xF;
-    s3_wakeup_cs = s3_wakeup_vector >> 4;
+    /* setup wakeup vector */
+    write_word(0x40, 0xb6, (s3_wakeup_vector & 0xF)); /* IP */
+    write_word(0x40, 0xb8, (s3_wakeup_vector >> 4)); /* CS */
 
+    BX_INFO("S3 resume jump to %x:%x\n", (s3_wakeup_vector >> 4),
+                   (s3_wakeup_vector & 0xF));
 ASM_START
-    push .s3_resume.s3_wakeup_cs[bp]
-    push .s3_resume.s3_wakeup_ip[bp]
-    retf
+    jmpf [0x04b6]
 ASM_END
+    return 1;
 }
 
 #if BX_USE_ATADRV
@@ -2421,6 +2382,7 @@ ASM_END
 // bits 7-4 of the device/head (CB_DH) reg
 #define ATA_CB_DH_DEV0 0xa0    // select device 0
 #define ATA_CB_DH_DEV1 0xb0    // select device 1
+#define ATA_CB_DH_LBA 0x40    // use LBA
 
 // status reg (CB_STAT and CB_ASTAT) bits
 #define ATA_CB_STAT_BSY  0x80  // busy
@@ -2470,6 +2432,7 @@ ASM_END
 #define ATA_CMD_READ_SECTORS                 0x20
 #define ATA_CMD_READ_VERIFY_SECTORS          0x40
 #define ATA_CMD_RECALIBRATE                  0x10
+#define ATA_CMD_REQUEST_SENSE                0x03
 #define ATA_CMD_SEEK                         0x70
 #define ATA_CMD_SET_FEATURES                 0xEF
 #define ATA_CMD_SET_MULTIPLE_MODE            0xC6
@@ -2514,7 +2477,7 @@ ASM_END
 #define ATA_DATA_NO      0x00
 #define ATA_DATA_IN      0x01
 #define ATA_DATA_OUT     0x02
-  
+
 // ---------------------------------------------------------------------------
 // ATA/ATAPI driver : initialization
 // ---------------------------------------------------------------------------
@@ -2523,7 +2486,7 @@ void ata_init( )
   Bit16u ebda_seg=read_word(0x0040,0x000E);
   Bit8u  channel, device;
 
-  // Channels info init. 
+  // Channels info init.
   for (channel=0; channel<BX_MAX_ATA_INTERFACES; channel++) {
     write_byte(ebda_seg,&EbdaData->ata.channels[channel].iface,ATA_IFACE_NONE);
     write_word(ebda_seg,&EbdaData->ata.channels[channel].iobase1,0x0);
@@ -2531,7 +2494,7 @@ void ata_init( )
     write_byte(ebda_seg,&EbdaData->ata.channels[channel].irq,0);
     }
 
-  // Devices info init. 
+  // Devices info init.
   for (device=0; device<BX_MAX_ATA_DEVICES; device++) {
     write_byte(ebda_seg,&EbdaData->ata.devices[device].type,ATA_TYPE_NONE);
     write_byte(ebda_seg,&EbdaData->ata.devices[device].device,ATA_DEVICE_NONE);
@@ -2546,11 +2509,12 @@ void ata_init( )
     write_word(ebda_seg,&EbdaData->ata.devices[device].pchs.heads,0);
     write_word(ebda_seg,&EbdaData->ata.devices[device].pchs.cylinders,0);
     write_word(ebda_seg,&EbdaData->ata.devices[device].pchs.spt,0);
-    
-    write_dword(ebda_seg,&EbdaData->ata.devices[device].sectors,0L);
+
+    write_dword(ebda_seg,&EbdaData->ata.devices[device].sectors_low,0L);
+    write_dword(ebda_seg,&EbdaData->ata.devices[device].sectors_high,0L);
     }
 
-  // hdidmap  and cdidmap init. 
+  // hdidmap  and cdidmap init.
   for (device=0; device<BX_MAX_ATA_DEVICES; device++) {
     write_byte(ebda_seg,&EbdaData->ata.hdidmap[device],BX_MAX_ATA_DEVICES);
     write_byte(ebda_seg,&EbdaData->ata.cdidmap[device],BX_MAX_ATA_DEVICES);
@@ -2560,6 +2524,58 @@ void ata_init( )
   write_byte(ebda_seg,&EbdaData->ata.cdcount,0);
 }
 
+#define TIMEOUT 0
+#define BSY 1
+#define NOT_BSY 2
+#define NOT_BSY_DRQ 3
+#define NOT_BSY_NOT_DRQ 4
+#define NOT_BSY_RDY 5
+
+#define IDE_TIMEOUT 32000u //32 seconds max for IDE ops
+
+int await_ide();
+static int await_ide(when_done,base,timeout)
+  Bit8u when_done;
+  Bit16u base;
+  Bit16u timeout;
+{
+  Bit32u time=0,last=0;
+  Bit16u status;
+  Bit8u result;
+  status = inb(base + ATA_CB_STAT); // for the times you're supposed to throw one away
+  for(;;) {
+    status = inb(base+ATA_CB_STAT);
+    time++;
+    if (when_done == BSY)
+      result = status & ATA_CB_STAT_BSY;
+    else if (when_done == NOT_BSY)
+      result = !(status & ATA_CB_STAT_BSY);
+    else if (when_done == NOT_BSY_DRQ)
+      result = !(status & ATA_CB_STAT_BSY) && (status & ATA_CB_STAT_DRQ);
+    else if (when_done == NOT_BSY_NOT_DRQ)
+      result = !(status & ATA_CB_STAT_BSY) && !(status & ATA_CB_STAT_DRQ);
+    else if (when_done == NOT_BSY_RDY)
+      result = !(status & ATA_CB_STAT_BSY) && (status & ATA_CB_STAT_RDY);
+    else if (when_done == TIMEOUT)
+      result = 0;
+
+    if (result) return 0;
+    if (time>>16 != last) // mod 2048 each 16 ms
+    {
+      last = time >>16;
+      BX_DEBUG_ATA("await_ide: (TIMEOUT,BSY,!BSY,!BSY_DRQ,!BSY_!DRQ,!BSY_RDY) %d time= %ld timeout= %d\n",when_done,time>>11, timeout);
+    }
+    if (status & ATA_CB_STAT_ERR)
+    {
+      BX_DEBUG_ATA("await_ide: ERROR (TIMEOUT,BSY,!BSY,!BSY_DRQ,!BSY_!DRQ,!BSY_RDY) %d time= %ld timeout= %d\n",when_done,time>>11, timeout);
+      return -1;
+    }
+    if ((timeout == 0) || ((time>>11) > timeout)) break;
+  }
+  BX_INFO("IDE time out\n");
+  return -1;
+}
+
 // ---------------------------------------------------------------------------
 // ATA/ATAPI driver : device detection
 // ---------------------------------------------------------------------------
@@ -2600,7 +2616,7 @@ void ata_detect( )
 
   // Device detection
   hdcount=cdcount=0;
-  
+
   for(device=0; device<BX_MAX_ATA_DEVICES; device++) {
     Bit16u iobase1, iobase2;
     Bit8u  channel, slave, shift;
@@ -2630,33 +2646,34 @@ void ata_detect( )
 
     if ( (sc == 0x55) && (sn == 0xaa) ) {
       write_byte(ebda_seg,&EbdaData->ata.devices[device].type,ATA_TYPE_UNKNOWN);
-    
+
       // reset the channel
-      ata_reset (device);
-      
+      ata_reset(device);
+
       // check for ATA or ATAPI
       outb(iobase1+ATA_CB_DH, slave ? ATA_CB_DH_DEV1 : ATA_CB_DH_DEV0);
       sc = inb(iobase1+ATA_CB_SC);
       sn = inb(iobase1+ATA_CB_SN);
-      if ( (sc==0x01) && (sn==0x01) ) {
+      if ((sc==0x01) && (sn==0x01)) {
         cl = inb(iobase1+ATA_CB_CL);
         ch = inb(iobase1+ATA_CB_CH);
         st = inb(iobase1+ATA_CB_STAT);
 
-        if ( (cl==0x14) && (ch==0xeb) ) {
+        if ((cl==0x14) && (ch==0xeb)) {
           write_byte(ebda_seg,&EbdaData->ata.devices[device].type,ATA_TYPE_ATAPI);
-          }
-        else if ( (cl==0x00) && (ch==0x00) && (st!=0x00) ) {
+        } else if ((cl==0x00) && (ch==0x00) && (st!=0x00)) {
           write_byte(ebda_seg,&EbdaData->ata.devices[device].type,ATA_TYPE_ATA);
-          }
+        } else if ((cl==0xff) && (ch==0xff)) {
+          write_byte(ebda_seg,&EbdaData->ata.devices[device].type,ATA_TYPE_NONE);
         }
       }
+    }
 
     type=read_byte(ebda_seg,&EbdaData->ata.devices[device].type);
-    
-    // Now we send a IDENTIFY command to ATA device 
+
+    // Now we send a IDENTIFY command to ATA device
     if(type == ATA_TYPE_ATA) {
-      Bit32u sectors;
+      Bit32u sectors_low, sectors_high;
       Bit16u cylinders, heads, spt, blksize;
       Bit8u  translation, removable, mode;
 
@@ -2667,21 +2684,26 @@ void ata_detect( )
       write_byte(ebda_seg,&EbdaData->ata.devices[device].device,ATA_DEVICE_HD);
       write_byte(ebda_seg,&EbdaData->ata.devices[device].mode, ATA_MODE_PIO16);
 
-      if (ata_cmd_data_in(device,ATA_CMD_IDENTIFY_DEVICE, 1, 0, 0, 0, 0L, get_SS(),buffer) !=0 )
+      if (ata_cmd_data_in(device,ATA_CMD_IDENTIFY_DEVICE, 1, 0, 0, 0, 0L, 0L, get_SS(),buffer) !=0 )
         BX_PANIC("ata-detect: Failed to detect ATA device\n");
 
       removable = (read_byte(get_SS(),buffer+0) & 0x80) ? 1 : 0;
-#ifndef        NO_PIO32
+#ifndef        NO_PIO32
       mode      = read_byte(get_SS(),buffer+96) ? ATA_MODE_PIO32 : ATA_MODE_PIO16;
 #endif
-
       blksize   = read_word(get_SS(),buffer+10);
-      
+
       cylinders = read_word(get_SS(),buffer+(1*2)); // word 1
       heads     = read_word(get_SS(),buffer+(3*2)); // word 3
       spt       = read_word(get_SS(),buffer+(6*2)); // word 6
 
-      sectors   = read_dword(get_SS(),buffer+(60*2)); // word 60 and word 61
+      if (read_word(get_SS(),buffer+(83*2)) & (1 << 10)) { // word 83 - lba48 support
+        sectors_low  = read_dword(get_SS(),buffer+(100*2)); // word 100 and word 101
+        sectors_high = read_dword(get_SS(),buffer+(102*2)); // word 102 and word 103
+      } else {
+        sectors_low = read_dword(get_SS(),buffer+(60*2)); // word 60 and word 61
+        sectors_high = 0;
+      }
 
       write_byte(ebda_seg,&EbdaData->ata.devices[device].device,ATA_DEVICE_HD);
       write_byte(ebda_seg,&EbdaData->ata.devices[device].removable, removable);
@@ -2690,7 +2712,8 @@ void ata_detect( )
       write_word(ebda_seg,&EbdaData->ata.devices[device].pchs.heads, heads);
       write_word(ebda_seg,&EbdaData->ata.devices[device].pchs.cylinders, cylinders);
       write_word(ebda_seg,&EbdaData->ata.devices[device].pchs.spt, spt);
-      write_dword(ebda_seg,&EbdaData->ata.devices[device].sectors, sectors);
+      write_dword(ebda_seg,&EbdaData->ata.devices[device].sectors_low, sectors_low);
+      write_dword(ebda_seg,&EbdaData->ata.devices[device].sectors_high, sectors_high);
       BX_INFO("ata%d-%d: PCHS=%u/%d/%d translation=", channel, slave,cylinders, heads, spt);
 
       translation = inb_cmos(0x39 + channel/2);
@@ -2718,14 +2741,14 @@ void ata_detect( )
           break;
         case ATA_TRANSLATION_LBA:
           spt = 63;
-          sectors /= 63;
-          heads = sectors / 1024;
+          sectors_low /= 63;
+          heads = sectors_low / 1024;
           if (heads>128) heads = 255;
           else if (heads>64) heads = 128;
           else if (heads>32) heads = 64;
           else if (heads>16) heads = 32;
           else heads=16;
-          cylinders = sectors / heads;
+          cylinders = sectors_low / heads;
           break;
         case ATA_TRANSLATION_RECHS:
           // Take care not to overflow
@@ -2752,15 +2775,15 @@ void ata_detect( )
       write_word(ebda_seg,&EbdaData->ata.devices[device].lchs.heads, heads);
       write_word(ebda_seg,&EbdaData->ata.devices[device].lchs.cylinders, cylinders);
       write_word(ebda_seg,&EbdaData->ata.devices[device].lchs.spt, spt);
-      // fill hdidmap 
+
+      // fill hdidmap
       write_byte(ebda_seg,&EbdaData->ata.hdidmap[hdcount], device);
       hdcount++;
       }
-    
+
     // Now we send a IDENTIFY command to ATAPI device
     if(type == ATA_TYPE_ATAPI) {
+
       Bit8u  type, removable, mode;
       Bit16u blksize;
 
@@ -2771,12 +2794,12 @@ void ata_detect( )
       write_byte(ebda_seg,&EbdaData->ata.devices[device].device,ATA_DEVICE_CDROM);
       write_byte(ebda_seg,&EbdaData->ata.devices[device].mode, ATA_MODE_PIO16);
 
-      if (ata_cmd_data_in(device,ATA_CMD_IDENTIFY_DEVICE_PACKET, 1, 0, 0, 0, 0L, get_SS(),buffer) != 0)
+      if (ata_cmd_data_in(device,ATA_CMD_IDENTIFY_DEVICE_PACKET, 1, 0, 0, 0, 0L, 0L, get_SS(),buffer) != 0)
         BX_PANIC("ata-detect: Failed to detect ATAPI device\n");
 
       type      = read_byte(get_SS(),buffer+1) & 0x1f;
       removable = (read_byte(get_SS(),buffer+0) & 0x80) ? 1 : 0;
-#ifndef        NO_PIO32
+#ifndef        NO_PIO32
       mode      = read_byte(get_SS(),buffer+96) ? ATA_MODE_PIO32 : ATA_MODE_PIO16;
 #endif
       blksize   = 2048;
@@ -2786,24 +2809,24 @@ void ata_detect( )
       write_byte(ebda_seg,&EbdaData->ata.devices[device].mode, mode);
       write_word(ebda_seg,&EbdaData->ata.devices[device].blksize, blksize);
 
-      // fill cdidmap 
+      // fill cdidmap
       write_byte(ebda_seg,&EbdaData->ata.cdidmap[cdcount], device);
       cdcount++;
       }
-  
+
       {
       Bit32u sizeinmb;
       Bit16u ataversion;
       Bit8u  c, i, version, model[41];
-      
+
       switch (type) {
         case ATA_TYPE_ATA:
-          sizeinmb = read_dword(ebda_seg,&EbdaData->ata.devices[device].sectors);
-          sizeinmb >>= 11;
+          sizeinmb = (read_dword(ebda_seg,&EbdaData->ata.devices[device].sectors_high) << 21)
+            | (read_dword(ebda_seg,&EbdaData->ata.devices[device].sectors_low) >> 11);
         case ATA_TYPE_ATAPI:
           // Read ATA/ATAPI version
           ataversion=((Bit16u)(read_byte(get_SS(),buffer+161))<<8)|read_byte(get_SS(),buffer+160);
-          for(version=15;version>0;version--) { 
+          for(version=15;version>0;version--) {
             if((ataversion&(1<<version))!=0)
             break;
             }
@@ -2812,7 +2835,7 @@ void ata_detect( )
           for(i=0;i<20;i++){
             write_byte(get_SS(),model+(i*2),read_byte(get_SS(),buffer+(i*2)+54+1));
             write_byte(get_SS(),model+(i*2)+1,read_byte(get_SS(),buffer+(i*2)+54));
-            }
+          }
 
           // Reformat
           write_byte(get_SS(),model+40,0x00);
@@ -2820,7 +2843,13 @@ void ata_detect( )
             if(read_byte(get_SS(),model+i)==0x20)
               write_byte(get_SS(),model+i,0x00);
             else break;
+          }
+          if (i>36) {
+            write_byte(get_SS(),model+36,0x00);
+            for(i=35;i>32;i--){
+              write_byte(get_SS(),model+i,0x2E);
             }
+          }
           break;
         }
 
@@ -2828,10 +2857,10 @@ void ata_detect( )
         case ATA_TYPE_ATA:
           printf("ata%d %s: ",channel,slave?" slave":"master");
           i=0; while(c=read_byte(get_SS(),model+i++)) printf("%c",c);
-          if (sizeinmb < 1UL<<16)
-            printf(" ATA-%d Hard-Disk (%04u MBytes)\n",version,(Bit16u)sizeinmb);
-          else
-            printf(" ATA-%d Hard-Disk (%04u GBytes)\n",version,(Bit16u)(sizeinmb>>10));
+         if (sizeinmb < (1UL<<16))
+            printf(" ATA-%d Hard-Disk (%4u MBytes)\n", version, (Bit16u)sizeinmb);
+         else
+            printf(" ATA-%d Hard-Disk (%4u GBytes)\n", version, (Bit16u)(sizeinmb>>10));
           break;
         case ATA_TYPE_ATAPI:
           printf("ata%d %s: ",channel,slave?" slave":"master");
@@ -2852,17 +2881,17 @@ void ata_detect( )
   write_byte(ebda_seg,&EbdaData->ata.hdcount, hdcount);
   write_byte(ebda_seg,&EbdaData->ata.cdcount, cdcount);
   write_byte(0x40,0x75, hdcount);
+
   printf("\n");
 
   // FIXME : should use bios=cmos|auto|disable bits
   // FIXME : should know about translation bits
-  // FIXME : move hard_drive_post here 
-  
+  // FIXME : move hard_drive_post here
+
 }
 
 // ---------------------------------------------------------------------------
-// ATA/ATAPI driver : software reset 
+// ATA/ATAPI driver : software reset
 // ---------------------------------------------------------------------------
 // ATA-3
 // 8.2.1 Software reset - Device 0
@@ -2872,7 +2901,8 @@ Bit16u device;
 {
   Bit16u ebda_seg=read_word(0x0040,0x000E);
   Bit16u iobase1, iobase2;
-  Bit8u  channel, slave, sn, sc; 
+  Bit8u  channel, slave, sn, sc;
+  Bit8u  type;
   Bit16u max;
 
   channel = device / 2;
@@ -2887,16 +2917,13 @@ Bit16u device;
   outb(iobase2+ATA_CB_DC, ATA_CB_DC_HD15 | ATA_CB_DC_NIEN | ATA_CB_DC_SRST);
 
 // 8.2.1 (b) -- wait for BSY
-  max=0xff;
-  while(--max>0) {
-    Bit8u status = inb(iobase1+ATA_CB_STAT);
-    if ((status & ATA_CB_STAT_BSY) != 0) break;
-  }
+  await_ide(BSY, iobase1, 20);
 
 // 8.2.1 (f) -- clear SRST
   outb(iobase2+ATA_CB_DC, ATA_CB_DC_HD15 | ATA_CB_DC_NIEN);
 
-  if (read_byte(ebda_seg,&EbdaData->ata.devices[device].type) != ATA_TYPE_NONE) {
+  type=read_byte(ebda_seg,&EbdaData->ata.devices[device].type);
+  if (type != ATA_TYPE_NONE) {
 
 // 8.2.1 (g) -- check for sc==sn==0x01
     // select device
@@ -2905,21 +2932,14 @@ Bit16u device;
     sn = inb(iobase1+ATA_CB_SN);
 
     if ( (sc==0x01) && (sn==0x01) ) {
-
-// 8.2.1 (h) -- wait for not BSY
-      max=0xff;
-      while(--max>0) {
-        Bit8u status = inb(iobase1+ATA_CB_STAT);
-        if ((status & ATA_CB_STAT_BSY) == 0) break;
-        }
-      }
+      if (type == ATA_TYPE_ATA) //ATA
+        await_ide(NOT_BSY_RDY, iobase1, IDE_TIMEOUT);
+      else //ATAPI
+        await_ide(NOT_BSY, iobase1, IDE_TIMEOUT);
     }
 
-// 8.2.1 (i) -- wait for DRDY
-  max=0xfff;
-  while(--max>0) {
-    Bit8u status = inb(iobase1+ATA_CB_STAT);
-      if ((status & ATA_CB_STAT_RDY) != 0) break;
+// 8.2.1 (h) -- wait for not BSY
+    await_ide(NOT_BSY, iobase1, IDE_TIMEOUT);
   }
 
   // Enable interrupts
@@ -2927,7 +2947,7 @@ Bit16u device;
 }
 
 // ---------------------------------------------------------------------------
-// ATA/ATAPI driver : execute a non data command 
+// ATA/ATAPI driver : execute a non data command
 // ---------------------------------------------------------------------------
 
 Bit16u ata_cmd_non_data()
@@ -2945,9 +2965,9 @@ Bit16u ata_cmd_non_data()
       // 5 : more sectors to read/verify
       // 6 : no sectors left to write
       // 7 : more sectors to write
-Bit16u ata_cmd_data_in(device, command, count, cylinder, head, sector, lba, segment, offset)
+Bit16u ata_cmd_data_in(device, command, count, cylinder, head, sector, lba_low, lba_high, segment, offset)
 Bit16u device, command, count, cylinder, head, sector, segment, offset;
-Bit32u lba;
+Bit32u lba_low, lba_high;
 {
   Bit16u ebda_seg=read_word(0x0040,0x000E);
   Bit16u iobase1, iobase2, blksize;
@@ -2976,22 +2996,20 @@ Bit32u lba;
 
   // sector will be 0 only on lba access. Convert to lba-chs
   if (sector == 0) {
-    if ((count >= 1 << 8) || (lba + count >= 1UL << 28)) {
+    if ((count >= 1 << 8) || lba_high || (lba_low + count >= 1UL << 28)) {
       outb(iobase1 + ATA_CB_FR, 0x00);
       outb(iobase1 + ATA_CB_SC, (count >> 8) & 0xff);
-      outb(iobase1 + ATA_CB_SN, lba >> 24);
-      outb(iobase1 + ATA_CB_CL, 0);
-      outb(iobase1 + ATA_CB_CH, 0);
+      outb(iobase1 + ATA_CB_SN, lba_low >> 24);
+      outb(iobase1 + ATA_CB_CL, lba_high & 0xff);
+      outb(iobase1 + ATA_CB_CH, lba_high >> 8);
       command |= 0x04;
       count &= (1UL << 8) - 1;
-      lba &= (1UL << 24) - 1;
+      lba_low &= (1UL << 24) - 1;
       }
-    sector = (Bit16u) (lba & 0x000000ffL);
-    lba >>= 8;
-    cylinder = (Bit16u) (lba & 0x0000ffffL);
-    lba >>= 16;
-    head = ((Bit16u) (lba & 0x0000000fL)) | 0x40;
-    }
+    sector = (Bit16u) (lba_low & 0x000000ffL);
+    cylinder = (Bit16u) ((lba_low>>8) & 0x0000ffffL);
+    head = ((Bit16u) ((lba_low>>24) & 0x0000000fL)) | ATA_CB_DH_LBA;
+  }
 
   outb(iobase1 + ATA_CB_FR, 0x00);
   outb(iobase1 + ATA_CB_SC, count);
@@ -3001,10 +3019,8 @@ Bit32u lba;
   outb(iobase1 + ATA_CB_DH, (slave ? ATA_CB_DH_DEV1 : ATA_CB_DH_DEV0) | (Bit8u) head );
   outb(iobase1 + ATA_CB_CMD, command);
 
-  while (1) {
-    status = inb(iobase1 + ATA_CB_STAT);
-    if ( !(status & ATA_CB_STAT_BSY) ) break;
-    }
+  await_ide(NOT_BSY_DRQ, iobase1, IDE_TIMEOUT);
+  status = inb(iobase1 + ATA_CB_STAT);
 
   if (status & ATA_CB_STAT_ERR) {
     BX_DEBUG_ATA("ata_cmd_data_in : read error\n");
@@ -3025,12 +3041,12 @@ ASM_END
 ASM_START
         push bp
         mov  bp, sp
-        mov  di, _ata_cmd_data_in.offset + 2[bp]  
-        mov  ax, _ata_cmd_data_in.segment + 2[bp] 
-        mov  cx, _ata_cmd_data_in.blksize + 2[bp] 
+        mov  di, _ata_cmd_data_in.offset + 2[bp]
+        mov  ax, _ata_cmd_data_in.segment + 2[bp]
+        mov  cx, _ata_cmd_data_in.blksize + 2[bp]
 
         ;; adjust if there will be an overrun. 2K max sector size
-        cmp   di, #0xf800 ;; 
+        cmp   di, #0xf800 ;;
         jbe   ata_in_no_adjust
 
 ata_in_adjust:
@@ -3042,7 +3058,7 @@ ata_in_no_adjust:
 
         mov   dx, _ata_cmd_data_in.iobase1 + 2[bp] ;; ATA data read port
 
-        mov  ah, _ata_cmd_data_in.mode + 2[bp] 
+        mov  ah, _ata_cmd_data_in.mode + 2[bp]
         cmp  ah, #ATA_MODE_PIO32
         je   ata_in_32
 
@@ -3064,9 +3080,10 @@ ASM_END
     current++;
     write_word(ebda_seg, &EbdaData->ata.trsfsectors,current);
     count--;
+    await_ide(NOT_BSY, iobase1, IDE_TIMEOUT);
     status = inb(iobase1 + ATA_CB_STAT);
     if (count == 0) {
-      if ( (status & (ATA_CB_STAT_BSY | ATA_CB_STAT_RDY | ATA_CB_STAT_DRQ | ATA_CB_STAT_ERR) ) 
+      if ( (status & (ATA_CB_STAT_BSY | ATA_CB_STAT_RDY | ATA_CB_STAT_DRQ | ATA_CB_STAT_ERR) )
           != ATA_CB_STAT_RDY ) {
         BX_DEBUG_ATA("ata_cmd_data_in : no sectors left (status %02x)\n", (unsigned) status);
         return 4;
@@ -3074,7 +3091,7 @@ ASM_END
       break;
       }
     else {
-      if ( (status & (ATA_CB_STAT_BSY | ATA_CB_STAT_RDY | ATA_CB_STAT_DRQ | ATA_CB_STAT_ERR) ) 
+      if ( (status & (ATA_CB_STAT_BSY | ATA_CB_STAT_RDY | ATA_CB_STAT_DRQ | ATA_CB_STAT_ERR) )
           != (ATA_CB_STAT_RDY | ATA_CB_STAT_DRQ) ) {
         BX_DEBUG_ATA("ata_cmd_data_in : more sectors left (status %02x)\n", (unsigned) status);
         return 5;
@@ -3099,9 +3116,9 @@ ASM_END
       // 5 : more sectors to read/verify
       // 6 : no sectors left to write
       // 7 : more sectors to write
-Bit16u ata_cmd_data_out(device, command, count, cylinder, head, sector, lba, segment, offset)
+Bit16u ata_cmd_data_out(device, command, count, cylinder, head, sector, lba_low, lba_high, segment, offset)
 Bit16u device, command, count, cylinder, head, sector, segment, offset;
-Bit32u lba;
+Bit32u lba_low, lba_high;
 {
   Bit16u ebda_seg=read_word(0x0040,0x000E);
   Bit16u iobase1, iobase2, blksize;
@@ -3130,22 +3147,20 @@ Bit32u lba;
 
   // sector will be 0 only on lba access. Convert to lba-chs
   if (sector == 0) {
-    if ((count >= 1 << 8) || (lba + count >= 1UL << 28)) {
+    if ((count >= 1 << 8) || lba_high || (lba_low + count >= 1UL << 28)) {
       outb(iobase1 + ATA_CB_FR, 0x00);
       outb(iobase1 + ATA_CB_SC, (count >> 8) & 0xff);
-      outb(iobase1 + ATA_CB_SN, lba >> 24);
-      outb(iobase1 + ATA_CB_CL, 0);
-      outb(iobase1 + ATA_CB_CH, 0);
+      outb(iobase1 + ATA_CB_SN, lba_low >> 24);
+      outb(iobase1 + ATA_CB_CL, lba_high & 0xff);
+      outb(iobase1 + ATA_CB_CH, lba_high >> 8);
       command |= 0x04;
       count &= (1UL << 8) - 1;
-      lba &= (1UL << 24) - 1;
+      lba_low &= (1UL << 24) - 1;
       }
-    sector = (Bit16u) (lba & 0x000000ffL);
-    lba >>= 8;
-    cylinder = (Bit16u) (lba & 0x0000ffffL);
-    lba >>= 16;
-    head = ((Bit16u) (lba & 0x0000000fL)) | 0x40;
-    }
+    sector = (Bit16u) (lba_low & 0x000000ffL);
+    cylinder = (Bit16u) ((lba_low>>8) & 0x0000ffffL);
+    head = ((Bit16u) ((lba_low>>24) & 0x0000000fL)) | ATA_CB_DH_LBA;
+  }
 
   outb(iobase1 + ATA_CB_FR, 0x00);
   outb(iobase1 + ATA_CB_SC, count);
@@ -3155,10 +3170,8 @@ Bit32u lba;
   outb(iobase1 + ATA_CB_DH, (slave ? ATA_CB_DH_DEV1 : ATA_CB_DH_DEV0) | (Bit8u) head );
   outb(iobase1 + ATA_CB_CMD, command);
 
-  while (1) {
-    status = inb(iobase1 + ATA_CB_STAT);
-    if ( !(status & ATA_CB_STAT_BSY) ) break;
-    }
+  await_ide(NOT_BSY_DRQ, iobase1, IDE_TIMEOUT);
+  status = inb(iobase1 + ATA_CB_STAT);
 
   if (status & ATA_CB_STAT_ERR) {
     BX_DEBUG_ATA("ata_cmd_data_out : read error\n");
@@ -3179,12 +3192,12 @@ ASM_END
 ASM_START
         push bp
         mov  bp, sp
-        mov  si, _ata_cmd_data_out.offset + 2[bp]  
-        mov  ax, _ata_cmd_data_out.segment + 2[bp] 
-        mov  cx, _ata_cmd_data_out.blksize + 2[bp] 
+        mov  si, _ata_cmd_data_out.offset + 2[bp]
+        mov  ax, _ata_cmd_data_out.segment + 2[bp]
+        mov  cx, _ata_cmd_data_out.blksize + 2[bp]
 
         ;; adjust if there will be an overrun. 2K max sector size
-        cmp   si, #0xf800 ;; 
+        cmp   si, #0xf800 ;;
         jbe   ata_out_no_adjust
 
 ata_out_adjust:
@@ -3196,7 +3209,7 @@ ata_out_no_adjust:
 
         mov   dx, _ata_cmd_data_out.iobase1 + 2[bp] ;; ATA data write port
 
-        mov  ah, _ata_cmd_data_out.mode + 2[bp] 
+        mov  ah, _ata_cmd_data_out.mode + 2[bp]
         cmp  ah, #ATA_MODE_PIO32
         je   ata_out_32
 
@@ -3222,7 +3235,7 @@ ASM_END
     count--;
     status = inb(iobase1 + ATA_CB_STAT);
     if (count == 0) {
-      if ( (status & (ATA_CB_STAT_BSY | ATA_CB_STAT_RDY | ATA_CB_STAT_DF | ATA_CB_STAT_DRQ | ATA_CB_STAT_ERR) ) 
+      if ( (status & (ATA_CB_STAT_BSY | ATA_CB_STAT_RDY | ATA_CB_STAT_DF | ATA_CB_STAT_DRQ | ATA_CB_STAT_ERR) )
           != ATA_CB_STAT_RDY ) {
         BX_DEBUG_ATA("ata_cmd_data_out : no sectors left (status %02x)\n", (unsigned) status);
         return 6;
@@ -3230,7 +3243,7 @@ ASM_END
       break;
       }
     else {
-      if ( (status & (ATA_CB_STAT_BSY | ATA_CB_STAT_RDY | ATA_CB_STAT_DRQ | ATA_CB_STAT_ERR) ) 
+      if ( (status & (ATA_CB_STAT_BSY | ATA_CB_STAT_RDY | ATA_CB_STAT_DRQ | ATA_CB_STAT_ERR) )
           != (ATA_CB_STAT_RDY | ATA_CB_STAT_DRQ) ) {
         BX_DEBUG_ATA("ata_cmd_data_out : more sectors left (status %02x)\n", (unsigned) status);
         return 7;
@@ -3297,19 +3310,17 @@ Bit32u length;
   if (status & ATA_CB_STAT_BSY) return 2;
 
   outb(iobase2 + ATA_CB_DC, ATA_CB_DC_HD15 | ATA_CB_DC_NIEN);
-  // outb(iobase1 + ATA_CB_FR, 0x00);
-  // outb(iobase1 + ATA_CB_SC, 0x00);
-  // outb(iobase1 + ATA_CB_SN, 0x00);
+  outb(iobase1 + ATA_CB_FR, 0x00);
+  outb(iobase1 + ATA_CB_SC, 0x00);
+  outb(iobase1 + ATA_CB_SN, 0x00);
   outb(iobase1 + ATA_CB_CL, 0xfff0 & 0x00ff);
   outb(iobase1 + ATA_CB_CH, 0xfff0 >> 8);
   outb(iobase1 + ATA_CB_DH, slave ? ATA_CB_DH_DEV1 : ATA_CB_DH_DEV0);
   outb(iobase1 + ATA_CB_CMD, ATA_CMD_PACKET);
 
   // Device should ok to receive command
-  while (1) {
-    status = inb(iobase1 + ATA_CB_STAT);
-    if ( !(status & ATA_CB_STAT_BSY) ) break;
-    }
+  await_ide(NOT_BSY_DRQ, iobase1, IDE_TIMEOUT);
+  status = inb(iobase1 + ATA_CB_STAT);
 
   if (status & ATA_CB_STAT_ERR) {
     BX_DEBUG_ATA("ata_cmd_packet : error, status is %02x\n",status);
@@ -3326,13 +3337,13 @@ Bit32u length;
   // Send command to device
 ASM_START
       sti  ;; enable higher priority interrupts
+
       push bp
       mov  bp, sp
-    
-      mov  si, _ata_cmd_packet.cmdoff + 2[bp]  
-      mov  ax, _ata_cmd_packet.cmdseg + 2[bp] 
-      mov  cx, _ata_cmd_packet.cmdlen + 2[bp] 
+
+      mov  si, _ata_cmd_packet.cmdoff + 2[bp]
+      mov  ax, _ata_cmd_packet.cmdseg + 2[bp]
+      mov  cx, _ata_cmd_packet.cmdlen + 2[bp]
       mov  es, ax      ;; segment in es
 
       mov  dx, _ata_cmd_packet.iobase1 + 2[bp] ;; ATA data write port
@@ -3345,32 +3356,38 @@ ASM_START
 ASM_END
 
   if (inout == ATA_DATA_NO) {
+    await_ide(NOT_BSY, iobase1, IDE_TIMEOUT);
     status = inb(iobase1 + ATA_CB_STAT);
     }
   else {
+        Bit16u loops = 0;
+        Bit8u sc;
   while (1) {
 
+      if (loops == 0) {//first time through
+        status = inb(iobase2 + ATA_CB_ASTAT);
+        await_ide(NOT_BSY_DRQ, iobase1, IDE_TIMEOUT);
+      }
+      else
+        await_ide(NOT_BSY, iobase1, IDE_TIMEOUT);
+      loops++;
+
       status = inb(iobase1 + ATA_CB_STAT);
+      sc = inb(iobase1 + ATA_CB_SC);
 
       // Check if command completed
-      if ( (status & (ATA_CB_STAT_BSY | ATA_CB_STAT_DRQ) ) ==0 ) break;
+      if(((inb(iobase1 + ATA_CB_SC)&0x7)==0x3) &&
+         ((status & (ATA_CB_STAT_RDY | ATA_CB_STAT_ERR)) == ATA_CB_STAT_RDY)) break;
 
       if (status & ATA_CB_STAT_ERR) {
         BX_DEBUG_ATA("ata_cmd_packet : error (status %02x)\n",status);
         return 3;
       }
 
-      // Device must be ready to send data
-      if ( (status & (ATA_CB_STAT_BSY | ATA_CB_STAT_RDY | ATA_CB_STAT_DRQ | ATA_CB_STAT_ERR) ) 
-            != (ATA_CB_STAT_RDY | ATA_CB_STAT_DRQ) ) {
-        BX_DEBUG_ATA("ata_cmd_packet : not ready (status %02x)\n", status);
-        return 4;
-        }
-
       // Normalize address
       bufseg += (bufoff / 16);
       bufoff %= 16;
-    
+
       // Get the byte count
       lcount =  ((Bit16u)(inb(iobase1 + ATA_CB_CH))<<8)+inb(iobase1 + ATA_CB_CL);
 
@@ -3431,10 +3448,10 @@ ASM_START
 
         mov  dx, _ata_cmd_packet.iobase1 + 2[bp] ;; ATA data read port
 
-        mov  cx, _ata_cmd_packet.lbefore + 2[bp] 
+        mov  cx, _ata_cmd_packet.lbefore + 2[bp]
         jcxz ata_packet_no_before
 
-        mov  ah, _ata_cmd_packet.lmode + 2[bp] 
+        mov  ah, _ata_cmd_packet.lmode + 2[bp]
         cmp  ah, #ATA_MODE_PIO32
         je   ata_packet_in_before_32
 
@@ -3451,14 +3468,14 @@ ata_packet_in_before_32_loop:
         pop  eax
 
 ata_packet_no_before:
-        mov  cx, _ata_cmd_packet.lcount + 2[bp] 
+        mov  cx, _ata_cmd_packet.lcount + 2[bp]
         jcxz ata_packet_after
 
-        mov  di, _ata_cmd_packet.bufoff + 2[bp]  
-        mov  ax, _ata_cmd_packet.bufseg + 2[bp] 
+        mov  di, _ata_cmd_packet.bufoff + 2[bp]
+        mov  ax, _ata_cmd_packet.bufseg + 2[bp]
         mov  es, ax
 
-        mov  ah, _ata_cmd_packet.lmode + 2[bp] 
+        mov  ah, _ata_cmd_packet.lmode + 2[bp]
         cmp  ah, #ATA_MODE_PIO32
         je   ata_packet_in_32
 
@@ -3472,10 +3489,10 @@ ata_packet_in_32:
           insd ;; CX dwords transfered to port(DX) to ES:[DI]
 
 ata_packet_after:
-        mov  cx, _ata_cmd_packet.lafter + 2[bp] 
+        mov  cx, _ata_cmd_packet.lafter + 2[bp]
         jcxz ata_packet_done
 
-        mov  ah, _ata_cmd_packet.lmode + 2[bp] 
+        mov  ah, _ata_cmd_packet.lmode + 2[bp]
         cmp  ah, #ATA_MODE_PIO32
         je   ata_packet_in_after_32
 
@@ -3505,7 +3522,7 @@ ASM_END
     }
 
   // Final check, device must be ready
-  if ( (status & (ATA_CB_STAT_BSY | ATA_CB_STAT_RDY | ATA_CB_STAT_DF | ATA_CB_STAT_DRQ | ATA_CB_STAT_ERR) ) 
+  if ( (status & (ATA_CB_STAT_BSY | ATA_CB_STAT_RDY | ATA_CB_STAT_DF | ATA_CB_STAT_DRQ | ATA_CB_STAT_ERR) )
          != ATA_CB_STAT_RDY ) {
     BX_DEBUG_ATA("ata_cmd_packet : not ready (status %02x)\n", (unsigned) status);
     return 4;
@@ -3524,55 +3541,108 @@ ASM_END
 // Start of ATA/ATAPI generic functions
 // ---------------------------------------------------------------------------
 
-  Bit16u 
-atapi_get_sense(device)
+  Bit16u
+atapi_get_sense(device, seg, asc, ascq)
   Bit16u device;
 {
   Bit8u  atacmd[12];
-  Bit8u  buffer[16];
+  Bit8u  buffer[18];
   Bit8u i;
 
   memsetb(get_SS(),atacmd,0,12);
 
-  // Request SENSE 
-  atacmd[0]=0x03;    
-  atacmd[4]=0x20;    
-  if (ata_cmd_packet(device, 12, get_SS(), atacmd, 0, 16L, ATA_DATA_IN, get_SS(), buffer) != 0)
+  // Request SENSE
+  atacmd[0]=ATA_CMD_REQUEST_SENSE;
+  atacmd[4]=sizeof(buffer);
+  if (ata_cmd_packet(device, 12, get_SS(), atacmd, 0, 18L, ATA_DATA_IN, get_SS(), buffer) != 0)
     return 0x0002;
 
-  if ((buffer[0] & 0x7e) == 0x70) {
-    return (((Bit16u)buffer[2]&0x0f)*0x100)+buffer[12];
-    }
+  write_byte(seg,asc,buffer[12]);
+  write_byte(seg,ascq,buffer[13]);
 
   return 0;
 }
 
-  Bit16u 
+  Bit16u
 atapi_is_ready(device)
   Bit16u device;
 {
-  Bit8u  atacmd[12];
-  Bit8u  buffer[];
+  Bit8u packet[12];
+  Bit8u buf[8];
+  Bit32u block_len;
+  Bit32u sectors;
+  Bit32u timeout; //measured in ms
+  Bit32u time;
+  Bit8u asc, ascq;
+  Bit8u in_progress;
+  Bit16u ebda_seg = read_word(0x0040,0x000E);
+  if (read_byte(ebda_seg,&EbdaData->ata.devices[device].type) != ATA_TYPE_ATAPI) {
+    printf("not implemented for non-ATAPI device\n");
+    return -1;
+  }
 
-  memsetb(get_SS(),atacmd,0,12);
-  // Test Unit Ready
-  if (ata_cmd_packet(device, 12, get_SS(), atacmd, 0, 0L, ATA_DATA_NO, get_SS(), buffer) != 0)
-    return 0x000f;
+  BX_DEBUG_ATA("ata_detect_medium: begin\n");
+  memsetb(get_SS(),packet, 0, sizeof packet);
+  packet[0] = 0x25; /* READ CAPACITY */
+
+  /* Retry READ CAPACITY 50 times unless MEDIUM NOT PRESENT
+   * is reported by the device. If the device reports "IN PROGRESS",
+   * 30 seconds is added. */
+  timeout = 5000;
+  time = 0;
+  in_progress = 0;
+  while (time < timeout) {
+    if (ata_cmd_packet(device, sizeof(packet), get_SS(), packet, 0, 8L, ATA_DATA_IN, get_SS(), buf) == 0)
+      goto ok;
+
+    if (atapi_get_sense(device, get_SS(), &asc, &ascq) == 0) {
+      if (asc == 0x3a) { /* MEDIUM NOT PRESENT */
+        BX_DEBUG_ATA("Device reports MEDIUM NOT PRESENT\n");
+        return -1;
+      }
 
-  if (atapi_get_sense(device) !=0 ) {
-    memsetb(get_SS(),atacmd,0,12);
+      if (asc == 0x04 && ascq == 0x01 && !in_progress) {
+        /* IN PROGRESS OF BECOMING READY */
+        printf("Waiting for device to detect medium... ");
+        /* Allow 30 seconds more */
+        timeout = 30000;
+        in_progress = 1;
+      }
+    }
+    time += 100;
+  }
+  BX_DEBUG_ATA("read capacity failed\n");
+  return -1;
+ok:
 
-    // try to send Test Unit Ready again
-    if (ata_cmd_packet(device, 12, get_SS(), atacmd, 0, 0L, ATA_DATA_NO, get_SS(), buffer) != 0)
-      return 0x000f;
+  block_len = (Bit32u) buf[4] << 24
+    | (Bit32u) buf[5] << 16
+    | (Bit32u) buf[6] << 8
+    | (Bit32u) buf[7] << 0;
+  BX_DEBUG_ATA("block_len=%u\n", block_len);
 
-    return atapi_get_sense(device);
-    }
+  if (block_len!= 2048 && block_len!= 512)
+  {
+    printf("Unsupported sector size %u\n", block_len);
+    return -1;
+  }
+  write_dword(ebda_seg,&EbdaData->ata.devices[device].blksize, block_len);
+
+  sectors = (Bit32u) buf[0] << 24
+    | (Bit32u) buf[1] << 16
+    | (Bit32u) buf[2] << 8
+    | (Bit32u) buf[3] << 0;
+
+  BX_DEBUG_ATA("sectors=%u\n", sectors);
+  if (block_len == 2048)
+    sectors <<= 2; /* # of sectors in 512-byte "soft" sector */
+  if (sectors != read_dword(ebda_seg,&EbdaData->ata.devices[device].sectors_low))
+    printf("%dMB medium detected\n", sectors>>(20-9));
+  write_dword(ebda_seg,&EbdaData->ata.devices[device].sectors_low, sectors);
   return 0;
 }
 
-  Bit16u 
+  Bit16u
 atapi_is_cdrom(device)
   Bit8u device;
 {
@@ -3632,7 +3702,7 @@ static char eltorito[24]="EL TORITO SPECIFICATION";
 //
 // Returns ah: emulated drive, al: error code
 //
-  Bit16u 
+  Bit16u
 cdrom_boot()
 {
   Bit16u ebda_seg=read_word(0x0040,0x000E);
@@ -3645,10 +3715,13 @@ cdrom_boot()
   for (device=0; device<BX_MAX_ATA_DEVICES;device++) {
     if (atapi_is_cdrom(device)) break;
     }
-  
+
   // if not found
   if(device >= BX_MAX_ATA_DEVICES) return 2;
 
+  if(error = atapi_is_ready(device) != 0)
+    BX_INFO("ata_is_ready returned %d\n",error);
+
   // Read the Boot Record Volume Descriptor
   memsetb(get_SS(),atacmd,0,12);
   atacmd[0]=0x28;                      // READ command
@@ -3668,7 +3741,7 @@ cdrom_boot()
    }
   for(i=0;i<23;i++)
     if(buffer[7+i]!=read_byte(0xf000,&eltorito[i]))return 6;
-  
+
   // ok, now we calculate the Boot catalog address
   lba=buffer[0x4A]*0x1000000+buffer[0x49]*0x10000+buffer[0x48]*0x100+buffer[0x47];
 
@@ -3683,7 +3756,7 @@ cdrom_boot()
   atacmd[5]=(lba & 0x000000ff);
   if((error = ata_cmd_packet(device, 12, get_SS(), atacmd, 0, 2048L, ATA_DATA_IN, get_SS(), buffer)) != 0)
     return 7;
+
   // Validation entry
   if(buffer[0x00]!=0x01)return 8;   // Header
   if(buffer[0x01]!=0x00)return 9;   // Platform
@@ -3702,10 +3775,10 @@ cdrom_boot()
 
   write_byte(ebda_seg,&EbdaData->cdemu.media,buffer[0x21]);
   if(buffer[0x21]==0){
-    // FIXME ElTorito Hardcoded. cdrom is hardcoded as device 0xE0. 
+    // FIXME ElTorito Hardcoded. cdrom is hardcoded as device 0xE0.
     // Win2000 cd boot needs to know it booted from cd
     write_byte(ebda_seg,&EbdaData->cdemu.emulated_drive,0xE0);
-    } 
+    }
   else if(buffer[0x21]<4)
     write_byte(ebda_seg,&EbdaData->cdemu.emulated_drive,0x00);
   else
@@ -3719,7 +3792,7 @@ cdrom_boot()
 
   write_word(ebda_seg,&EbdaData->cdemu.load_segment,boot_segment);
   write_word(ebda_seg,&EbdaData->cdemu.buffer_segment,0x0000);
-  
+
   nbsectors=buffer[0x27]*0x100+buffer[0x26];
   write_word(ebda_seg,&EbdaData->cdemu.sector_count,nbsectors);
 
@@ -3744,7 +3817,6 @@ cdrom_boot()
   tcpa_ipl((Bit32u)1L,(Bit32u)boot_segment,(Bit32u)0L,(Bit32u)512L);
 #endif
 
-
   // Remember the media type
   switch(read_byte(ebda_seg,&EbdaData->cdemu.media)) {
     case 0x01:  // 1.2M floppy
@@ -3765,7 +3837,7 @@ cdrom_boot()
     case 0x04:  // Harddrive
       write_word(ebda_seg,&EbdaData->cdemu.vdevice.spt,read_byte(boot_segment,446+6)&0x3f);
       write_word(ebda_seg,&EbdaData->cdemu.vdevice.cylinders,
-             (read_byte(boot_segment,446+6)<<2) + read_byte(boot_segment,446+7) + 1);
+              (read_byte(boot_segment,446+6)<<2) + read_byte(boot_segment,446+7) + 1);
       write_word(ebda_seg,&EbdaData->cdemu.vdevice.heads,read_byte(boot_segment,446+5) + 1);
       break;
    }
@@ -3778,7 +3850,7 @@ cdrom_boot()
       write_byte(ebda_seg, &EbdaData->ata.hdcount, read_byte(ebda_seg, &EbdaData->ata.hdcount) + 1);
    }
 
-  
+
   // everything is ok, so from now on, the emulation is active
   if(read_byte(ebda_seg,&EbdaData->cdemu.media)!=0)
     write_byte(ebda_seg,&EbdaData->cdemu.active,0x01);
@@ -4124,9 +4196,10 @@ ASM_END
       regs.u.r8.al = inb_cmos(0x30);
       regs.u.r8.ah = inb_cmos(0x31);
 
-      // limit to 15M
-      if(regs.u.r16.ax > 0x3c00)
-        regs.u.r16.ax = 0x3c00;
+      // According to Ralf Brown's interrupt the limit should be 15M,
+      // but real machines mostly return max. 63M.
+      if(regs.u.r16.ax > 0xffc0)
+        regs.u.r16.ax = 0xffc0;
 
       CLEAR_CF();
 #endif
@@ -4344,13 +4417,35 @@ BX_DEBUG_INT15("case 2:\n");
 
         case 3: // Set Resolution
 BX_DEBUG_INT15("case 3:\n");
-          // BX:
+          // BH:
           //      0 =  25 dpi, 1 count  per millimeter
           //      1 =  50 dpi, 2 counts per millimeter
           //      2 = 100 dpi, 4 counts per millimeter
           //      3 = 200 dpi, 8 counts per millimeter
-          CLEAR_CF();
-          regs.u.r8.ah = 0;
+          comm_byte = inhibit_mouse_int_and_events(); // disable IRQ12 and packets
+          if (regs.u.r8.bh < 4) {
+            ret = send_to_mouse_ctrl(0xE8); // set resolution command
+            if (ret == 0) {
+              ret = get_mouse_data(&mouse_data1);
+              if (mouse_data1 != 0xfa)
+                BX_PANIC("Mouse status returned %02x (should be ack)\n", (unsigned)mouse_data1);
+              ret = send_to_mouse_ctrl(regs.u.r8.bh);
+              ret = get_mouse_data(&mouse_data1);
+              if (mouse_data1 != 0xfa)
+                BX_PANIC("Mouse status returned %02x (should be ack)\n", (unsigned)mouse_data1);
+              CLEAR_CF();
+              regs.u.r8.ah = 0;
+            } else {
+              // error
+              SET_CF();
+              regs.u.r8.ah = UNSUPPORTED_FUNCTION;
+            }
+          } else {
+            // error
+            SET_CF();
+            regs.u.r8.ah = UNSUPPORTED_FUNCTION;
+          }
+          set_kbd_command_byte(comm_byte); // restore IRQ12 and serial enable
           break;
 
         case 4: // Get Device ID
@@ -4472,7 +4567,30 @@ BX_DEBUG_INT15("case default:\n");
       break;
     }
 }
-#endif
+#endif // BX_USE_PS2_MOUSE
+
+
+void set_e820_range(ES, DI, start, end, type)
+     Bit16u ES;
+     Bit16u DI;
+     Bit32u start;
+     Bit32u end;
+     Bit16u type;
+{
+    write_word(ES, DI, start);
+    write_word(ES, DI+2, start >> 16);
+    write_word(ES, DI+4, 0x00);
+    write_word(ES, DI+6, 0x00);
+
+    end -= start;
+    write_word(ES, DI+8, end);
+    write_word(ES, DI+10, end >> 16);
+    write_word(ES, DI+12, 0x0000);
+    write_word(ES, DI+14, 0x0000);
+
+    write_word(ES, DI+16, type);
+    write_word(ES, DI+18, 0x0);
+}
 
   void
 int15_function32(regs, ES, DS, FLAGS)
@@ -4481,22 +4599,31 @@ int15_function32(regs, ES, DS, FLAGS)
 {
   Bit32u  extended_memory_size=0; // 64bits long
   Bit16u  CX,DX;
+#ifdef HVMASSIST
+  Bit16u off, e820_table_size;
+  Bit32u base, type, size;
+#endif
 
 BX_DEBUG_INT15("int15 AX=%04x\n",regs.u.r16.ax);
 
   switch (regs.u.r8.ah) {
     case 0x86:
-      // Wait for CX:DX microseconds. currently using the 
-      // refresh request port 0x61 bit4, toggling every 15usec 
+      // Wait for CX:DX microseconds. currently using the
+      // refresh request port 0x61 bit4, toggling every 15usec
 
       CX = regs.u.r16.cx;
       DX = regs.u.r16.dx;
 
 ASM_START
+      sti
+
       ;; Get the count in eax
-      mov  ax, .int15_function32.CX [bp]
+      mov  bx, sp
+SEG SS
+      mov  ax, _int15_function32.CX [bx]
       shl  eax, #16
-      mov  ax, .int15_function32.DX [bp]
+SEG SS
+      mov  ax, _int15_function32.DX [bx]
 
       ;; convert to numbers of 15usec ticks
       mov ebx, #15
@@ -4527,8 +4654,9 @@ ASM_END
     case 0xe8:
         switch(regs.u.r8.al)
         {
-        case 0x20: {
-            Bit16u e820_table_size = read_word(0xe000, 0x8) * 0x14;
+#ifdef HVMASSIST
+       case 0x20: {
+            e820_table_size = read_word(E820_SEG, E820_NR_OFFSET) * 0x14;
 
             if (regs.u.r32.edx != 0x534D4150) /* SMAP */
                 goto int15_unimplemented;
@@ -4536,16 +4664,14 @@ ASM_END
             if ((regs.u.r16.bx / 0x14) * 0x14 == regs.u.r16.bx) {
                 if (regs.u.r16.bx + 0x14 <= e820_table_size)
                     memcpyb(ES, regs.u.r16.di,
-                            0xe000, 0x10 + regs.u.r16.bx, 0x14);
+                            E820_SEG, E820_OFFSET + regs.u.r16.bx, 0x14);
                 regs.u.r32.ebx += 0x14;
                 if ((regs.u.r32.ebx + 0x14 - 1) > e820_table_size)
                     regs.u.r32.ebx = 0;
             } else if (regs.u.r16.bx == 1) {
-                Bit32u base, type;
-                Bit16u off;
                 for (off = 0; off < e820_table_size; off += 0x14) {
-                    base = read_dword(0xe000, 0x10 + off);
-                    type = read_dword(0xe000, 0x20 + off);
+                    base = read_dword(E820_SEG, E820_OFFSET + off);
+                    type = read_dword(E820_SEG, E820_OFFSET + 0x10 + off);
                     if ((base >= 0x100000) && (type == 1))
                         break;
                 }
@@ -4553,7 +4679,7 @@ ASM_END
                     SET_CF();
                     break;
                 }
-                memcpyb(ES, regs.u.r16.di, 0xe000, 0x10 + off, 0x14);
+                memcpyb(ES, regs.u.r16.di, E820_SEG, E820_OFFSET + off, 0x14);
                 regs.u.r32.ebx = 0;
             } else { /* AX=E820, DX=534D4150, BX unrecognized */
                 goto int15_unimplemented;
@@ -4566,8 +4692,7 @@ ASM_END
         }
 
         case 0x01: {
-            Bit16u off, e820_table_size = read_word(0xe000, 0x8) * 0x14;
-            Bit32u base, type, size;
+            e820_table_size = read_word(E820_SEG, E820_NR_OFFSET) * 0x14;
 
             // do we have any reason to fail here ?
             CLEAR_CF();
@@ -4575,15 +4700,15 @@ ASM_END
             // Get the amount of extended memory (above 1M)
             regs.u.r8.cl = inb_cmos(0x30);
             regs.u.r8.ch = inb_cmos(0x31);
-          
+
             // limit to 15M
             if (regs.u.r16.cx > (15*1024))
                 regs.u.r16.cx = 15*1024;
 
             // Find first RAM E820 entry >= 1MB.
             for (off = 0; off < e820_table_size; off += 0x14) {
-                base = read_dword(0xe000, 0x10 + off);
-                type = read_dword(0xe000, 0x20 + off);
+                base = read_dword(E820_SEG, E820_OFFSET + off);
+                type = read_dword(E820_SEG, E820_OFFSET + 0x10 + off);
                 if ((base >= 0x100000) && (type == 1))
                     break;
             }
@@ -4591,7 +4716,7 @@ ASM_END
             // If there is RAM above 16MB, return amount in 64kB chunks.
             regs.u.r16.dx = 0;
             if (off != e820_table_size) {
-                size = base + read_dword(0xe000, 0x18 + off);
+                size = base + read_dword(E820_SEG, E820_OFFSET + 0x8 + off);
                 if (size > 0x1000000) {
                     size -= 0x1000000;
                     regs.u.r16.dx = (Bit16u)(size >> 16);
@@ -4603,7 +4728,7 @@ ASM_END
             regs.u.r16.bx = regs.u.r16.dx;
             break;
         }
-       default:  /* AH=0xE8?? but not implemented */
+        default:  /* AH=0xE8?? but not implemented */
             goto int15_unimplemented;
         }
         break;
@@ -4616,17 +4741,179 @@ ASM_END
       regs.u.r8.ah = UNSUPPORTED_FUNCTION;
       break;
     }
+#else
+         case 0x20: // coded by osmaker aka K.J.
+            if(regs.u.r32.edx == 0x534D4150)
+            {
+                extended_memory_size = inb_cmos(0x35);
+                extended_memory_size <<= 8;
+                extended_memory_size |= inb_cmos(0x34);
+                extended_memory_size *= 64;
+                // greater than EFF00000???
+                if(extended_memory_size > 0x3bc000) {
+                    extended_memory_size = 0x3bc000; // everything after this is reserved memory until we get to 0x100000000
+                }
+                extended_memory_size *= 1024;
+                extended_memory_size += (16L * 1024 * 1024);
+
+                if(extended_memory_size <= (16L * 1024 * 1024)) {
+                    extended_memory_size = inb_cmos(0x31);
+                    extended_memory_size <<= 8;
+                    extended_memory_size |= inb_cmos(0x30);
+                    extended_memory_size *= 1024;
+                    extended_memory_size += (1L * 1024 * 1024);
+                }
+
+                switch(regs.u.r16.bx)
+                {
+                    case 0:
+                        set_e820_range(ES, regs.u.r16.di,
+                                       0x0000000L, 0x0009f000L, 1);
+                        regs.u.r32.ebx = 1;
+                        regs.u.r32.eax = 0x534D4150;
+                        regs.u.r32.ecx = 0x14;
+                        CLEAR_CF();
+                        return;
+                        break;
+                    case 1:
+                        set_e820_range(ES, regs.u.r16.di,
+                                       0x0009f000L, 0x000a0000L, 2);
+                        regs.u.r32.ebx = 2;
+                        regs.u.r32.eax = 0x534D4150;
+                        regs.u.r32.ecx = 0x14;
+                        CLEAR_CF();
+                        return;
+                        break;
+                    case 2:
+                        set_e820_range(ES, regs.u.r16.di,
+                                       0x000e8000L, 0x00100000L, 2);
+                        regs.u.r32.ebx = 3;
+                        regs.u.r32.eax = 0x534D4150;
+                        regs.u.r32.ecx = 0x14;
+                        CLEAR_CF();
+                        return;
+                        break;
+                    case 3:
+#if BX_ROMBIOS32
+                        set_e820_range(ES, regs.u.r16.di,
+                                       0x00100000L,
+                                       extended_memory_size - ACPI_DATA_SIZE, 1);
+                        regs.u.r32.ebx = 4;
+#else
+                        set_e820_range(ES, regs.u.r16.di,
+                                       0x00100000L,
+                                       extended_memory_size, 1);
+                        regs.u.r32.ebx = 5;
+#endif
+                        regs.u.r32.eax = 0x534D4150;
+                        regs.u.r32.ecx = 0x14;
+                        CLEAR_CF();
+                        return;
+                        break;
+                    case 4:
+                        set_e820_range(ES, regs.u.r16.di,
+                                       extended_memory_size - ACPI_DATA_SIZE,
+                                       extended_memory_size, 3); // ACPI RAM
+                        regs.u.r32.ebx = 5;
+                        regs.u.r32.eax = 0x534D4150;
+                        regs.u.r32.ecx = 0x14;
+                        CLEAR_CF();
+                        return;
+                        break;
+                    case 5:
+                        /* 256KB BIOS area at the end of 4 GB */
+                        set_e820_range(ES, regs.u.r16.di,
+                                       0xfffc0000L, 0x00000000L, 2);
+                        regs.u.r32.ebx = 0;
+                        regs.u.r32.eax = 0x534D4150;
+                        regs.u.r32.ecx = 0x14;
+                        CLEAR_CF();
+                        return;
+                    default:  /* AX=E820, DX=534D4150, BX unrecognized */
+                        goto int15_unimplemented;
+                        break;
+                }
+            } else {
+              // if DX != 0x534D4150)
+              goto int15_unimplemented;
+            }
+            break;
+
+        case 0x01:
+          // do we have any reason to fail here ?
+          CLEAR_CF();
+
+          // my real system sets ax and bx to 0
+          // this is confirmed by Ralph Brown list
+          // but syslinux v1.48 is known to behave
+          // strangely if ax is set to 0
+          // regs.u.r16.ax = 0;
+          // regs.u.r16.bx = 0;
+
+          // Get the amount of extended memory (above 1M)
+          regs.u.r8.cl = inb_cmos(0x30);
+          regs.u.r8.ch = inb_cmos(0x31);
+
+          // limit to 15M
+          if(regs.u.r16.cx > 0x3c00)
+          {
+            regs.u.r16.cx = 0x3c00;
+          }
+
+          // Get the amount of extended memory above 16M in 64k blocs
+          regs.u.r8.dl = inb_cmos(0x34);
+          regs.u.r8.dh = inb_cmos(0x35);
+
+          // Set configured memory equal to extended memory
+          regs.u.r16.ax = regs.u.r16.cx;
+          regs.u.r16.bx = regs.u.r16.dx;
+          break;
+        default:  /* AH=0xE8?? but not implemented */
+          goto int15_unimplemented;
+       }
+       break;
+    int15_unimplemented:
+       // fall into the default
+    default:
+      BX_INFO("*** int 15h function AX=%04x, BX=%04x not yet supported!\n",
+        (unsigned) regs.u.r16.ax, (unsigned) regs.u.r16.bx);
+      SET_CF();
+      regs.u.r8.ah = UNSUPPORTED_FUNCTION;
+      break;
+    }
+#endif /* HVMASSIST */
 }
 
   void
 int16_function(DI, SI, BP, SP, BX, DX, CX, AX, FLAGS)
   Bit16u DI, SI, BP, SP, BX, DX, CX, AX, FLAGS;
 {
-  Bit8u scan_code, ascii_code, shift_flags, count;
+  Bit8u scan_code, ascii_code, shift_flags, led_flags, count;
   Bit16u kbd_code, max;
 
   BX_DEBUG_INT16("int16: AX=%04x BX=%04x CX=%04x DX=%04x \n", AX, BX, CX, DX);
 
+  shift_flags = read_byte(0x0040, 0x17);
+  led_flags = read_byte(0x0040, 0x97);
+  if ((((shift_flags >> 4) & 0x07) ^ (led_flags & 0x07)) != 0) {
+ASM_START
+    cli
+ASM_END
+    outb(0x60, 0xed);
+    while ((inb(0x64) & 0x01) == 0) outb(0x80, 0x21);
+    if ((inb(0x60) == 0xfa)) {
+      led_flags &= 0xf8;
+      led_flags |= ((shift_flags >> 4) & 0x07);
+      outb(0x60, led_flags & 0x07);
+      while ((inb(0x64) & 0x01) == 0) outb(0x80, 0x21);
+      inb(0x60);
+      write_byte(0x0040, 0x97, led_flags);
+    }
+ASM_START
+    sti
+ASM_END
+  }
+
   switch (GET_AH()) {
     case 0x00: /* read keyboard input */
 
@@ -4664,7 +4951,7 @@ int16_function(DI, SI, BP, SP, BX, DX, CX, AX, FLAGS)
       break;
 
     case 0x09: /* GET KEYBOARD FUNCTIONALITY */
-      // bit Bochs Description     
+      // bit Bochs Description
       //  7    0   reserved
       //  6    0   INT 16/AH=20h-22h supported (122-key keyboard support)
       //  5    1   INT 16/AH=10h-12h supported (enhanced keyboard support)
@@ -4694,7 +4981,7 @@ int16_function(DI, SI, BP, SP, BX, DX, CX, AX, FLAGS)
               kbd_code |= (inb(0x60) << 8);
             }
           } while (--count>0);
-       }
+        }
       }
       BX=kbd_code;
       break;
@@ -4721,7 +5008,8 @@ int16_function(DI, SI, BP, SP, BX, DX, CX, AX, FLAGS)
     case 0x12: /* get extended keyboard status */
       shift_flags = read_byte(0x0040, 0x17);
       SET_AL(shift_flags);
-      shift_flags = read_byte(0x0040, 0x18);
+      shift_flags = read_byte(0x0040, 0x18) & 0x73;
+      shift_flags |= read_byte(0x0040, 0x96) & 0x0c;
       SET_AH(shift_flags);
       BX_DEBUG_INT16("int16: func 12 sending %04x\n",AX);
       break;
@@ -4736,7 +5024,7 @@ int16_function(DI, SI, BP, SP, BX, DX, CX, AX, FLAGS)
 
     case 0x6F:
       if (GET_AL() == 0x08)
-       SET_AH(0x02); // unsupported, aka normal keyboard
+        SET_AH(0x02); // unsupported, aka normal keyboard
 
     default:
       BX_INFO("KBD: unsupported int 16h function %02x\n", GET_AH());
@@ -4877,7 +5165,7 @@ int09_function(DI, SI, BP, SP, BX, DX, CX, AX)
   Bit16u DI, SI, BP, SP, BX, DX, CX, AX;
 {
   Bit8u scancode, asciicode, shift_flags;
-  Bit8u mf2_flags, mf2_state, led_flags;
+  Bit8u mf2_flags, mf2_state;
 
   //
   // DS has been set to F000 before call
@@ -4895,7 +5183,6 @@ int09_function(DI, SI, BP, SP, BX, DX, CX, AX)
   shift_flags = read_byte(0x0040, 0x17);
   mf2_flags = read_byte(0x0040, 0x18);
   mf2_state = read_byte(0x0040, 0x96);
-  led_flags = read_byte(0x0040, 0x97);
   asciicode = 0;
 
   switch (scancode) {
@@ -4904,8 +5191,6 @@ int09_function(DI, SI, BP, SP, BX, DX, CX, AX)
       write_byte(0x0040, 0x17, shift_flags);
       mf2_flags |= 0x40;
       write_byte(0x0040, 0x18, mf2_flags);
-      led_flags ^= 0x04;
-      write_byte(0x0040, 0x97, led_flags);
       break;
     case 0xba: /* Caps Lock release */
       mf2_flags &= ~0x40;
@@ -4913,11 +5198,8 @@ int09_function(DI, SI, BP, SP, BX, DX, CX, AX)
       break;
 
     case 0x2a: /* L Shift press */
-      /*shift_flags &= ~0x40;*/
       shift_flags |= 0x02;
       write_byte(0x0040, 0x17, shift_flags);
-      led_flags &= ~0x04;
-      write_byte(0x0040, 0x97, led_flags);
       break;
     case 0xaa: /* L Shift release */
       shift_flags &= ~0x02;
@@ -4925,11 +5207,8 @@ int09_function(DI, SI, BP, SP, BX, DX, CX, AX)
       break;
 
     case 0x36: /* R Shift press */
-      /*shift_flags &= ~0x40;*/
       shift_flags |= 0x01;
       write_byte(0x0040, 0x17, shift_flags);
-      led_flags &= ~0x04;
-      write_byte(0x0040, 0x97, led_flags);
       break;
     case 0xb6: /* R Shift release */
       shift_flags &= ~0x01;
@@ -4937,71 +5216,75 @@ int09_function(DI, SI, BP, SP, BX, DX, CX, AX)
       break;
 
     case 0x1d: /* Ctrl press */
-      shift_flags |= 0x04;
-      write_byte(0x0040, 0x17, shift_flags);
-      if (mf2_state & 0x01) {
-        mf2_flags |= 0x04;
-      } else {
-        mf2_flags |= 0x01;
+      if ((mf2_state & 0x01) == 0) {
+        shift_flags |= 0x04;
+        write_byte(0x0040, 0x17, shift_flags);
+        if (mf2_state & 0x02) {
+          mf2_state |= 0x04;
+          write_byte(0x0040, 0x96, mf2_state);
+        } else {
+          mf2_flags |= 0x01;
+          write_byte(0x0040, 0x18, mf2_flags);
         }
-      write_byte(0x0040, 0x18, mf2_flags);
+      }
       break;
     case 0x9d: /* Ctrl release */
-      shift_flags &= ~0x04;
-      write_byte(0x0040, 0x17, shift_flags);
-      if (mf2_state & 0x01) {
-        mf2_flags &= ~0x04;
-      } else {
-        mf2_flags &= ~0x01;
+      if ((mf2_state & 0x01) == 0) {
+        shift_flags &= ~0x04;
+        write_byte(0x0040, 0x17, shift_flags);
+        if (mf2_state & 0x02) {
+          mf2_state &= ~0x04;
+          write_byte(0x0040, 0x96, mf2_state);
+        } else {
+          mf2_flags &= ~0x01;
+          write_byte(0x0040, 0x18, mf2_flags);
         }
-      write_byte(0x0040, 0x18, mf2_flags);
+      }
       break;
 
     case 0x38: /* Alt press */
       shift_flags |= 0x08;
       write_byte(0x0040, 0x17, shift_flags);
-      if (mf2_state & 0x01) {
-        mf2_flags |= 0x08;
+      if (mf2_state & 0x02) {
+        mf2_state |= 0x08;
+        write_byte(0x0040, 0x96, mf2_state);
       } else {
         mf2_flags |= 0x02;
-        }
-      write_byte(0x0040, 0x18, mf2_flags);
+        write_byte(0x0040, 0x18, mf2_flags);
+      }
       break;
     case 0xb8: /* Alt release */
       shift_flags &= ~0x08;
       write_byte(0x0040, 0x17, shift_flags);
-      if (mf2_state & 0x01) {
-        mf2_flags &= ~0x08;
+      if (mf2_state & 0x02) {
+        mf2_state &= ~0x08;
+        write_byte(0x0040, 0x96, mf2_state);
       } else {
         mf2_flags &= ~0x02;
-        }
-      write_byte(0x0040, 0x18, mf2_flags);
+        write_byte(0x0040, 0x18, mf2_flags);
+      }
       break;
 
     case 0x45: /* Num Lock press */
-      if ((mf2_state & 0x01) == 0) {
+      if ((mf2_state & 0x03) == 0) {
         mf2_flags |= 0x20;
         write_byte(0x0040, 0x18, mf2_flags);
         shift_flags ^= 0x20;
-        led_flags ^= 0x02;
         write_byte(0x0040, 0x17, shift_flags);
-        write_byte(0x0040, 0x97, led_flags);
-        }
+      }
       break;
     case 0xc5: /* Num Lock release */
-      if ((mf2_state & 0x01) == 0) {
+      if ((mf2_state & 0x03) == 0) {
         mf2_flags &= ~0x20;
         write_byte(0x0040, 0x18, mf2_flags);
-        }
+      }
       break;
 
     case 0x46: /* Scroll Lock press */
       mf2_flags |= 0x10;
       write_byte(0x0040, 0x18, mf2_flags);
       shift_flags ^= 0x10;
-      led_flags ^= 0x01;
       write_byte(0x0040, 0x17, shift_flags);
-      write_byte(0x0040, 0x97, led_flags);
       break;
 
     case 0xc6: /* Scroll Lock release */
@@ -5014,50 +5297,55 @@ int09_function(DI, SI, BP, SP, BX, DX, CX, AX)
             machine_reset();
         /* Fall through */
     default:
-      if (scancode & 0x80) return; /* toss key releases ... */
+      if (scancode & 0x80) {
+        break; /* toss key releases ... */
+      }
       if (scancode > MAX_SCAN_CODE) {
-        BX_INFO("KBD: int09h_handler(): unknown scancode (%x) read!\n", scancode);
+        BX_INFO("KBD: int09h_handler(): unknown scancode read: 0x%02x!\n", scancode);
         return;
-        }
+      }
       if (shift_flags & 0x08) { /* ALT */
         asciicode = scan_to_scanascii[scancode].alt;
         scancode = scan_to_scanascii[scancode].alt >> 8;
-        }
-      else if (shift_flags & 0x04) { /* CONTROL */
+      } else if (shift_flags & 0x04) { /* CONTROL */
         asciicode = scan_to_scanascii[scancode].control;
         scancode = scan_to_scanascii[scancode].control >> 8;
-        }
-      else if (shift_flags & 0x03) { /* LSHIFT + RSHIFT */
-        /* check if lock state should be ignored 
+      } else if (((mf2_state & 0x02) > 0) && ((scancode >= 0x47) && (scancode <= 0x53))) {
+        /* extended keys handling */
+        asciicode = 0xe0;
+        scancode = scan_to_scanascii[scancode].normal >> 8;
+      } else if (shift_flags & 0x03) { /* LSHIFT + RSHIFT */
+        /* check if lock state should be ignored
          * because a SHIFT key are pressed */
-         
+
         if (shift_flags & scan_to_scanascii[scancode].lock_flags) {
           asciicode = scan_to_scanascii[scancode].normal;
           scancode = scan_to_scanascii[scancode].normal >> 8;
-          }
-        else {
+        } else {
           asciicode = scan_to_scanascii[scancode].shift;
           scancode = scan_to_scanascii[scancode].shift >> 8;
-          }
         }
-      else {
+      else {
         /* check if lock is on */
         if (shift_flags & scan_to_scanascii[scancode].lock_flags) {
           asciicode = scan_to_scanascii[scancode].shift;
           scancode = scan_to_scanascii[scancode].shift >> 8;
-          }
-        else {
+        } else {
           asciicode = scan_to_scanascii[scancode].normal;
           scancode = scan_to_scanascii[scancode].normal >> 8;
-          }
         }
+      }
       if (scancode==0 && asciicode==0) {
         BX_INFO("KBD: int09h_handler(): scancode & asciicode are zero?\n");
-        }
+      }
       enqueue_key(scancode, asciicode);
       break;
-    }
-  mf2_state &= ~0x01;
+  }
+  if ((scancode & 0x7f) != 0x1d) {
+    mf2_state &= ~0x01;
+  }
+  mf2_state &= ~0x02;
+  write_byte(0x0040, 0x96, mf2_state);
 }
 
   unsigned int
@@ -5066,9 +5354,6 @@ enqueue_key(scan_code, ascii_code)
 {
   Bit16u buffer_start, buffer_end, buffer_head, buffer_tail, temp_tail;
 
-  //BX_INFO("KBD:   enqueue_key() called scan:%02x, ascii:%02x\n",
-  //    scan_code, ascii_code);
-
 #if BX_CPU < 2
   buffer_start = 0x001E;
   buffer_end   = 0x003E;
@@ -5118,9 +5403,8 @@ BX_DEBUG_INT74("int74: read byte %02x\n", in_byte);
   mouse_flags_2 = read_byte(ebda_seg, 0x0027);
 
   if ( (mouse_flags_2 & 0x80) != 0x80 ) {
-      //    BX_PANIC("int74_function:\n");
       return;
-    }
+  }
 
   package_count = mouse_flags_2 & 0x07;
   index = mouse_flags_1 & 0x07;
@@ -5148,10 +5432,10 @@ BX_DEBUG_INT74("int74_function: make_farcall=1\n");
 #if BX_USE_ATADRV
 
   void
-int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
-  Bit16u DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS;
+int13_harddisk(EHAX, DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
+  Bit16u EHAX, DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS;
 {
-  Bit32u lba;
+  Bit32u lba_low, lba_high;
   Bit16u ebda_seg=read_word(0x0040,0x000E);
   Bit16u cylinder, head, sector;
   Bit16u segment, offset;
@@ -5172,12 +5456,12 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
   // Get the ata channel
   device=read_byte(ebda_seg,&EbdaData->ata.hdidmap[GET_ELDL()-0x80]);
 
-  // basic check : device has to be valid 
+  // basic check : device has to be valid
   if (device >= BX_MAX_ATA_DEVICES) {
     BX_INFO("int13_harddisk: function %02x, unmapped device for ELDL=%02x\n", GET_AH(), GET_ELDL());
     goto int13_fail;
     }
-  
+
   switch (GET_AH()) {
 
     case 0x00: /* disk controller reset */
@@ -5195,7 +5479,7 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
       break;
 
     case 0x02: // read disk sectors
-    case 0x03: // write disk sectors 
+    case 0x03: // write disk sectors
     case 0x04: // verify disk sectors
 
       count       = GET_AL();
@@ -5207,10 +5491,10 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
       segment = ES;
       offset  = BX;
 
-      if ( (count > 128) || (count == 0) ) {
-        BX_INFO("int13_harddisk: function %02x, count out of range!\n",GET_AH());
+      if ((count > 128) || (count == 0) || (sector == 0)) {
+        BX_INFO("int13_harddisk: function %02x, parameter out of range!\n",GET_AH());
         goto int13_fail;
-        }
+      }
 
       nlc   = read_word(ebda_seg, &EbdaData->ata.devices[device].lchs.cylinders);
       nlh   = read_word(ebda_seg, &EbdaData->ata.devices[device].lchs.heads);
@@ -5221,7 +5505,7 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
         BX_INFO("int13_harddisk: function %02x, parameters out of range %04x/%04x/%04x!\n", GET_AH(), cylinder, head, sector);
         goto int13_fail;
         }
-      
+
       // FIXME verify
       if ( GET_AH() == 0x04 ) goto int13_success;
 
@@ -5230,14 +5514,15 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
 
       // if needed, translate lchs to lba, and execute command
       if ( (nph != nlh) || (npspt != nlspt)) {
-        lba = ((((Bit32u)cylinder * (Bit32u)nlh) + (Bit32u)head) * (Bit32u)nlspt) + (Bit32u)sector - 1;
+        lba_low = ((((Bit32u)cylinder * (Bit32u)nlh) + (Bit32u)head) * (Bit32u)nlspt) + (Bit32u)sector - 1;
+        lba_high = 0;
         sector = 0; // this forces the command to be lba
         }
 
       if ( GET_AH() == 0x02 )
-        status=ata_cmd_data_in(device, ATA_CMD_READ_SECTORS, count, cylinder, head, sector, lba, segment, offset);
+        status=ata_cmd_data_in(device, ATA_CMD_READ_SECTORS, count, cylinder, head, sector, lba_low, lba_high, segment, offset);
       else
-        status=ata_cmd_data_out(device, ATA_CMD_WRITE_SECTORS, count, cylinder, head, sector, lba, segment, offset);
+        status=ata_cmd_data_out(device, ATA_CMD_WRITE_SECTORS, count, cylinder, head, sector, lba_low, lba_high, segment, offset);
 
       // Set nb of sector transferred
       SET_AL(read_word(ebda_seg, &EbdaData->ata.trsfsectors));
@@ -5258,7 +5543,7 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
       break;
 
     case 0x08: /* read disk drive parameters */
-      
+
       // Get logical geometry from table
       nlc   = read_word(ebda_seg, &EbdaData->ata.devices[device].lchs.cylinders);
       nlh   = read_word(ebda_seg, &EbdaData->ata.devices[device].lchs.heads);
@@ -5273,13 +5558,13 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
       SET_DL(count); /* FIXME returns 0, 1, or n hard drives */
 
       // FIXME should set ES & DI
-      
+
       goto int13_success;
       break;
 
     case 0x10: /* check drive ready */
       // should look at 40:8E also???
-      
+
       // Read the status from controller
       status = inb(read_word(ebda_seg, &EbdaData->ata.channels[device/2].iobase1) + ATA_CB_STAT);
       if ( (status & ( ATA_CB_STAT_BSY | ATA_CB_STAT_RDY )) == ATA_CB_STAT_RDY ) {
@@ -5293,15 +5578,15 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
 
     case 0x15: /* read disk drive size */
 
-      // Get physical geometry from table
-      npc   = read_word(ebda_seg, &EbdaData->ata.devices[device].pchs.cylinders);
-      nph   = read_word(ebda_seg, &EbdaData->ata.devices[device].pchs.heads);
-      npspt = read_word(ebda_seg, &EbdaData->ata.devices[device].pchs.spt);
+      // Get logical geometry from table
+      nlc   = read_word(ebda_seg, &EbdaData->ata.devices[device].lchs.cylinders);
+      nlh   = read_word(ebda_seg, &EbdaData->ata.devices[device].lchs.heads);
+      nlspt = read_word(ebda_seg, &EbdaData->ata.devices[device].lchs.spt);
 
       // Compute sector count seen by int13
-      lba = (Bit32u)(npc - 1) * (Bit32u)nph * (Bit32u)npspt;
-      CX = lba >> 16;
-      DX = lba & 0xffff;
+      lba_low = (Bit32u)(nlc - 1) * (Bit32u)nlh * (Bit32u)nlspt;
+      CX = lba_low >> 16;
+      DX = lba_low & 0xffff;
 
       SET_AH(3);  // hard disk accessible
       goto int13_success_noah;
@@ -5322,17 +5607,18 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
       count=read_word(DS, SI+(Bit16u)&Int13Ext->count);
       segment=read_word(DS, SI+(Bit16u)&Int13Ext->segment);
       offset=read_word(DS, SI+(Bit16u)&Int13Ext->offset);
-      // Can't use 64 bits lba
-      lba=read_dword(DS, SI+(Bit16u)&Int13Ext->lba2);
-      if (lba != 0L) {
-        BX_PANIC("int13_harddisk: function %02x. Can't use 64bits lba\n",GET_AH());
+
+      // Get 32 msb lba and check
+      lba_high=read_dword(DS, SI+(Bit16u)&Int13Ext->lba2);
+      if (lba_high > read_dword(ebda_seg, &EbdaData->ata.devices[device].sectors_high) ) {
+        BX_INFO("int13_harddisk: function %02x. LBA out of range\n",GET_AH());
         goto int13_fail;
         }
 
-      // Get 32 bits lba and check
-      lba=read_dword(DS, SI+(Bit16u)&Int13Ext->lba1);
-      if (lba >= read_dword(ebda_seg, &EbdaData->ata.devices[device].sectors) ) {
+      // Get 32 lsb lba and check
+      lba_low=read_dword(DS, SI+(Bit16u)&Int13Ext->lba1);
+      if (lba_high == read_dword(ebda_seg, &EbdaData->ata.devices[device].sectors_high)
+          && lba_low >= read_dword(ebda_seg, &EbdaData->ata.devices[device].sectors_low) ) {
         BX_INFO("int13_harddisk: function %02x. LBA out of range\n",GET_AH());
         goto int13_fail;
         }
@@ -5340,12 +5626,12 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
       // If verify or seek
       if (( GET_AH() == 0x44 ) || ( GET_AH() == 0x47 ))
         goto int13_success;
-      
+
       // Execute the command
       if ( GET_AH() == 0x42 )
-        status=ata_cmd_data_in(device, ATA_CMD_READ_SECTORS, count, 0, 0, 0, lba, segment, offset);
+        status=ata_cmd_data_in(device, ATA_CMD_READ_SECTORS, count, 0, 0, 0, lba_low, lba_high, segment, offset);
       else
-        status=ata_cmd_data_out(device, ATA_CMD_WRITE_SECTORS, count, 0, 0, 0, lba, segment, offset);
+        status=ata_cmd_data_out(device, ATA_CMD_WRITE_SECTORS, count, 0, 0, 0, lba_low, lba_high, segment, offset);
 
       count=read_word(ebda_seg, &EbdaData->ata.trsfsectors);
       write_word(DS, SI+(Bit16u)&Int13Ext->count, count);
@@ -5363,7 +5649,7 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
     case 0x49: // IBM/MS extended media change
       goto int13_success;    // Always success for HD
       break;
-      
+
     case 0x46: // IBM/MS eject media
       SET_AH(0xb2);          // Volume Not Removable
       goto int13_fail_noah;  // Always fail for HD
@@ -5373,7 +5659,7 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
       size=read_word(DS,SI+(Bit16u)&Int13DPT->size);
 
       // Buffer is too small
-      if(size < 0x1a) 
+      if(size < 0x1a)
         goto int13_fail;
 
       // EDD 1.x
@@ -5383,17 +5669,26 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
         npc     = read_word(ebda_seg, &EbdaData->ata.devices[device].pchs.cylinders);
         nph     = read_word(ebda_seg, &EbdaData->ata.devices[device].pchs.heads);
         npspt   = read_word(ebda_seg, &EbdaData->ata.devices[device].pchs.spt);
-        lba     = read_dword(ebda_seg, &EbdaData->ata.devices[device].sectors);
+        lba_low = read_dword(ebda_seg, &EbdaData->ata.devices[device].sectors_low);
+        lba_high = read_dword(ebda_seg, &EbdaData->ata.devices[device].sectors_high);
         blksize = read_word(ebda_seg, &EbdaData->ata.devices[device].blksize);
 
         write_word(DS, SI+(Bit16u)&Int13DPT->size, 0x1a);
-        write_word(DS, SI+(Bit16u)&Int13DPT->infos, 0x02); // geometry is valid
-        write_dword(DS, SI+(Bit16u)&Int13DPT->cylinders, (Bit32u)npc);
+        if (lba_high || (lba_low/npspt)/nph > 0x3fff)
+        {
+          write_word(DS, SI+(Bit16u)&Int13DPT->infos, 0x00); // geometry is invalid
+          write_dword(DS, SI+(Bit16u)&Int13DPT->cylinders, 0x3fff);
+        }
+        else
+        {
+          write_word(DS, SI+(Bit16u)&Int13DPT->infos, 0x02); // geometry is valid
+          write_dword(DS, SI+(Bit16u)&Int13DPT->cylinders, (Bit32u)npc);
+        }
         write_dword(DS, SI+(Bit16u)&Int13DPT->heads, (Bit32u)nph);
         write_dword(DS, SI+(Bit16u)&Int13DPT->spt, (Bit32u)npspt);
-        write_dword(DS, SI+(Bit16u)&Int13DPT->sector_count1, lba);  // FIXME should be Bit64
-        write_dword(DS, SI+(Bit16u)&Int13DPT->sector_count2, 0L);  
-        write_word(DS, SI+(Bit16u)&Int13DPT->blksize, blksize);  
+        write_dword(DS, SI+(Bit16u)&Int13DPT->sector_count1, lba_low);
+        write_dword(DS, SI+(Bit16u)&Int13DPT->sector_count2, lba_high);
+        write_word(DS, SI+(Bit16u)&Int13DPT->blksize, blksize);
         }
 
       // EDD 2.x
@@ -5403,8 +5698,8 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
 
         write_word(DS, SI+(Bit16u)&Int13DPT->size, 0x1e);
 
-        write_word(DS, SI+(Bit16u)&Int13DPT->dpte_segment, ebda_seg);  
-        write_word(DS, SI+(Bit16u)&Int13DPT->dpte_offset, &EbdaData->ata.dpte);  
+        write_word(DS, SI+(Bit16u)&Int13DPT->dpte_segment, ebda_seg);
+        write_word(DS, SI+(Bit16u)&Int13DPT->dpte_offset, &EbdaData->ata.dpte);
 
         // Fill in dpte
         channel = device / 2;
@@ -5414,14 +5709,14 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
         mode = read_byte(ebda_seg, &EbdaData->ata.devices[device].mode);
         translation = read_byte(ebda_seg, &EbdaData->ata.devices[device].translation);
 
-        options  = (translation==ATA_TRANSLATION_NONE?0:1<<3); // chs translation
+        options  = (translation==ATA_TRANSLATION_NONE?0:1)<<3; // chs translation
         options |= (1<<4); // lba translation
-        options |= (mode==ATA_MODE_PIO32?1:0<<7);
-        options |= (translation==ATA_TRANSLATION_LBA?1:0<<9); 
-        options |= (translation==ATA_TRANSLATION_RECHS?3:0<<9); 
+        options |= (mode==ATA_MODE_PIO32?1:0)<<7;
+        options |= (translation==ATA_TRANSLATION_LBA?1:0)<<9;
+        options |= (translation==ATA_TRANSLATION_RECHS?3:0)<<9;
 
         write_word(ebda_seg, &EbdaData->ata.dpte.iobase1, iobase1);
-        write_word(ebda_seg, &EbdaData->ata.dpte.iobase2, iobase2);
+        write_word(ebda_seg, &EbdaData->ata.dpte.iobase2, iobase2 + ATA_CB_DC);
         write_byte(ebda_seg, &EbdaData->ata.dpte.prefix, (0xe | (device % 2))<<4 );
         write_byte(ebda_seg, &EbdaData->ata.dpte.unused, 0xcb );
         write_byte(ebda_seg, &EbdaData->ata.dpte.irq, irq );
@@ -5430,10 +5725,13 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
         write_byte(ebda_seg, &EbdaData->ata.dpte.pio, 0 );
         write_word(ebda_seg, &EbdaData->ata.dpte.options, options);
         write_word(ebda_seg, &EbdaData->ata.dpte.reserved, 0);
-        write_byte(ebda_seg, &EbdaData->ata.dpte.revision, 0x11);
+        if (size >=0x42)
+          write_byte(ebda_seg, &EbdaData->ata.dpte.revision, 0x11);
+        else
+          write_byte(ebda_seg, &EbdaData->ata.dpte.revision, 0x10);
+
         checksum=0;
-        for (i=0; i<15; i++) checksum+=read_byte(ebda_seg, (&EbdaData->ata.dpte) + i);
+        for (i=0; i<15; i++) checksum+=read_byte(ebda_seg, ((Bit8u*)(&EbdaData->ata.dpte)) + i);
         checksum = ~checksum;
         write_byte(ebda_seg, &EbdaData->ata.dpte.checksum, checksum);
         }
@@ -5459,7 +5757,7 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
           write_byte(DS, SI+(Bit16u)&Int13DPT->host_bus[2], 'A');
           write_byte(DS, SI+(Bit16u)&Int13DPT->host_bus[3], 0);
           }
-        else { 
+        else {
           // FIXME PCI
           }
         write_byte(DS, SI+(Bit16u)&Int13DPT->iface_type[0], 'A');
@@ -5472,7 +5770,7 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
           write_word(DS, SI+(Bit16u)&Int13DPT->iface_path[2], 0);
           write_dword(DS, SI+(Bit16u)&Int13DPT->iface_path[4], 0L);
           }
-        else { 
+        else {
           // FIXME PCI
           }
         write_byte(DS, SI+(Bit16u)&Int13DPT->device_path[0], device%2);
@@ -5508,7 +5806,7 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
     case 0x0d: /* alternate disk reset */
     case 0x11: /* recalibrate */
     case 0x14: /* controller internal diagnostic */
-      BX_INFO("int13h_harddisk function %02xh unimplemented, returns success\n", GET_AH());
+      BX_INFO("int13_harddisk: function %02xh unimplemented, returns success\n", GET_AH());
       goto int13_success;
       break;
 
@@ -5517,7 +5815,7 @@ int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
     case 0x18: // set media type for format
     case 0x50: // IBM/MS send packet command
     default:
-      BX_INFO("int13_harddisk function %02xh unsupported, returns fail\n", GET_AH());
+      BX_INFO("int13_harddisk: function %02xh unsupported, returns fail\n", GET_AH());
       goto int13_fail;
       break;
     }
@@ -5553,8 +5851,7 @@ int13_cdrom(EHBX, DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
   Bit16u count, segment, offset, i, size;
 
   BX_DEBUG_INT13_CD("int13_cdrom: AX=%04x BX=%04x CX=%04x DX=%04x ES=%04x\n", AX, BX, CX, DX, ES);
-  // BX_DEBUG_INT13_CD("int13_cdrom: SS=%04x DS=%04x ES=%04x DI=%04x SI=%04x\n",get_SS(), DS, ES, DI, SI);
-  
+
   SET_DISK_RET_STATUS(0x00);
 
   /* basic check : device should be 0xE0+ */
@@ -5571,16 +5868,16 @@ int13_cdrom(EHBX, DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
     BX_INFO("int13_cdrom: function %02x, unmapped device for ELDL=%02x\n", GET_AH(), GET_ELDL());
     goto int13_fail;
     }
-  
+
   switch (GET_AH()) {
 
     // all those functions return SUCCESS
     case 0x00: /* disk controller reset */
     case 0x09: /* initialize drive parameters */
     case 0x0c: /* seek to specified cylinder */
-    case 0x0d: /* alternate disk reset */  
-    case 0x10: /* check drive ready */    
-    case 0x11: /* recalibrate */      
+    case 0x0d: /* alternate disk reset */
+    case 0x10: /* check drive ready */
+    case 0x11: /* recalibrate */
     case 0x14: /* controller internal diagnostic */
     case 0x16: /* detect disk change */
       goto int13_success;
@@ -5602,7 +5899,7 @@ int13_cdrom(EHBX, DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
       /* set CF if error status read */
       if (status) goto int13_fail_nostatus;
       else        goto int13_success_noah;
-      break;      
+      break;
 
     case 0x15: /* read disk drive size */
       SET_AH(0x02);
@@ -5619,11 +5916,11 @@ int13_cdrom(EHBX, DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
     case 0x42: // IBM/MS extended read
     case 0x44: // IBM/MS verify sectors
     case 0x47: // IBM/MS extended seek
-       
+
       count=read_word(DS, SI+(Bit16u)&Int13Ext->count);
       segment=read_word(DS, SI+(Bit16u)&Int13Ext->segment);
       offset=read_word(DS, SI+(Bit16u)&Int13Ext->offset);
+
       // Can't use 64 bits lba
       lba=read_dword(DS, SI+(Bit16u)&Int13Ext->lba2);
       if (lba != 0L) {
@@ -5631,13 +5928,13 @@ int13_cdrom(EHBX, DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
         goto int13_fail;
         }
 
-      // Get 32 bits lba 
+      // Get 32 bits lba
       lba=read_dword(DS, SI+(Bit16u)&Int13Ext->lba1);
 
       // If verify or seek
       if (( GET_AH() == 0x44 ) || ( GET_AH() == 0x47 ))
         goto int13_success;
-      
+
       memsetb(get_SS(),atacmd,0,12);
       atacmd[0]=0x28;                      // READ command
       atacmd[7]=(count & 0xff00) >> 8;     // Sectors
@@ -5646,7 +5943,7 @@ int13_cdrom(EHBX, DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
       atacmd[3]=(lba & 0x00ff0000) >> 16;
       atacmd[4]=(lba & 0x0000ff00) >> 8;
       atacmd[5]=(lba & 0x000000ff);
-      status = ata_cmd_packet(device, 12, get_SS(), atacmd, 0, count*2048L, ATA_DATA_IN, segment,offset); 
+      status = ata_cmd_packet(device, 12, get_SS(), atacmd, 0, count*2048L, ATA_DATA_IN, segment,offset);
 
       count = (Bit16u)(read_dword(ebda_seg, &EbdaData->ata.trsfbytes) >> 11);
       write_word(DS, SI+(Bit16u)&Int13Ext->count, count);
@@ -5693,21 +5990,21 @@ int13_cdrom(EHBX, DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
 
     case 0x46: // IBM/MS eject media
       locks = read_byte(ebda_seg, &EbdaData->ata.devices[device].lock);
-      
+
       if (locks != 0) {
         SET_AH(0xb1); // media locked
         goto int13_fail_noah;
         }
       // FIXME should handle 0x31 no media in device
       // FIXME should handle 0xb5 valid request failed
-    
+
       // Call removable media eject
       ASM_START
         push bp
         mov  bp, sp
 
         mov ah, #0x52
-        int 15
+        int #0x15
         mov _int13_cdrom.status + 2[bp], ah
         jnc int13_cdrom_rme_end
         mov _int13_cdrom.status, #1
@@ -5727,7 +6024,7 @@ int13_cdrom_rme_end:
       size = read_word(DS,SI+(Bit16u)&Int13Ext->size);
 
       // Buffer is too small
-      if(size < 0x1a) 
+      if(size < 0x1a)
         goto int13_fail;
 
       // EDD 1.x
@@ -5742,8 +6039,8 @@ int13_cdrom_rme_end:
         write_dword(DS, SI+(Bit16u)&Int13DPT->heads, 0xffffffff);
         write_dword(DS, SI+(Bit16u)&Int13DPT->spt, 0xffffffff);
         write_dword(DS, SI+(Bit16u)&Int13DPT->sector_count1, 0xffffffff);  // FIXME should be Bit64
-        write_dword(DS, SI+(Bit16u)&Int13DPT->sector_count2, 0xffffffff);  
-        write_word(DS, SI+(Bit16u)&Int13DPT->blksize, blksize);  
+        write_dword(DS, SI+(Bit16u)&Int13DPT->sector_count2, 0xffffffff);
+        write_word(DS, SI+(Bit16u)&Int13DPT->blksize, blksize);
         }
 
       // EDD 2.x
@@ -5753,8 +6050,8 @@ int13_cdrom_rme_end:
 
         write_word(DS, SI+(Bit16u)&Int13DPT->size, 0x1e);
 
-        write_word(DS, SI+(Bit16u)&Int13DPT->dpte_segment, ebda_seg);  
-        write_word(DS, SI+(Bit16u)&Int13DPT->dpte_offset, &EbdaData->ata.dpte);  
+        write_word(DS, SI+(Bit16u)&Int13DPT->dpte_segment, ebda_seg);
+        write_word(DS, SI+(Bit16u)&Int13DPT->dpte_offset, &EbdaData->ata.dpte);
 
         // Fill in dpte
         channel = device / 2;
@@ -5770,7 +6067,7 @@ int13_cdrom_rme_end:
         options |= (mode==ATA_MODE_PIO32?1:0<<7);
 
         write_word(ebda_seg, &EbdaData->ata.dpte.iobase1, iobase1);
-        write_word(ebda_seg, &EbdaData->ata.dpte.iobase2, iobase2);
+        write_word(ebda_seg, &EbdaData->ata.dpte.iobase2, iobase2 + ATA_CB_DC);
         write_byte(ebda_seg, &EbdaData->ata.dpte.prefix, (0xe | (device % 2))<<4 );
         write_byte(ebda_seg, &EbdaData->ata.dpte.unused, 0xcb );
         write_byte(ebda_seg, &EbdaData->ata.dpte.irq, irq );
@@ -5782,7 +6079,7 @@ int13_cdrom_rme_end:
         write_byte(ebda_seg, &EbdaData->ata.dpte.revision, 0x11);
 
         checksum=0;
-        for (i=0; i<15; i++) checksum+=read_byte(ebda_seg, (&EbdaData->ata.dpte) + i);
+        for (i=0; i<15; i++) checksum+=read_byte(ebda_seg, ((Bit8u*)(&EbdaData->ata.dpte)) + i);
         checksum = ~checksum;
         write_byte(ebda_seg, &EbdaData->ata.dpte.checksum, checksum);
         }
@@ -5808,7 +6105,7 @@ int13_cdrom_rme_end:
           write_byte(DS, SI+(Bit16u)&Int13DPT->host_bus[2], 'A');
           write_byte(DS, SI+(Bit16u)&Int13DPT->host_bus[3], 0);
           }
-        else { 
+        else {
           // FIXME PCI
           }
         write_byte(DS, SI+(Bit16u)&Int13DPT->iface_type[0], 'A');
@@ -5821,7 +6118,7 @@ int13_cdrom_rme_end:
           write_word(DS, SI+(Bit16u)&Int13DPT->iface_path[2], 0);
           write_dword(DS, SI+(Bit16u)&Int13DPT->iface_path[4], 0L);
           }
-        else { 
+        else {
           // FIXME PCI
           }
         write_byte(DS, SI+(Bit16u)&Int13DPT->device_path[0], device%2);
@@ -5843,7 +6140,7 @@ int13_cdrom_rme_end:
       SET_AH(06);
       goto int13_fail_nostatus;
       break;
-      
+
     case 0x4e: // // IBM/MS set hardware configuration
       // DMA, prefetch, PIO maximum not supported
       switch (GET_AL()) {
@@ -5905,7 +6202,7 @@ int13_eltorito(DS, ES, DI, SI, BP, SP, BX, DX, CX, AX, IP, CS, FLAGS)
 
   BX_DEBUG_INT13_ET("int13_eltorito: AX=%04x BX=%04x CX=%04x DX=%04x ES=%04x\n", AX, BX, CX, DX, ES);
   // BX_DEBUG_INT13_ET("int13_eltorito: SS=%04x DS=%04x ES=%04x DI=%04x SI=%04x\n",get_SS(), DS, ES, DI, SI);
-  
+
   switch (GET_AH()) {
 
     // FIXME ElTorito Various. Should be implemented
@@ -5980,11 +6277,10 @@ int13_cdemu(DS, ES, DI, SI, BP, SP, BX, DX, CX, AX, IP, CS, FLAGS)
   Bit8u  atacmd[12];
 
   BX_DEBUG_INT13_ET("int13_cdemu: AX=%04x BX=%04x CX=%04x DX=%04x ES=%04x\n", AX, BX, CX, DX, ES);
-  //BX_DEBUG_INT13_ET("int13_cdemu: SS=%04x ES=%04x DI=%04x SI=%04x\n", get_SS(), ES, DI, SI);
-  
+
   /* at this point, we are emulating a floppy/harddisk */
-  
-  // Recompute the device number 
+
+  // Recompute the device number
   device  = read_byte(ebda_seg,&EbdaData->cdemu.controller_index) * 2;
   device += read_byte(ebda_seg,&EbdaData->cdemu.device_spec);
 
@@ -5997,7 +6293,6 @@ int13_cdemu(DS, ES, DI, SI, BP, SP, BX, DX, CX, AX, IP, CS, FLAGS)
     goto int13_fail;
     }
 
-  
   switch (GET_AH()) {
 
     // all those functions return SUCCESS
@@ -6006,7 +6301,7 @@ int13_cdemu(DS, ES, DI, SI, BP, SP, BX, DX, CX, AX, IP, CS, FLAGS)
     case 0x0c: /* seek to specified cylinder */
     case 0x0d: /* alternate disk reset */  // FIXME ElTorito Various. should really reset ?
     case 0x10: /* check drive ready */     // FIXME ElTorito Various. should check if ready ?
-    case 0x11: /* recalibrate */      
+    case 0x11: /* recalibrate */
     case 0x14: /* controller internal diagnostic */
     case 0x16: /* detect disk change */
       goto int13_success;
@@ -6031,9 +6326,9 @@ int13_cdemu(DS, ES, DI, SI, BP, SP, BX, DX, CX, AX, IP, CS, FLAGS)
 
     case 0x02: // read disk sectors
     case 0x04: // verify disk sectors
-      vspt       = read_word(ebda_seg,&EbdaData->cdemu.vdevice.spt); 
-      vcylinders = read_word(ebda_seg,&EbdaData->cdemu.vdevice.cylinders); 
-      vheads     = read_word(ebda_seg,&EbdaData->cdemu.vdevice.heads); 
+      vspt       = read_word(ebda_seg,&EbdaData->cdemu.vdevice.spt);
+      vcylinders = read_word(ebda_seg,&EbdaData->cdemu.vdevice.cylinders);
+      vheads     = read_word(ebda_seg,&EbdaData->cdemu.vdevice.heads);
 
       ilba       = read_dword(ebda_seg,&EbdaData->cdemu.ilba);
 
@@ -6062,17 +6357,17 @@ int13_cdemu(DS, ES, DI, SI, BP, SP, BX, DX, CX, AX, IP, CS, FLAGS)
 
       // calculate the virtual lba inside the image
       vlba=((((Bit32u)cylinder*(Bit32u)vheads)+(Bit32u)head)*(Bit32u)vspt)+((Bit32u)(sector-1));
+
       // In advance so we don't loose the count
       SET_AL(nbsectors);
 
       // start lba on cd
-      slba  = (Bit32u)vlba/4; 
+      slba  = (Bit32u)vlba/4;
       before= (Bit16u)vlba%4;
 
       // end lba on cd
       elba = (Bit32u)(vlba+nbsectors-1)/4;
-      
+
       memsetb(get_SS(),atacmd,0,12);
       atacmd[0]=0x28;                      // READ command
       atacmd[7]=((Bit16u)(elba-slba+1) & 0xff00) >> 8; // Sectors
@@ -6092,10 +6387,10 @@ int13_cdemu(DS, ES, DI, SI, BP, SP, BX, DX, CX, AX, IP, CS, FLAGS)
       break;
 
     case 0x08: /* read disk drive parameters */
-      vspt=read_word(ebda_seg,&EbdaData->cdemu.vdevice.spt); 
-      vcylinders=read_word(ebda_seg,&EbdaData->cdemu.vdevice.cylinders) - 1; 
-      vheads=read_word(ebda_seg,&EbdaData->cdemu.vdevice.heads) - 1; 
+      vspt=read_word(ebda_seg,&EbdaData->cdemu.vdevice.spt);
+      vcylinders=read_word(ebda_seg,&EbdaData->cdemu.vdevice.cylinders) - 1;
+      vheads=read_word(ebda_seg,&EbdaData->cdemu.vdevice.heads) - 1;
+
       SET_AL( 0x00 );
       SET_BL( 0x00 );
       SET_CH( vcylinders & 0xff );
@@ -6103,7 +6398,7 @@ int13_cdemu(DS, ES, DI, SI, BP, SP, BX, DX, CX, AX, IP, CS, FLAGS)
       SET_DH( vheads );
       SET_DL( 0x02 );   // FIXME ElTorito Various. should send the real count of drives 1 or 2
                         // FIXME ElTorito Harddisk. should send the HD count
+
       switch(read_byte(ebda_seg,&EbdaData->cdemu.media)) {
         case 0x01: SET_BL( 0x02 ); break;
         case 0x02: SET_BL( 0x04 ); break;
@@ -6139,7 +6434,7 @@ ASM_END
     case 0x45: // IBM/MS lock/unlock drive
     case 0x46: // IBM/MS eject media
     case 0x47: // IBM/MS extended seek
-    case 0x48: // IBM/MS get drive parameters 
+    case 0x48: // IBM/MS get drive parameters
     case 0x49: // IBM/MS extended media change
     case 0x4e: // ? - set hardware configuration
     case 0x50: // ? - send packet command
@@ -6227,8 +6522,8 @@ ASM_END
 }
 
   void
-int13_harddisk(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
-  Bit16u DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS;
+int13_harddisk(EHAX, DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
+  Bit16u EHAX, DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS;
 {
   Bit8u    drive, num_sectors, sector, head, status, mod;
   Bit8u    drive_map;
@@ -6334,7 +6629,7 @@ BX_DEBUG_INT13_HD("int13_f01\n");
         }
 
       if ( (num_sectors > 128) || (num_sectors == 0) )
-        BX_PANIC("int13_harddisk(): num_sectors out of range!\n");
+        BX_PANIC("int13_harddisk: num_sectors out of range!\n");
 
       if (head > 15)
         BX_PANIC("hard drive BIOS:(read/verify) head > 15\n");
@@ -6480,7 +6775,7 @@ BX_DEBUG_INT13_HD("int13_f03\n");
         }
 
       if ( (num_sectors > 128) || (num_sectors == 0) )
-        BX_PANIC("int13_harddisk(): num_sectors out of range!\n");
+        BX_PANIC("int13_harddisk: num_sectors out of range!\n");
 
       if (head > 15)
         BX_PANIC("hard drive BIOS:(read) head > 15\n");
@@ -6590,7 +6885,7 @@ BX_DEBUG_INT13_HD("int13_f05\n");
 
     case 0x08: /* read disk drive parameters */
 BX_DEBUG_INT13_HD("int13_f08\n");
-      
+
       drive = GET_ELDL ();
       get_hd_geometry(drive, &hd_cylinders, &hd_heads, &hd_sectors);
 
@@ -6730,10 +7025,10 @@ ASM_END
       break;
 
     case 0x18: // set media type for format
-    case 0x41: // IBM/MS 
-    case 0x42: // IBM/MS 
-    case 0x43: // IBM/MS 
-    case 0x44: // IBM/MS 
+    case 0x41: // IBM/MS
+    case 0x42: // IBM/MS
+    case 0x43: // IBM/MS
+    case 0x44: // IBM/MS
     case 0x45: // IBM/MS lock/unlock drive
     case 0x46: // IBM/MS eject media
     case 0x47: // IBM/MS extended seek
@@ -6778,7 +7073,7 @@ get_hd_geometry(drive, hd_cylinders, hd_heads, hd_sectors)
     hd_type = inb_cmos(0x12) & 0x0f;
     if (hd_type != 0x0f)
       BX_INFO(panic_msg_reg12h,1);
-    hd_type = inb_cmos(0x1a); // HD0: extended type
+    hd_type = inb_cmos(0x1a); // HD1: extended type
     if (hd_type != 47)
       BX_INFO(panic_msg_reg19h,0,0x1a);
     iobase = 0x24;
@@ -6797,11 +7092,72 @@ get_hd_geometry(drive, hd_cylinders, hd_heads, hd_sectors)
 
 #endif //else BX_USE_ATADRV
 
+#if BX_SUPPORT_FLOPPY
 
 //////////////////////
 // FLOPPY functions //
 //////////////////////
 
+void floppy_reset_controller()
+{
+  Bit8u val8;
+
+  // Reset controller
+  val8 = inb(0x03f2);
+  outb(0x03f2, val8 & ~0x04);
+  outb(0x03f2, val8 | 0x04);
+
+  // Wait for controller to come out of reset
+  do {
+    val8 = inb(0x3f4);
+  } while ( (val8 & 0xc0) != 0x80 );
+}
+
+void floppy_prepare_controller(drive)
+  Bit16u drive;
+{
+  Bit8u  val8, dor, prev_reset;
+
+  // set 40:3e bit 7 to 0
+  val8 = read_byte(0x0040, 0x003e);
+  val8 &= 0x7f;
+  write_byte(0x0040, 0x003e, val8);
+
+  // turn on motor of selected drive, DMA & int enabled, normal operation
+  prev_reset = inb(0x03f2) & 0x04;
+  if (drive)
+    dor = 0x20;
+  else
+    dor = 0x10;
+  dor |= 0x0c;
+  dor |= drive;
+  outb(0x03f2, dor);
+
+  // reset the disk motor timeout value of INT 08
+  write_byte(0x40,0x40, BX_FLOPPY_ON_CNT);
+
+  // wait for drive readiness
+  do {
+    val8 = inb(0x3f4);
+  } while ( (val8 & 0xc0) != 0x80 );
+
+  if (prev_reset == 0) {
+    // turn on interrupts
+ASM_START
+    sti
+ASM_END
+    // wait on 40:3e bit 7 to become 1
+    do {
+      val8 = read_byte(0x0040, 0x003e);
+    } while ( (val8 & 0x80) == 0 );
+    val8 &= 0x7f;
+ASM_START
+    cli
+ASM_END
+    write_byte(0x0040, 0x003e, val8);
+  }
+}
+
   bx_bool
 floppy_media_known(drive)
   Bit16u drive;
@@ -6908,7 +7264,7 @@ floppy_media_sense(drive)
     retval = 1;
     }
   //
-  // Extended floppy size uses special cmos setting 
+  // Extended floppy size uses special cmos setting
   else if ( drive_type == 6 ) {
     // 160k 5.25" drive
     config_data = 0x00; // 0000 0000
@@ -6949,63 +7305,41 @@ floppy_media_sense(drive)
 floppy_drive_recal(drive)
   Bit16u drive;
 {
-  Bit8u  val8, dor;
+  Bit8u  val8;
   Bit16u curr_cyl_offset;
 
-  // set 40:3e bit 7 to 0
-  val8 = read_byte(0x0000, 0x043e);
-  val8 &= 0x7f;
-  write_byte(0x0000, 0x043e, val8);
-
-  // turn on motor of selected drive, DMA & int enabled, normal operation
-  if (drive)
-    dor = 0x20;
-  else
-    dor = 0x10;
-  dor |= 0x0c;
-  dor |= drive;
-  outb(0x03f2, dor);
-
-  // reset the disk motor timeout value of INT 08
-  write_byte(0x40,0x40, BX_FLOPPY_ON_CNT);
-
-  // check port 3f4 for drive readiness
-  val8 = inb(0x3f4);
-  if ( (val8 & 0xf0) != 0x80 )
-    BX_PANIC("floppy recal:f07: ctrl not ready\n");
+  floppy_prepare_controller(drive);
 
   // send Recalibrate command (2 bytes) to controller
   outb(0x03f5, 0x07);  // 07: Recalibrate
   outb(0x03f5, drive); // 0=drive0, 1=drive1
 
- // turn on interrupts
 // turn on interrupts
 ASM_START
   sti
 ASM_END
 
   // wait on 40:3e bit 7 to become 1
-  val8 = (read_byte(0x0000, 0x043e) & 0x80);
-  while ( val8 == 0 ) {
-    val8 = (read_byte(0x0000, 0x043e) & 0x80);
-    }
+  do {
+    val8 = (read_byte(0x0040, 0x003e) & 0x80);
+  } while ( val8 == 0 );
 
- val8 = 0; // separate asm from while() loop
- // turn off interrupts
 val8 = 0; // separate asm from while() loop
 // turn off interrupts
 ASM_START
   cli
 ASM_END
 
   // set 40:3e bit 7 to 0, and calibrated bit
-  val8 = read_byte(0x0000, 0x043e);
+  val8 = read_byte(0x0040, 0x003e);
   val8 &= 0x7f;
   if (drive) {
     val8 |= 0x02; // Drive 1 calibrated
     curr_cyl_offset = 0x0095;
-    }
-  else {
+  } else {
     val8 |= 0x01; // Drive 0 calibrated
     curr_cyl_offset = 0x0094;
-    }
+  }
   write_byte(0x0040, 0x003e, val8);
   write_byte(0x0040, curr_cyl_offset, 0); // current cylinder is 0
 
@@ -7032,7 +7366,6 @@ floppy_drive_exists(drive)
     return(1);
 }
 
-#if BX_SUPPORT_FLOPPY
   void
 int13_diskette_function(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
   Bit16u DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS;
@@ -7045,7 +7378,6 @@ int13_diskette_function(DS, ES, DI, SI, BP, ELDX, BX, DX, CX, AX, IP, CS, FLAGS)
   Bit16u es, last_addr;
 
   BX_DEBUG_INT13_FL("int13_diskette: AX=%04x BX=%04x CX=%04x DX=%04x ES=%04x\n", AX, BX, CX, DX, ES);
-  // BX_DEBUG_INT13_FL("int13_diskette: SS=%04x DS=%04x ES=%04x DI=%04x SI=%04x\n",get_SS(), get_DS(), ES, DI, SI);
 
   ah = GET_AH();
 
@@ -7058,7 +7390,7 @@ BX_DEBUG_INT13_FL("floppy f00\n");
         set_diskette_ret_status(1);
         SET_CF();
         return;
-        }
+      }
       drive_type = inb_cmos(0x10);
 
       if (drive == 0)
@@ -7070,7 +7402,7 @@ BX_DEBUG_INT13_FL("floppy f00\n");
         set_diskette_ret_status(0x80);
         SET_CF();
         return;
-        }
+      }
       SET_AH(0);
       set_diskette_ret_status(0);
       CLEAR_CF(); // successful
@@ -7083,7 +7415,7 @@ BX_DEBUG_INT13_FL("floppy f00\n");
       SET_AH(val8);
       if (val8) {
         SET_CF();
-        }
+      }
       return;
 
     case 0x02: // Read Diskette Sectors
@@ -7095,15 +7427,15 @@ BX_DEBUG_INT13_FL("floppy f00\n");
       head        = GET_DH();
       drive       = GET_ELDL();
 
-      if ( (drive > 1) || (head > 1) ||
-           (num_sectors == 0) || (num_sectors > 72) ) {
-BX_INFO("floppy: drive>1 || head>1 ...\n");
+      if ((drive > 1) || (head > 1) || (sector == 0) ||
+          (num_sectors == 0) || (num_sectors > 72)) {
+        BX_INFO("int13_diskette: read/write/verify: parameter out of range\n");
         SET_AH(1);
         set_diskette_ret_status(1);
         SET_AL(0); // no sectors read
         SET_CF(); // error occurred
         return;
-        }
+      }
 
       // see if drive exists
       if (floppy_drive_exists(drive) == 0) {
@@ -7112,7 +7444,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n");
         SET_AL(0); // no sectors read
         SET_CF(); // error occurred
         return;
-        }
+      }
 
       // see if media in drive, and type is known
       if (floppy_media_known(drive) == 0) {
@@ -7122,8 +7454,8 @@ BX_INFO("floppy: drive>1 || head>1 ...\n");
           SET_AL(0); // no sectors read
           SET_CF(); // error occurred
           return;
-          }
         }
+      }
 
       if (ah == 0x02) {
         // Read Diskette Sectors
@@ -7142,7 +7474,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n");
         if ( base_address < base_es ) {
           // in case of carry, adjust page by 1
           page++;
-          }
+        }
         base_count = (num_sectors * 512) - 1;
 
         // check for 64K boundary overrun
@@ -7153,7 +7485,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n");
           SET_AL(0); // no sectors read
           SET_CF(); // error occurred
           return;
-          }
+        }
 
         BX_DEBUG_INT13_FL("masking DMA-1 c2\n");
         outb(0x000a, 0x06);
@@ -7186,28 +7518,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n");
         //--------------------------------------
         // set up floppy controller for transfer
         //--------------------------------------
-
-        // set 40:3e bit 7 to 0
-        val8 = read_byte(0x0000, 0x043e);
-        val8 &= 0x7f;
-        write_byte(0x0000, 0x043e, val8);
-
-        // turn on motor of selected drive, DMA & int enabled, normal operation
-        if (drive)
-          dor = 0x20;
-        else
-          dor = 0x10;
-        dor |= 0x0c;
-        dor |= drive;
-        outb(0x03f2, dor);
-
-        // reset the disk motor timeout value of INT 08
-        write_byte(0x40,0x40, BX_FLOPPY_ON_CNT);
-
-        // check port 3f4 for drive readiness
-        val8 = inb(0x3f4);
-        if ( (val8 & 0xf0) != 0x80 )
-          BX_PANIC("int13_diskette:f02: ctrl not ready\n");
+        floppy_prepare_controller(drive);
 
         // send read-normal-data command (9 bytes) to controller
         outb(0x03f5, 0xe6); // e6: read normal data
@@ -7216,31 +7527,39 @@ BX_INFO("floppy: drive>1 || head>1 ...\n");
         outb(0x03f5, head);
         outb(0x03f5, sector);
         outb(0x03f5, 2); // 512 byte sector size
-        outb(0x03f5, 0); // last sector number possible on track
+        outb(0x03f5, sector + num_sectors - 1); // last sector to read on track
         outb(0x03f5, 0); // Gap length
         outb(0x03f5, 0xff); // Gap length
 
-       // turn on interrupts
+        // turn on interrupts
   ASM_START
         sti
   ASM_END
 
         // wait on 40:3e bit 7 to become 1
-        val8 = (read_byte(0x0000, 0x043e) & 0x80);
-        while ( val8 == 0 ) {
-          val8 = (read_byte(0x0000, 0x043e) & 0x80);
+        do {
+          val8 = read_byte(0x0040, 0x0040);
+          if (val8 == 0) {
+            floppy_reset_controller();
+            SET_AH(0x80); // drive not ready (timeout)
+            set_diskette_ret_status(0x80);
+            SET_AL(0); // no sectors read
+            SET_CF(); // error occurred
+            return;
           }
+          val8 = (read_byte(0x0040, 0x003e) & 0x80);
+        } while ( val8 == 0 );
 
-       val8 = 0; // separate asm from while() loop
-       // turn off interrupts
+        val8 = 0; // separate asm from while() loop
+        // turn off interrupts
   ASM_START
         cli
   ASM_END
 
         // set 40:3e bit 7 to 0
-        val8 = read_byte(0x0000, 0x043e);
+        val8 = read_byte(0x0040, 0x003e);
         val8 &= 0x7f;
-        write_byte(0x0000, 0x043e, val8);
+        write_byte(0x0040, 0x003e, val8);
 
         // check port 3f4 for accessibility to status bytes
         val8 = inb(0x3f4);
@@ -7271,7 +7590,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n");
           SET_AL(0); // no sectors read
           SET_CF(); // error occurred
           return;
-          }
+        }
 
         // ??? should track be new val from return_status[3] ?
         set_diskette_current_cyl(drive, track);
@@ -7279,8 +7598,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n");
         SET_AH(0x00); // success
         CLEAR_CF();   // success
         return;
-        }
-      else if (ah == 0x03) {
+      } else if (ah == 0x03) {
         // Write Diskette Sectors
 
         //-----------------------------------
@@ -7297,7 +7615,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n");
         if ( base_address < base_es ) {
           // in case of carry, adjust page by 1
           page++;
-          }
+        }
         base_count = (num_sectors * 512) - 1;
 
         // check for 64K boundary overrun
@@ -7308,7 +7626,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n");
           SET_AL(0); // no sectors read
           SET_CF(); // error occurred
           return;
-          }
+        }
 
         BX_DEBUG_INT13_FL("masking DMA-1 c2\n");
         outb(0x000a, 0x06);
@@ -7334,61 +7652,48 @@ BX_INFO("floppy: drive>1 || head>1 ...\n");
         //--------------------------------------
         // set up floppy controller for transfer
         //--------------------------------------
+        floppy_prepare_controller(drive);
 
-        // set 40:3e bit 7 to 0
-        val8 = read_byte(0x0000, 0x043e);
-        val8 &= 0x7f;
-        write_byte(0x0000, 0x043e, val8);
-
-        // turn on motor of selected drive, DMA & int enabled, normal operation
-        if (drive)
-          dor = 0x20;
-        else
-          dor = 0x10;
-        dor |= 0x0c;
-        dor |= drive;
-        outb(0x03f2, dor);
-
-        // reset the disk motor timeout value of INT 08
-        write_byte(0x40,0x40, BX_FLOPPY_ON_CNT);
-
-        // check port 3f4 for drive readiness
-        val8 = inb(0x3f4);
-        if ( (val8 & 0xf0) != 0x80 )
-          BX_PANIC("int13_diskette:f03: ctrl not ready\n");
-
-        // send read-normal-data command (9 bytes) to controller
+        // send write-normal-data command (9 bytes) to controller
         outb(0x03f5, 0xc5); // c5: write normal data
         outb(0x03f5, (head << 2) | drive); // HD DR1 DR2
         outb(0x03f5, track);
         outb(0x03f5, head);
         outb(0x03f5, sector);
         outb(0x03f5, 2); // 512 byte sector size
-        outb(0x03f5, 0); // last sector number possible on track
+        outb(0x03f5, sector + num_sectors - 1); // last sector to write on track
         outb(0x03f5, 0); // Gap length
         outb(0x03f5, 0xff); // Gap length
 
-       // turn on interrupts
+        // turn on interrupts
   ASM_START
         sti
   ASM_END
 
         // wait on 40:3e bit 7 to become 1
-        val8 = (read_byte(0x0000, 0x043e) & 0x80);
-        while ( val8 == 0 ) {
-          val8 = (read_byte(0x0000, 0x043e) & 0x80);
+        do {
+          val8 = read_byte(0x0040, 0x0040);
+          if (val8 == 0) {
+            floppy_reset_controller();
+            SET_AH(0x80); // drive not ready (timeout)
+            set_diskette_ret_status(0x80);
+            SET_AL(0); // no sectors written
+            SET_CF(); // error occurred
+            return;
           }
+          val8 = (read_byte(0x0040, 0x003e) & 0x80);
+        } while ( val8 == 0 );
 
-       val8 = 0; // separate asm from while() loop
-       // turn off interrupts
+        val8 = 0; // separate asm from while() loop
+        // turn off interrupts
   ASM_START
         cli
   ASM_END
 
         // set 40:3e bit 7 to 0
-        val8 = read_byte(0x0000, 0x043e);
+        val8 = read_byte(0x0040, 0x003e);
         val8 &= 0x7f;
-        write_byte(0x0000, 0x043e, val8);
+        write_byte(0x0040, 0x003e, val8);
 
         // check port 3f4 for accessibility to status bytes
         val8 = inb(0x3f4);
@@ -7432,8 +7737,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n");
         SET_AH(0x00); // success
         CLEAR_CF();   // success
         return;
-        }
-      else {  // if (ah == 0x04)
+      } else {  // if (ah == 0x04)
         // Verify Diskette Sectors
 
         // ??? should track be new val from return_status[3] ?
@@ -7442,8 +7746,8 @@ BX_INFO("floppy: drive>1 || head>1 ...\n");
         CLEAR_CF();   // success
         SET_AH(0x00); // success
         return;
-        }
-
+      }
+      break;
 
     case 0x05: // format diskette track
 BX_DEBUG_INT13_FL("floppy f05\n");
@@ -7458,7 +7762,7 @@ BX_DEBUG_INT13_FL("floppy f05\n");
         SET_AH(1);
         set_diskette_ret_status(1);
         SET_CF(); // error occurred
-        }
+      }
 
       // see if drive exists
       if (floppy_drive_exists(drive) == 0) {
@@ -7466,7 +7770,7 @@ BX_DEBUG_INT13_FL("floppy f05\n");
         set_diskette_ret_status(0x80);
         SET_CF(); // error occurred
         return;
-        }
+      }
 
       // see if media in drive, and type is known
       if (floppy_media_known(drive) == 0) {
@@ -7476,8 +7780,8 @@ BX_DEBUG_INT13_FL("floppy f05\n");
           SET_AL(0); // no sectors read
           SET_CF(); // error occurred
           return;
-          }
         }
+      }
 
       // set up DMA controller for transfer
       page = (ES >> 12);   // upper 4 bits
@@ -7487,7 +7791,7 @@ BX_DEBUG_INT13_FL("floppy f05\n");
       if ( base_address < base_es ) {
         // in case of carry, adjust page by 1
         page++;
-        }
+      }
       base_count = (num_sectors * 4) - 1;
 
       // check for 64K boundary overrun
@@ -7498,7 +7802,7 @@ BX_DEBUG_INT13_FL("floppy f05\n");
         SET_AL(0); // no sectors read
         SET_CF(); // error occurred
         return;
-        }
+      }
 
       outb(0x000a, 0x06);
       outb(0x000c, 0x00); // clear flip-flop
@@ -7515,27 +7819,9 @@ BX_DEBUG_INT13_FL("floppy f05\n");
       outb(0x000a, 0x02);
 
       // set up floppy controller for transfer
-      val8 = read_byte(0x0000, 0x043e);
-      val8 &= 0x7f;
-      write_byte(0x0000, 0x043e, val8);
-      // turn on motor of selected drive, DMA & int enabled, normal operation
-      if (drive)
-        dor = 0x20;
-      else
-        dor = 0x10;
-      dor |= 0x0c;
-      dor |= drive;
-      outb(0x03f2, dor);
+      floppy_prepare_controller(drive);
 
-      // reset the disk motor timeout value of INT 08
-      write_byte(0x40,0x40, BX_FLOPPY_ON_CNT);
-
-      // check port 3f4 for drive readiness
-      val8 = inb(0x3f4);
-      if ( (val8 & 0xf0) != 0x80 )
-        BX_PANIC("int13_diskette:f05: ctrl not ready\n");
-
-      // send read-normal-data command (6 bytes) to controller
+      // send format-track command (6 bytes) to controller
       outb(0x03f5, 0x4d); // 4d: format track
       outb(0x03f5, (head << 2) | drive); // HD DR1 DR2
       outb(0x03f5, 2); // 512 byte sector size
@@ -7546,20 +7832,29 @@ BX_DEBUG_INT13_FL("floppy f05\n");
   ASM_START
       sti
   ASM_END
+
       // wait on 40:3e bit 7 to become 1
-      val8 = (read_byte(0x0000, 0x043e) & 0x80);
-      while ( val8 == 0 ) {
-        val8 = (read_byte(0x0000, 0x043e) & 0x80);
+      do {
+        val8 = read_byte(0x0040, 0x0040);
+        if (val8 == 0) {
+          floppy_reset_controller();
+          SET_AH(0x80); // drive not ready (timeout)
+          set_diskette_ret_status(0x80);
+          SET_CF(); // error occurred
+          return;
         }
-     val8 = 0; // separate asm from while() loop
-     // turn off interrupts
+        val8 = (read_byte(0x0040, 0x003e) & 0x80);
+      } while ( val8 == 0 );
+
+      val8 = 0; // separate asm from while() loop
+      // turn off interrupts
   ASM_START
       cli
   ASM_END
       // set 40:3e bit 7 to 0
-      val8 = read_byte(0x0000, 0x043e);
+      val8 = read_byte(0x0040, 0x003e);
       val8 &= 0x7f;
-      write_byte(0x0000, 0x043e, val8);
+      write_byte(0x0040, 0x003e, val8);
       // check port 3f4 for accessibility to status bytes
       val8 = inb(0x3f4);
       if ( (val8 & 0xc0) != 0xc0 )
@@ -7911,8 +8206,9 @@ Bit16u seq_nr;
   Bit16u bootseg;
   Bit16u bootip;
   Bit16u status;
+  Bit16u bootfirst;
 
-  struct ipl_entry e;
+  ipl_entry_t e;
 
   // if BX_ELTORITO_BOOT is not defined, old behavior
   //   check bit 5 in CMOS reg 0x2d.  load either 0x00 or 0x80 into DL
@@ -7926,7 +8222,7 @@ Bit16u seq_nr;
   //     CMOS reg 0x38 & 0xf0 : 3rd boot device
   //   boot device codes:
   //     0x00 : not defined
-  //     0x01 : first floppy 
+  //     0x01 : first floppy
   //     0x02 : first harddrive
   //     0x03 : first cdrom
   //     0x04 - 0x0f : PnP expansion ROMs (e.g. Etherboot)
@@ -7938,16 +8234,25 @@ Bit16u seq_nr;
   bootdev |= ((inb_cmos(0x38) & 0xf0) << 4);
   bootdev >>= 4 * seq_nr;
   bootdev &= 0xf;
-  if (bootdev == 0) BX_PANIC("No bootable device.\n");
-  
+
+  /* Read user selected device */
+  bootfirst = read_word(ebda_seg, IPL_BOOTFIRST_OFFSET);
+  if (bootfirst != 0xFFFF) {
+    bootdev = bootfirst;
+    /* User selected device not set */
+    write_word(ebda_seg, IPL_BOOTFIRST_OFFSET, 0xFFFF);
+    /* Reset boot sequence */
+    write_word(ebda_seg, IPL_SEQUENCE_OFFSET, 0xFFFF);
+  } else if (bootdev == 0) BX_PANIC("No bootable device.\n");
+
   /* Translate from CMOS runes to an IPL table offset by subtracting 1 */
   bootdev -= 1;
-#else  
+#else
   if (seq_nr ==2) BX_PANIC("No more boot devices.");
-  if (!!(inb_cmos(0x2d) & 0x20) ^ (seq_nr == 1)) 
+  if (!!(inb_cmos(0x2d) & 0x20) ^ (seq_nr == 1))
       /* Boot from floppy if the bit is set or it's the second boot */
     bootdev = 0x00;
-  else 
+  else
     bootdev = 0x01;
 #endif
 
@@ -7959,13 +8264,13 @@ Bit16u seq_nr;
 
   /* Do the loading, and set up vector as a far pointer to the boot
    * address, and bootdrv as the boot drive */
-  print_boot_device(e.type);
+  print_boot_device(&e);
 
   switch(e.type) {
-  case 0x01: /* FDD */
-  case 0x02: /* HDD */
+  case IPL_TYPE_FLOPPY: /* FDD */
+  case IPL_TYPE_HARDDISK: /* HDD */
 
-    bootdrv = (e.type == 0x02) ? 0x80 : 0x00;
+    bootdrv = (e.type == IPL_TYPE_HARDDISK) ? 0x80 : 0x00;
     bootseg = 0x07c0;
     status = 0;
 
@@ -7980,7 +8285,7 @@ ASM_START
     mov  dl, _int18_function.bootdrv + 2[bp]
     mov  ax, _int18_function.bootseg + 2[bp]
     mov  es, ax         ;; segment
-    mov  bx, #0x0000    ;; offset
+    xor  bx, bx         ;; offset
     mov  ah, #0x02      ;; function 2, read diskette sector
     mov  al, #0x01      ;; read 1 sector
     mov  ch, #0x00      ;; track 0
@@ -7998,7 +8303,7 @@ int19_load_done:
     pop  ax
     pop  bp
 ASM_END
-    
+
     if (status != 0) {
       print_boot_failure(e.type, 1);
       return;
@@ -8006,7 +8311,7 @@ ASM_END
 
     /* Always check the signature on a HDD boot sector; on FDD, only do
      * the check if the CMOS doesn't tell us to skip it */
-    if (e.type != 0x00 || !((inb_cmos(0x38) & 0x01))) {
+    if ((e.type != IPL_TYPE_FLOPPY) || !((inb_cmos(0x38) & 0x01))) {
       if (read_word(bootseg,0x1fe) != 0xaa55) {
         print_boot_failure(e.type, 0);
         return;
@@ -8024,7 +8329,7 @@ ASM_END
   break;
 
 #if BX_ELTORITO_BOOT
-  case 0x03: /* CD-ROM */
+  case IPL_TYPE_CDROM: /* CD-ROM */
     status = cdrom_boot();
 
     // If failure
@@ -8043,7 +8348,7 @@ ASM_END
     break;
 #endif
 
-  case 0x80: /* Expansion ROM with a Bootstrap Entry Vector (a far pointer) */
+  case IPL_TYPE_BEV: /* Expansion ROM with a Bootstrap Entry Vector (a far pointer) */
     bootseg = e.vector >> 16;
     bootip = e.vector & 0xffff;
     break;
@@ -8051,16 +8356,20 @@ ASM_END
   default: return;
   }
 
-  
+  /* Debugging info */
+  BX_INFO("Booting from %x:%x\n", bootseg, bootip);
+
   /* Jump to the boot vector */
 ASM_START
     mov  bp, sp
+//    push cs
+//    push #int18_handler
     ;; Build an iret stack frame that will take us to the boot vector.
     ;; iret pops ip, then cs, then flags, so push them in the opposite order.
     pushf
-    mov  ax, _int18_function.bootseg + 0[bp] 
+    mov  ax, _int18_function.bootseg + 0[bp]
     push ax
-    mov  ax, _int18_function.bootip + 0[bp] 
+    mov  ax, _int18_function.bootip + 0[bp]
     push ax
     ;; Set the magic number in ax and the boot drive in dl.
     mov  ax, #0xaa55
@@ -8263,7 +8572,11 @@ int1a_function(regs, ds, iret_addr)
       } else if (regs.u.r8.bl == 0x83) {
         BX_INFO("bad PCI vendor ID %04x\n", regs.u.r16.dx);
       } else if (regs.u.r8.bl == 0x86) {
-        BX_INFO("PCI device %04x:%04x not found\n", regs.u.r16.dx, regs.u.r16.cx);
+        if (regs.u.r8.al == 0x02) {
+          BX_INFO("PCI device %04x:%04x not found at index %d\n", regs.u.r16.dx, regs.u.r16.cx, regs.u.r16.si);
+        } else {
+          BX_INFO("no PCI device with class code 0x%02x%04x found at index %d\n", regs.u.r8.cl, regs.u.r16.dx, regs.u.r16.si);
+        }
       }
       regs.u.r8.ah = regs.u.r8.bl;
       SetCF(iret_addr.flags);
@@ -8309,11 +8622,11 @@ ASM_END
           // Done waiting.
           Bit16u segment, offset;
 
-          offset = read_word( 0x40, 0x98 );
-          segment = read_word( 0x40, 0x9A );
+          segment = read_word( 0x40, 0x98 );
+          offset = read_word( 0x40, 0x9A );
           write_byte( 0x40, 0xA0, 0 );  // Turn of status byte.
           outb_cmos( 0xB, registerB & 0x37 ); // Clear the Periodic Interrupt.
-          write_byte( segment, offset, 0x80 );  // Write to specified flag byte.
+          write_byte(segment, offset, read_byte(segment, offset) | 0x80 );  // Write to specified flag byte.
         } else {
           // Continue waiting.
           time -= 0x3D1;
@@ -8521,13 +8834,18 @@ int13_notcdrom:
 #endif
 
 int13_disk:
+  ;; int13_harddisk modifies high word of EAX
+  shr   eax, #16
+  push  ax
   call  _int13_harddisk
+  pop   ax
+  shl   eax, #16
 
 int13_out:
   pop ds
   pop es
   popa
-  iret 
+  iret
 
 ;----------
 ;- INT18h -
@@ -8540,18 +8858,19 @@ int18_handler: ;; Boot Failure recovery: try the next device.
   xor  ax, ax
   mov  ss, ax
 
-  ;; Get the boot sequence number out of the IPL memory
   ;; The first time we do this it will have been set to -1 so 
   ;; we will start from device 0.
-  mov  bx, #IPL_SEG 
+  mov  ds, ax
+  mov  bx, word ptr [0x40E]       ;; EBDA segment
   mov  ds, bx                     ;; Set segment
   mov  bx, IPL_SEQUENCE_OFFSET    ;; BX is now the sequence number
   inc  bx                         ;; ++
   mov  IPL_SEQUENCE_OFFSET, bx    ;; Write it back
-  mov  ds, ax                     ;; and reset the segment to zero. 
+  mov  ds, ax                     ;; and reset the segment to zero.
 
   ;; Call the C code for the next boot device
   push bx
+
   call _int18_function
 
   ;; Boot failed: invoke the boot recovery function...
@@ -8561,6 +8880,7 @@ int18_handler: ;; Boot Failure recovery: try the next device.
 ;- INT19h -
 ;----------
 int19_relocated: ;; Boot function, relocated
+
   ;;
   ;; *** Warning: INT 19h resets the whole machine *** 
   ;;
@@ -8572,10 +8892,12 @@ int19_relocated: ;; Boot function, relocated
   ;; boot sequence will start, which is more or less the required behaviour.
   ;; 
   ;; Reset SP and SS
+
   mov  ax, #0xfffe
   mov  sp, ax
   xor  ax, ax
   mov  ss, ax
+
   call _machine_reset
 
 ;----------
@@ -8589,7 +8911,7 @@ int1c_handler: ;; User Timer Tick
 ;- POST: Floppy Drive -
 ;----------------------
 floppy_drive_post:
-  mov  ax, #0x0000
+  xor  ax, ax
   mov  ds, ax
 
   mov  al, #0x00
@@ -8671,7 +8993,7 @@ hard_drive_post:
   mov  dx, #0x03f6
   out  dx, al
 
-  mov  ax, #0x0000
+  xor  ax, ax
   mov  ds, ax
   mov  0x0474, al /* hard disk status of last operation */
   mov  0x0477, al /* hard disk port offset (XT only ???) */
@@ -8686,8 +9008,8 @@ hard_drive_post:
   SET_INT_VECTOR(0x76, #0xF000, #int76_handler)
   ;; INT 41h: hard disk 0 configuration pointer
   ;; INT 46h: hard disk 1 configuration pointer
-  SET_INT_VECTOR(0x41, #EBDA_SEG, #0x003D)
-  SET_INT_VECTOR(0x46, #EBDA_SEG, #0x004D)
+  SET_INT_VECTOR(0x41, word ptr [0x40E], #0x003D) /* EBDA:003D */
+  SET_INT_VECTOR(0x46, word ptr [0x40E], #0x004D) /* EBDA:004D */
 
   ;; move disk geometry data from CMOS to EBDA disk parameter table(s)
   mov  al, #0x12
@@ -8716,7 +9038,9 @@ post_d0_type47:
   ;; 22    landing zone high        D
   ;; 23    sectors/track            E
 
-  mov  ax, #EBDA_SEG
+  xor  ax, ax
+  mov  ds, ax
+  mov  ax, word ptr [0x40E] ;; EBDA segment
   mov  ds, ax
 
   ;;; Filling EBDA table for hard disk 0.
@@ -8862,7 +9186,9 @@ post_d1_type47:
   ;; 0x2b    landing zone high        D
   ;; 0x2c    sectors/track            E
 ;;; Fill EBDA table for hard disk 1.
-  mov  ax, #EBDA_SEG
+  xor  ax, ax
+  mov  ds, ax
+  mov  ax, word ptr [0x40E] ;; EBDA segment
   mov  ds, ax
   mov  al, #0x28
   out  #0x70, al
@@ -8993,13 +9319,42 @@ ebda_post:
 ;--------------------
 ; relocated here because the primary POST area isnt big enough.
 eoi_jmp_post:
-  call eoi_both_pics
+  mov   al, #0x20
+  out   #0xA0, al ;; slave  PIC EOI
+  mov   al, #0x20
+  out   #0x20, al ;; master PIC EOI
 
+jmp_post_0x467:
   xor ax, ax
   mov ds, ax
 
   jmp far ptr [0x467]
 
+iret_post_0x467:
+  xor ax, ax
+  mov ds, ax
+
+  mov sp, [0x467]
+  mov ss, [0x469]
+  iret
+
+retf_post_0x467:
+  xor ax, ax
+  mov ds, ax
+
+  mov sp, [0x467]
+  mov ss, [0x469]
+  retf
+
+s3_post:
+#if BX_ROMBIOS32
+  call rombios32_init
+#endif
+  call _s3_resume
+  mov bl, #0x00
+  and ax, ax
+  jz normal_post
+  call _s3_resume_panic
 
 ;--------------------
 eoi_both_pics:
@@ -9133,8 +9488,9 @@ use16 386
 
 #endif
 
-ASM_END
 #include "32bitgateway.c"
+ASM_END
+#include "tcgbios.c"
 ASM_START
 
 ;--------------------
@@ -9152,16 +9508,22 @@ bios32_structure:
 
 .align 16
 bios32_entry_point:
-  pushf
-  cmp eax, #0x49435024
+  pushfd
+  cmp eax, #0x49435024 ;; "$PCI"
   jne unknown_service
   mov eax, #0x80000000
   mov dx, #0x0cf8
   out dx, eax
   mov dx, #0x0cfc
   in  eax, dx
-  cmp eax, #0x12378086
+#ifdef PCI_FIXED_HOST_BRIDGE
+  cmp eax, #PCI_FIXED_HOST_BRIDGE
   jne unknown_service
+#else
+  ;; say ok if a device is present
+  cmp eax, #0xffffffff
+  je unknown_service
+#endif
   mov ebx, #0x000f0000
   mov ecx, #0
   mov edx, #pcibios_protected
@@ -9170,12 +9532,15 @@ bios32_entry_point:
 unknown_service:
   mov al, #0x80
 bios32_end:
-  popf
+#ifdef BX_QEMU
+  and dword ptr[esp+8],0xfffffffc ;; reset CS.RPL for kqemu
+#endif
+  popfd
   retf
 
 .align 16
 pcibios_protected:
-  pushf
+  pushfd
   cli
   push esi
   push edi
@@ -9183,15 +9548,15 @@ pcibios_protected:
   jne pci_pro_f02
   mov bx, #0x0210
   mov cx, #0
-  mov edx, #0x20494350
+  mov edx, #0x20494350 ;; "PCI "
   mov al, #0x01
   jmp pci_pro_ok
 pci_pro_f02: ;; find pci device
   cmp al, #0x02
-  jne pci_pro_f08
+  jne pci_pro_f03
   shl ecx, #16
   mov cx, dx
-  mov bx, #0x0000
+  xor bx, bx
   mov di, #0x00
 pci_pro_devloop:
   call pci_pro_select_reg
@@ -9208,6 +9573,27 @@ pci_pro_nextdev:
   jne pci_pro_devloop
   mov ah, #0x86
   jmp pci_pro_fail
+pci_pro_f03: ;; find class code
+  cmp al, #0x03
+  jne pci_pro_f08
+  xor bx, bx
+  mov di, #0x08
+pci_pro_devloop2:
+  call pci_pro_select_reg
+  mov dx, #0x0cfc
+  in  eax, dx
+  shr eax, #8
+  cmp eax, ecx
+  jne pci_pro_nextdev2
+  cmp si, #0
+  je  pci_pro_ok
+  dec si
+pci_pro_nextdev2:
+  inc bx
+  cmp bx, #0x0100
+  jne pci_pro_devloop2
+  mov ah, #0x86
+  jmp pci_pro_fail
 pci_pro_f08: ;; read configuration byte
   cmp al, #0x08
   jne pci_pro_f09
@@ -9281,16 +9667,20 @@ pci_pro_unknown:
 pci_pro_fail:
   pop edi
   pop esi
-  sti
-  popf
+#ifdef BX_QEMU
+  and dword ptr[esp+8],0xfffffffc ;; reset CS.RPL for kqemu
+#endif
+  popfd
   stc
   retf
 pci_pro_ok:
   xor ah, ah
   pop edi
   pop esi
-  sti
-  popf
+#ifdef BX_QEMU
+  and dword ptr[esp+8],0xfffffffc ;; reset CS.RPL for kqemu
+#endif
+  popfd
   clc
   retf
 
@@ -9317,8 +9707,14 @@ pcibios_real:
   out dx, eax
   mov dx, #0x0cfc
   in  eax, dx
-  cmp eax, #0x12378086
+#ifdef PCI_FIXED_HOST_BRIDGE
+  cmp eax, #PCI_FIXED_HOST_BRIDGE
   je  pci_present
+#else
+  ;; say ok if a device is present
+  cmp eax, #0xffffffff
+  jne  pci_present
+#endif
   pop dx
   pop eax
   mov ah, #0xff
@@ -9332,7 +9728,7 @@ pci_present:
   mov ax, #0x0001
   mov bx, #0x0210
   mov cx, #0
-  mov edx, #0x20494350
+  mov edx, #0x20494350 ;; "PCI "
   mov edi, #0xf0000
   mov di, #pcibios_protected
   clc
@@ -9341,10 +9737,10 @@ pci_real_f02: ;; find pci device
   push esi
   push edi
   cmp al, #0x02
-  jne pci_real_f08
+  jne pci_real_f03
   shl ecx, #16
   mov cx, dx
-  mov bx, #0x0000
+  xor bx, bx
   mov di, #0x00
 pci_real_devloop:
   call pci_real_select_reg
@@ -9361,7 +9757,30 @@ pci_real_nextdev:
   jne pci_real_devloop
   mov dx, cx
   shr ecx, #16
-  mov ah, #0x86
+  mov ax, #0x8602
+  jmp pci_real_fail
+pci_real_f03: ;; find class code
+  cmp al, #0x03
+  jne pci_real_f08
+  xor bx, bx
+  mov di, #0x08
+pci_real_devloop2:
+  call pci_real_select_reg
+  mov dx, #0x0cfc
+  in  eax, dx
+  shr eax, #8
+  cmp eax, ecx
+  jne pci_real_nextdev2
+  cmp si, #0
+  je  pci_real_ok
+  dec si
+pci_real_nextdev2:
+  inc bx
+  cmp bx, #0x0100
+  jne pci_real_devloop2
+  mov dx, cx
+  shr ecx, #16
+  mov ax, #0x8603
   jmp pci_real_fail
 pci_real_f08: ;; read configuration byte
   cmp al, #0x08
@@ -9423,7 +9842,7 @@ pci_real_f0c: ;; write configuration word
   jmp pci_real_ok
 pci_real_f0d: ;; write configuration dword
   cmp al, #0x0d
-  jne pci_real_unknown
+  jne pci_real_f0e
   call pci_real_select_reg
   push dx
   mov dx, #0x0cfc
@@ -9431,6 +9850,46 @@ pci_real_f0d: ;; write configuration dword
   out dx, eax
   pop dx
   jmp pci_real_ok
+pci_real_f0e: ;; get irq routing options
+  cmp al, #0x0e
+  jne pci_real_unknown
+  SEG ES
+  cmp word ptr [di], #pci_routing_table_structure_end - pci_routing_table_structure_start
+  jb pci_real_too_small
+  SEG ES
+  mov word ptr [di], #pci_routing_table_structure_end - pci_routing_table_structure_start
+  pushf
+  push ds
+  push es
+  push cx
+  push si
+  push di
+  cld
+  mov si, #pci_routing_table_structure_start
+  push cs
+  pop ds
+  SEG ES
+  mov cx, [di+2]
+  SEG ES
+  mov es, [di+4]
+  mov di, cx
+  mov cx, #pci_routing_table_structure_end - pci_routing_table_structure_start
+  rep
+      movsb
+  pop di
+  pop si
+  pop cx
+  pop es
+  pop ds
+  popf
+  mov bx, #(1 << 9) | (1 << 11)   ;; irq 9 and 11 are used
+  jmp pci_real_ok
+pci_real_too_small:
+  SEG ES
+  mov word ptr [di], #pci_routing_table_structure_end - pci_routing_table_structure_start
+  mov ah, #0x89
+  jmp pci_real_fail
+
 pci_real_unknown:
   mov ah, #0x81
 pci_real_fail:
@@ -9457,7 +9916,7 @@ pci_real_select_reg:
   out dx,  eax
   pop dx
   ret
-  
+
 .align 16
 pci_routing_table_structure:
   db 0x24, 0x50, 0x49, 0x52  ;; "$PIR" signature
@@ -9465,21 +9924,22 @@ pci_routing_table_structure:
   dw 32 + (6 * 16) ;; table size
   db 0 ;; PCI interrupt router bus
   db 0x08 ;; PCI interrupt router DevFunc
-  dw 0x0000 ;; PCI exclusive IRQs 
+  dw 0x0000 ;; PCI exclusive IRQs
   dw 0x8086 ;; compatible PCI interrupt router vendor ID
-  dw 0x7000 ;; compatible PCI interrupt router device ID
+  dw 0x122e ;; compatible PCI interrupt router device ID
   dw 0,0 ;; Miniport data
   db 0,0,0,0,0,0,0,0,0,0,0 ;; reserved
-  db 0x07 ;; checksum
+  db 0x37 ;; checksum
+pci_routing_table_structure_start:
   ;; first slot entry PCI-to-ISA (embedded)
   db 0 ;; pci bus number
   db 0x08 ;; pci device number (bit 7-3)
   db 0x61 ;; link value INTA#: pointer into PCI2ISA config space
-  dw 0x0c20 ;; IRQ bitmap INTA# 
+  dw 0x0c20 ;; IRQ bitmap INTA#
   db 0x62 ;; link value INTB#
-  dw 0x0c20 ;; IRQ bitmap INTB# 
+  dw 0x0c20 ;; IRQ bitmap INTB#
   db 0x63 ;; link value INTC#
-  dw 0x0c20 ;; IRQ bitmap INTC# 
+  dw 0x0c20 ;; IRQ bitmap INTC#
   db 0x60 ;; link value INTD#
   dw 0x0c20 ;; IRQ bitmap INTD#
   db 0 ;; physical slot (0 = embedded)
@@ -9488,11 +9948,11 @@ pci_routing_table_structure:
   db 0 ;; pci bus number
   db 0x10 ;; pci device number (bit 7-3)
   db 0x62 ;; link value INTA#
-  dw 0x0c20 ;; IRQ bitmap INTA# 
+  dw 0x0c20 ;; IRQ bitmap INTA#
   db 0x63 ;; link value INTB#
-  dw 0x0c20 ;; IRQ bitmap INTB# 
+  dw 0x0c20 ;; IRQ bitmap INTB#
   db 0x60 ;; link value INTC#
-  dw 0x0c20 ;; IRQ bitmap INTC# 
+  dw 0x0c20 ;; IRQ bitmap INTC#
   db 0x61 ;; link value INTD#
   dw 0x0c20 ;; IRQ bitmap INTD#
   db 1 ;; physical slot (0 = embedded)
@@ -9501,11 +9961,11 @@ pci_routing_table_structure:
   db 0 ;; pci bus number
   db 0x18 ;; pci device number (bit 7-3)
   db 0x63 ;; link value INTA#
-  dw 0x0c20 ;; IRQ bitmap INTA# 
+  dw 0x0c20 ;; IRQ bitmap INTA#
   db 0x60 ;; link value INTB#
-  dw 0x0c20 ;; IRQ bitmap INTB# 
+  dw 0x0c20 ;; IRQ bitmap INTB#
   db 0x61 ;; link value INTC#
-  dw 0x0c20 ;; IRQ bitmap INTC# 
+  dw 0x0c20 ;; IRQ bitmap INTC#
   db 0x62 ;; link value INTD#
   dw 0x0c20 ;; IRQ bitmap INTD#
   db 2 ;; physical slot (0 = embedded)
@@ -9514,11 +9974,11 @@ pci_routing_table_structure:
   db 0 ;; pci bus number
   db 0x20 ;; pci device number (bit 7-3)
   db 0x60 ;; link value INTA#
-  dw 0x0c20 ;; IRQ bitmap INTA# 
+  dw 0x0c20 ;; IRQ bitmap INTA#
   db 0x61 ;; link value INTB#
-  dw 0x0c20 ;; IRQ bitmap INTB# 
+  dw 0x0c20 ;; IRQ bitmap INTB#
   db 0x62 ;; link value INTC#
-  dw 0x0c20 ;; IRQ bitmap INTC# 
+  dw 0x0c20 ;; IRQ bitmap INTC#
   db 0x63 ;; link value INTD#
   dw 0x0c20 ;; IRQ bitmap INTD#
   db 3 ;; physical slot (0 = embedded)
@@ -9527,11 +9987,11 @@ pci_routing_table_structure:
   db 0 ;; pci bus number
   db 0x28 ;; pci device number (bit 7-3)
   db 0x61 ;; link value INTA#
-  dw 0x0c20 ;; IRQ bitmap INTA# 
+  dw 0x0c20 ;; IRQ bitmap INTA#
   db 0x62 ;; link value INTB#
-  dw 0x0c20 ;; IRQ bitmap INTB# 
+  dw 0x0c20 ;; IRQ bitmap INTB#
   db 0x63 ;; link value INTC#
-  dw 0x0c20 ;; IRQ bitmap INTC# 
+  dw 0x0c20 ;; IRQ bitmap INTC#
   db 0x60 ;; link value INTD#
   dw 0x0c20 ;; IRQ bitmap INTD#
   db 4 ;; physical slot (0 = embedded)
@@ -9540,17 +10000,394 @@ pci_routing_table_structure:
   db 0 ;; pci bus number
   db 0x30 ;; pci device number (bit 7-3)
   db 0x62 ;; link value INTA#
-  dw 0x0c20 ;; IRQ bitmap INTA# 
+  dw 0x0c20 ;; IRQ bitmap INTA#
   db 0x63 ;; link value INTB#
-  dw 0x0c20 ;; IRQ bitmap INTB# 
+  dw 0x0c20 ;; IRQ bitmap INTB#
   db 0x60 ;; link value INTC#
-  dw 0x0c20 ;; IRQ bitmap INTC# 
+  dw 0x0c20 ;; IRQ bitmap INTC#
   db 0x61 ;; link value INTD#
   dw 0x0c20 ;; IRQ bitmap INTD#
   db 5 ;; physical slot (0 = embedded)
   db 0 ;; reserved
+pci_routing_table_structure_end:
+
+#if !BX_ROMBIOS32 && !defined(HVMASSIST)
+pci_irq_list:
+  db 11, 10, 9, 5;
+
+pcibios_init_sel_reg:
+  push eax
+  mov eax, #0x800000
+  mov ax,  bx
+  shl eax, #8
+  and dl,  #0xfc
+  or  al,  dl
+  mov dx,  #0x0cf8
+  out dx,  eax
+  pop eax
+  ret
+
+pcibios_init_iomem_bases:
+  push bp
+  mov  bp, sp
+  mov  eax, #0xe0000000 ;; base for memory init
+  push eax
+  mov  ax, #0xc000 ;; base for i/o init
+  push ax
+  mov  ax, #0x0010 ;; start at base address #0
+  push ax
+  mov  bx, #0x0008
+pci_init_io_loop1:
+  mov  dl, #0x00
+  call pcibios_init_sel_reg
+  mov  dx, #0x0cfc
+  in   ax, dx
+  cmp  ax, #0xffff
+  jz   next_pci_dev
+  mov  dl, #0x04 ;; disable i/o and memory space access
+  call pcibios_init_sel_reg
+  mov  dx, #0x0cfc
+  in   al, dx
+  and  al, #0xfc
+  out  dx, al
+pci_init_io_loop2:
+  mov  dl, [bp-8]
+  call pcibios_init_sel_reg
+  mov  dx, #0x0cfc
+  in   eax, dx
+  test al, #0x01
+  jnz  init_io_base
+  mov  ecx, eax
+  mov  eax, #0xffffffff
+  out  dx, eax
+  in   eax, dx
+  cmp  eax, ecx
+  je   next_pci_base
+  xor  eax, #0xffffffff
+  mov  ecx, eax
+  mov  eax, [bp-4]
+  out  dx, eax
+  add  eax, ecx ;; calculate next free mem base
+  add  eax, #0x01000000
+  and  eax, #0xff000000
+  mov  [bp-4], eax
+  jmp  next_pci_base
+init_io_base:
+  mov  cx, ax
+  mov  ax, #0xffff
+  out  dx, ax
+  in   ax, dx
+  cmp  ax, cx
+  je   next_pci_base
+  xor  ax, #0xfffe
+  mov  cx, ax
+  mov  ax, [bp-6]
+  out  dx, ax
+  add  ax, cx ;; calculate next free i/o base
+  add  ax, #0x0100
+  and  ax, #0xff00
+  mov  [bp-6], ax
+next_pci_base:
+  mov  al, [bp-8]
+  add  al, #0x04
+  cmp  al, #0x28
+  je   enable_iomem_space
+  mov  byte ptr[bp-8], al
+  jmp  pci_init_io_loop2
+enable_iomem_space:
+  mov  dl, #0x04 ;; enable i/o and memory space access if available
+  call pcibios_init_sel_reg
+  mov  dx, #0x0cfc
+  in   al, dx
+  or   al, #0x07
+  out  dx, al
+next_pci_dev:
+  mov  byte ptr[bp-8], #0x10
+  inc  bx
+  cmp  bx, #0x0100
+  jne  pci_init_io_loop1
+  mov  sp, bp
+  pop  bp
+  ret
+
+pcibios_init_set_elcr:
+  push ax
+  push cx
+  mov  dx, #0x04d0
+  test al, #0x08
+  jz   is_master_pic
+  inc  dx
+  and  al, #0x07
+is_master_pic:
+  mov  cl, al
+  mov  bl, #0x01
+  shl  bl, cl
+  in   al, dx
+  or   al, bl
+  out  dx, al
+  pop  cx
+  pop  ax
+  ret
+
+pcibios_init_irqs:
+  push ds
+  push bp
+  mov  ax, #0xf000
+  mov  ds, ax
+  mov  dx, #0x04d0 ;; reset ELCR1 + ELCR2
+  mov  al, #0x00
+  out  dx, al
+  inc  dx
+  out  dx, al
+  mov  si, #pci_routing_table_structure
+  mov  bh, [si+8]
+  mov  bl, [si+9]
+  mov  dl, #0x00
+  call pcibios_init_sel_reg
+  mov  dx, #0x0cfc
+  in   eax, dx
+  cmp  eax, [si+12] ;; check irq router
+  jne  pci_init_end
+  mov  dl, [si+34]
+  call pcibios_init_sel_reg
+  push bx ;; save irq router bus + devfunc
+  mov  dx, #0x0cfc
+  mov  ax, #0x8080
+  out  dx, ax ;; reset PIRQ route control
+  add  dx, #2
+  out  dx, ax
+  mov  ax, [si+6]
+  sub  ax, #0x20
+  shr  ax, #4
+  mov  cx, ax
+  add  si, #0x20 ;; set pointer to 1st entry
+  mov  bp, sp
+  mov  ax, #pci_irq_list
+  push ax
+  xor  ax, ax
+  push ax
+pci_init_irq_loop1:
+  mov  bh, [si]
+  mov  bl, [si+1]
+pci_init_irq_loop2:
+  mov  dl, #0x00
+  call pcibios_init_sel_reg
+  mov  dx, #0x0cfc
+  in   ax, dx
+  cmp  ax, #0xffff
+  jnz  pci_test_int_pin
+  test bl, #0x07
+  jz   next_pir_entry
+  jmp  next_pci_func
+pci_test_int_pin:
+  mov  dl, #0x3c
+  call pcibios_init_sel_reg
+  mov  dx, #0x0cfd
+  in   al, dx
+  and  al, #0x07
+  jz   next_pci_func
+  dec  al ;; determine pirq reg
+  mov  dl, #0x03
+  mul  al, dl
+  add  al, #0x02
+  xor  ah, ah
+  mov  bx, ax
+  mov  al, [si+bx]
+  mov  dl, al
+  mov  bx, [bp]
+  call pcibios_init_sel_reg
+  mov  dx, #0x0cfc
+  and  al, #0x03
+  add  dl, al
+  in   al, dx
+  cmp  al, #0x80
+  jb   pirq_found
+  mov  bx, [bp-2] ;; pci irq list pointer
+  mov  al, [bx]
+  out  dx, al
+  inc  bx
+  mov  [bp-2], bx
+  call pcibios_init_set_elcr
+pirq_found:
+  mov  bh, [si]
+  mov  bl, [si+1]
+  add  bl, [bp-3] ;; pci function number
+  mov  dl, #0x3c
+  call pcibios_init_sel_reg
+  mov  dx, #0x0cfc
+  out  dx, al
+next_pci_func:
+  inc  byte ptr[bp-3]
+  inc  bl
+  test bl, #0x07
+  jnz  pci_init_irq_loop2
+next_pir_entry:
+  add  si, #0x10
+  mov  byte ptr[bp-3], #0x00
+  loop pci_init_irq_loop1
+  mov  sp, bp
+  pop  bx
+pci_init_end:
+  pop  bp
+  pop  ds
+  ret
+#endif // !BX_ROMBIOS32
 #endif // BX_PCIBIOS
 
+#if BX_ROMBIOS32
+rombios32_init:
+  ;; save a20 and enable it
+  in al, 0x92
+  push ax
+  or al, #0x02
+  out 0x92, al
+
+  ;; save SS:SP to the BDA
+  xor ax, ax
+  mov ds, ax
+  mov 0x0469, ss
+  mov 0x0467, sp
+
+  SEG CS
+    lidt [pmode_IDT_info]
+  SEG CS
+    lgdt [rombios32_gdt_48]
+  ;; set PE bit in CR0
+  mov  eax, cr0
+  or   al, #0x01
+  mov  cr0, eax
+  ;; start protected mode code: ljmpl 0x10:rombios32_init1
+  db 0x66, 0xea
+  dw rombios32_05
+  dw 0x000f       ;; high 16 bit address
+  dw 0x0010
+
+use32 386
+rombios32_05:
+  ;; init data segments
+  mov eax, #0x18
+  mov ds, ax
+  mov es, ax
+  mov ss, ax
+  xor eax, eax
+  mov fs, ax
+  mov gs, ax
+  cld
+
+  ;; init the stack pointer to point below EBDA
+  mov ax, [0x040e]
+  shl eax, #4
+  mov esp, #-0x10
+  add esp, eax
+
+  ;; pass pointer to s3_resume_flag and s3_resume_vector to rombios32
+  push #0x04b0
+  push #0x04b2
+
+  ;; call rombios32 code
+  mov eax, #0x000e0000
+  call eax
+
+  ;; return to 16 bit protected mode first
+  db 0xea
+  dd rombios32_10
+  dw 0x20
+
+use16 386
+rombios32_10:
+  ;; restore data segment limits to 0xffff
+  mov ax, #0x28
+  mov ds, ax
+  mov es, ax
+  mov ss, ax
+  mov fs, ax
+  mov gs, ax
+
+  ;; reset PE bit in CR0
+  mov  eax, cr0
+  and  al, #0xFE
+  mov  cr0, eax
+
+  ;; far jump to flush CPU queue after transition to real mode
+  JMP_AP(0xf000, rombios32_real_mode)
+
+rombios32_real_mode:
+  ;; restore IDT to normal real-mode defaults
+  SEG CS
+    lidt [rmode_IDT_info]
+
+  xor ax, ax
+  mov ds, ax
+  mov es, ax
+  mov fs, ax
+  mov gs, ax
+
+  ;; restore SS:SP from the BDA
+  mov ss, 0x0469
+  xor esp, esp
+  mov sp, 0x0467
+  ;; restore a20
+  pop ax
+  out 0x92, al
+  ret
+
+rombios32_gdt_48:
+  dw 0x30
+  dw rombios32_gdt
+  dw 0x000f
+
+rombios32_gdt:
+  dw 0, 0, 0, 0
+  dw 0, 0, 0, 0
+  dw 0xffff, 0, 0x9b00, 0x00cf ; 32 bit flat code segment (0x10)
+  dw 0xffff, 0, 0x9300, 0x00cf ; 32 bit flat data segment (0x18)
+  dw 0xffff, 0, 0x9b0f, 0x0000 ; 16 bit code segment base=0xf0000 limit=0xffff
+  dw 0xffff, 0, 0x9300, 0x0000 ; 16 bit data segment base=0x0 limit=0xffff
+#endif // BX_ROMBIOS32
+
+#if BX_PMM
+; according to POST Memory Manager Specification Version 1.01
+.align 16
+pmm_structure:
+  db 0x24,0x50,0x4d,0x4d ;; "$PMM" signature
+  db 0x01 ;; revision
+  db 16 ;; length
+  db (-((pmm_entry_point>>8)+pmm_entry_point+0x20f))&0xff;; checksum
+  dw pmm_entry_point,0xf000 ;; far call entrypoint
+  db 0,0,0,0,0 ;; reserved
+
+pmm_entry_point:
+  pushf
+  pushad
+; Calculate protected-mode address of PMM function args
+  xor  eax, eax
+  mov  ax, sp
+  xor  ebx, ebx
+  mov  bx, ss
+  shl  ebx, 4
+  lea  ebx, [eax+ebx+38] ;; ebx=(ss<<4)+sp+4(far call)+2(pushf)+32(pushad)
+  push ebx
+;
+; Stack layout at this point:
+;
+;        : +0x0    +0x2    +0x4    +0x6    +0x8    +0xa    +0xc    +0xe
+; -----------------------------------------------------------------------
+; sp     : [&arg1         ][edi           ][esi           ][ebp           ]
+; sp+0x10: [esp           ][ebx           ][edx           ][ecx           ]
+; sp+0x20: [eax           ][flags ][ip    ][cs    ][arg1  ][arg2, ...
+;
+  call _pmm
+  mov  bx, sp
+SEG SS
+  mov  [bx+0x20], ax
+SEG SS
+  mov  [bx+0x18], dx
+  pop  ebx
+  popad
+  popf
+  retf
+#endif // BX_PMM
+
 ; parallel port detection: base address in DX, index in BX, timeout in CL
 detect_parport:
   push dx
@@ -9621,19 +10458,18 @@ checksum_loop:
   ret
 
 
-;; We need a copy of this string, but we are not actually a PnP BIOS, 
+;; We need a copy of this string, but we are not actually a PnP BIOS,
 ;; so make sure it is *not* aligned, so OSes will not see it if they scan.
 .align 16
   db 0
 pnp_string:
   .ascii "$PnP"
 
-
 rom_scan:
   ;; Scan for existence of valid expansion ROMS.
   ;;   Video ROM:   from 0xC0000..0xC7FFF in 2k increments
-  ;;   General ROM: from 0xC8000..0xDFFFF in 2k increments
-  ;;   System  ROM: only 0xE0000
+  ;;   General ROM: from 0xC8000..0xE9FFF in 2k increments
+  ;;   System  ROM: only 0xF0000
   ;;
   ;; Header:
   ;;   Offset    Value
@@ -9643,10 +10479,13 @@ rom_scan:
   ;;   3         ROM initialization entry point (FAR CALL)
 
 #if BX_TCGBIOS
+  push ax
   call _tcpa_start_option_rom_scan    /* specs: 3.2.3.3 + 10.4.3 */
+  pop ax
 #endif
-  mov  cx, #0xc000
+
 rom_scan_loop:
+  push ax       ;; Save AX
   mov  ds, cx
   mov  ax, #0x0004 ;; start with increment of 4 (512-byte) blocks = 2k
   cmp [0], #0xAA55 ;; look for signature
@@ -9673,23 +10512,38 @@ block_count_rounded:
   push ecx       ;; segment where option rom is located at
   call _tcpa_option_rom                   /* specs: 3.2.3.3 */
   add sp, #4    ;; pop segment
-  pop ecx      ;; original ecx
+  pop ecx      ;; original ecx
   pop ds
   pop ax
 #endif
-  xor  bx, bx   ;; Restore DS back to 0000:
-  mov  ds, bx
   push ax       ;; Save AX
   push di       ;; Save DI
   ;; Push addr of ROM entry point
   push cx       ;; Push seg
   push #0x0003  ;; Push offset
 
-  ;; Point ES:DI at "$PnP", which tells the ROM that we are a PnP BIOS.  
+  ;; Get the BDF into ax before invoking the option ROM
+  mov  bl, [2]
+  mov  al, bl
+  shr  al, #7
+  cmp  al, #1
+  jne  fetch_bdf
+  mov  ax, ds ;; Increment the DS since rom size larger than an segment
+  add  ax, #0x1000
+  mov  ds, ax
+fetch_bdf:
+  shl  bx, #9
+  xor  ax, ax
+  mov  al, [bx]
+
+  ;; Point ES:DI at "$PnP", which tells the ROM that we are a PnP BIOS.
   ;; That should stop it grabbing INT 19h; we will use its BEV instead.
-  mov  ax, #0xf000
-  mov  es, ax
-  lea  di, pnp_string 
+  mov  bx, #0xf000
+  mov  es, bx
+  lea  di, pnp_string
+
+  xor  bx, bx   ;; Restore DS back to 0000:
+  mov  ds, bx
 
   mov  bp, sp   ;; Call ROM init routine using seg:off on stack
   db   0xff     ;; call_far ss:[bp+0]
@@ -9699,8 +10553,8 @@ block_count_rounded:
   add  sp, #2   ;; Pop offset value
   pop  cx       ;; Pop seg value (restore CX)
 
-  ;; Look at the ROM's PnP Expansion header.  Properly, we're supposed 
-  ;; to init all the ROMs and then go back and build an IPL table of 
+  ;; Look at the ROM's PnP Expansion header.  Properly, we're supposed
+  ;; to init all the ROMs and then go back and build an IPL table of
   ;; all the bootable devices, but we can get away with one pass.
   mov  ds, cx       ;; ROM base
   mov  bx, 0x001a   ;; 0x1A is the offset into ROM header that contains...
@@ -9708,22 +10562,54 @@ block_count_rounded:
   cmp  ax, #0x5024  ;; we look for signature "$PnP"
   jne  no_bev
   mov  ax, 2[bx]
-  cmp  ax, #0x506e 
+  cmp  ax, #0x506e
   jne  no_bev
+
+  mov  ax, 0x16[bx] ;; 0x16 is the offset of Boot Connection Vector
+  cmp  ax, #0x0000
+  je   no_bcv
+
+  ;; Option ROM has BCV. Run it now.
+  push cx       ;; Push seg
+  push ax       ;; Push offset
+
+  ;; Point ES:DI at "$PnP", which tells the ROM that we are a PnP BIOS.
+  mov  bx, #0xf000
+  mov  es, bx
+  lea  di, pnp_string
+  /* jump to BCV function entry pointer */
+  mov  bp, sp   ;; Call ROM BCV routine using seg:off on stack
+  db   0xff     ;; call_far ss:[bp+0]
+  db   0x5e
+  db   0
+  cli           ;; In case expansion ROM BIOS turns IF on
+  add  sp, #2   ;; Pop offset value
+  pop  cx       ;; Pop seg value (restore CX)
+  jmp   no_bev
+
+no_bcv:
   mov  ax, 0x1a[bx] ;; 0x1A is also the offset into the expansion header of...
   cmp  ax, #0x0000  ;; the Bootstrap Entry Vector, or zero if there is none.
   je   no_bev
 
-  ;; Found a device that thinks it can boot the system.  Record its BEV.
-  mov  bx, #IPL_SEG            ;; Go to the segment where the IPL table lives 
+  ;; Found a device that thinks it can boot the system.  Record its BEV and product name string.
+  mov  di, 0x10[bx]            ;; Pointer to the product name string or zero if none
+  xor  bx, bx
   mov  ds, bx
+  mov  bx, word ptr [0x40E]    ;; EBDA segment
+  mov  ds, bx                  ;; Go to the segment where the IPL table lives
   mov  bx, IPL_COUNT_OFFSET    ;; Read the number of entries so far
   cmp  bx, #IPL_TABLE_ENTRIES
   je   no_bev                  ;; Get out if the table is full
   shl  bx, #0x4                ;; Turn count into offset (entries are 16 bytes)
-  mov  0[bx], #0x80            ;; This entry is a BEV device
-  mov  6[bx], cx               ;; Build a far pointer from the segment...
-  mov  4[bx], ax               ;; and the offset
+  mov  IPL_TABLE_OFFSET+0[bx], #IPL_TYPE_BEV ;; This entry is a BEV device
+  mov  IPL_TABLE_OFFSET+6[bx], cx            ;; Build a far pointer from the segment...
+  mov  IPL_TABLE_OFFSET+4[bx], ax            ;; and the offset
+  cmp  di, #0x0000
+  je   no_prod_str
+  mov  0xA[bx], cx             ;; Build a far pointer from the segment...
+  mov  8[bx], di               ;; and the offset
+no_prod_str:
   shr  bx, #0x4                ;; Turn the offset back into a count
   inc  bx                      ;; We have one more entry now
   mov  IPL_COUNT_OFFSET, bx    ;; Remember that.
@@ -9735,7 +10621,8 @@ rom_scan_increment:
   shl  ax, #5   ;; convert 512-bytes blocks to 16-byte increments
                 ;; because the segment selector is shifted left 4 bits.
   add  cx, ax
-  cmp  cx, #0xe000
+  pop  ax       ;; Restore AX
+  cmp  cx, ax
   jbe  rom_scan_loop
 
   xor  ax, ax   ;; Restore DS back to 0000:
@@ -9800,10 +10687,36 @@ tcpa_post_part2:
 #endif
 
 
-;; for 'C' strings and other data, insert them here with
-;; a the following hack:
-;; DATA_SEG_DEFS_HERE
+post_init_pic:
+  mov al, #0x11 ; send initialisation commands
+  out 0x20, al
+  out 0xa0, al
+  mov al, #0x08
+  out 0x21, al
+  mov al, #0x70
+  out 0xa1, al
+  mov al, #0x04
+  out 0x21, al
+  mov al, #0x02
+  out 0xa1, al
+  mov al, #0x01
+  out 0x21, al
+  out 0xa1, al
+  mov  al, #0xb8
+  out  0x21, AL ;master pic: unmask IRQ 0, 1, 2, 6
+#if BX_USE_PS2_MOUSE
+  mov  al, #0x8f
+#else
+  mov  al, #0x9f
+#endif
+  out  0xa1, AL ;slave  pic: unmask IRQ 12, 13, 14
+  ret
 
+;; the following area can be used to write dynamically generated tables
+  .align 16
+bios_table_area_start:
+  dd 0xaafb4442
+  dd bios_table_area_end - bios_table_area_start - 8;
 
 ;--------
 ;- POST -
@@ -9839,17 +10752,66 @@ post:
 
   ;; Examine CMOS shutdown status.
   mov al, bl
-  mov dx, #EBDA_SEG
-  mov ds, dx
-  mov [EBDA_CMOS_SHUTDOWN_STATUS_OFFSET], AL
+
+  ;; 0x00, 0x09, 0x0D+ = normal startup
+  cmp AL, #0x00
+  jz normal_post
+  cmp AL, #0x0d
+  jae normal_post
+  cmp AL, #0x09
+  je normal_post
+
+  ;; 0x05 = eoi + jmp via [0x40:0x67] jump
+  cmp al, #0x05
+  je  eoi_jmp_post
+
+  ;; 0x0A = jmp via [0x40:0x67] jump
+  cmp al, #0x0a
+  je  jmp_post_0x467
+
+  ;; 0x0B = iret via [0x40:0x67]
+  cmp al, #0x0b
+  je  iret_post_0x467
+
+  ;; 0x0C = retf via [0x40:0x67]
+  cmp al, #0x0c
+  je  retf_post_0x467
+
+  ;; Examine CMOS shutdown status.
+  ;;  0x01,0x02,0x03,0x04,0x06,0x07,0x08 = Unimplemented shutdown status.
+  push bx
+  call _shutdown_status_panic
+
+#if 0
+  HALT(__LINE__)
+  ;
+  ;#if 0
+  ;  0xb0, 0x20,       /* mov al, #0x20 */
+  ;  0xe6, 0x20,       /* out 0x20, al    ;send EOI to PIC */
+  ;#endif
+  ;
+  pop es
+  pop ds
+  popa
+  iret
+#endif
+
+normal_post:
+  ; case 0: normal startup
 
   cli
   mov  ax, #0xfffe
   mov  sp, ax
-  mov  ax, #0x0000
+  xor  ax, ax
   mov  ds, ax
   mov  ss, ax
 
+  ;; Save shutdown status
+  mov 0x04b0, bl
+
+  cmp bl, #0xfe
+  jz s3_post
+
   ;; zero out BIOS data area (40:00..40:ff)
   mov  es, ax
   mov  cx, #0x0080 ;; 128 words
@@ -9861,18 +10823,16 @@ post:
   call _log_bios_start
 
   ;; set all interrupts to default handler
-  mov  bx, #0x0000    ;; offset index
+  xor  bx, bx         ;; offset index
   mov  cx, #0x0100    ;; counter (256 interrupts)
   mov  ax, #dummy_iret_handler
   mov  dx, #0xF000
 
 post_default_ints:
   mov  [bx], ax
-  inc  bx
-  inc  bx
+  add  bx, #2
   mov  [bx], dx
-  inc  bx
-  inc  bx
+  add  bx, #2
   loop post_default_ints
 
   ;; set vector 0x79 to zero
@@ -9883,7 +10843,6 @@ post_default_ints:
   mov  ax, #BASE_MEM_IN_K
   mov  0x0413, ax
 
-
   ;; Manufacturing Test 40:12
   ;;   zerod out above
 
@@ -10028,67 +10987,41 @@ post_default_ints:
   SET_INT_VECTOR(0x10, #0xF000, #int10_handler)
 
   ;; PIC
-  mov al, #0x11 ; send initialisation commands
-  out 0x20, al
-  out 0xa0, al
-  mov al, #0x08
-  out 0x21, al
-  mov al, #0x70
-  out 0xa1, al
-  mov al, #0x04
-  out 0x21, al
-  mov al, #0x02
-  out 0xa1, al
-  mov al, #0x01
-  out 0x21, al
-  out 0xa1, al
-  mov  al, #0xb8
-  out  0x21, AL ;master pic: unmask IRQ 0, 1, 2, 6
-#if BX_USE_PS2_MOUSE
-  mov  al, #0x8f
-#else
-  mov  al, #0x9f
-#endif
-  out  0xa1, AL ;slave  pic: unmask IRQ 12, 13, 14
-
-#ifdef HVMASSIST
-  call _enable_rom_write_access
-  call _clobber_entry_point
-  call _copy_e820_table
-  call smbios_init
-  call _disable_rom_write_access
-#endif
-
-  call _init_boot_vectors
+  call post_init_pic
 
+  mov  cx, #0xc000  ;; init vga bios
+  mov  ax, #0xc780
   call rom_scan
 
-  call _print_bios_banner 
+  call _print_bios_banner
+
+#if BX_ROMBIOS32
+  call rombios32_init
+#else
+#if BX_PCIBIOS && !defined(HVMASSIST)
+  call pcibios_init_iomem_bases
+  call pcibios_init_irqs
+#endif //BX_PCIBIOS
+#endif
 
   ;;
   ;; Floppy setup
   ;;
   call floppy_drive_post
 
-#if BX_USE_ATADRV
-
   ;;
   ;; Hard Drive setup
   ;;
   call hard_drive_post
 
+#if BX_USE_ATADRV
+
   ;;
   ;; ATA/ATAPI driver setup
   ;;
   call _ata_init
   call _ata_detect
   ;;
-#else // BX_USE_ATADRV
-
-  ;;
-  ;; Hard Drive setup
-  ;;
-  call hard_drive_post
 
 #endif // BX_USE_ATADRV
 
@@ -10100,13 +11033,32 @@ post_default_ints:
   ;;
 #endif // BX_ELTORITO_BOOT
 
-  call _s3_resume
+#ifdef HVMASSIST
+  call _enable_rom_write_access
+  call _clobber_entry_point
+  call _fixup_base_mem_in_k
+  call smbios_init
+#endif
+
+  call _init_boot_vectors
+
+  mov  cx, #(OPTIONROM_PHYSICAL_ADDRESS >> 4)  ;; init option roms
+  mov  ax, #(OPTIONROM_PHYSICAL_END >> 4)
+  call rom_scan
+
+#ifdef HVMASSIST
+  call _disable_rom_write_access
+#endif
+
+#if BX_ELTORITO_BOOT
   call _interactive_bootkey
+#endif // BX_ELTORITO_BOOT
 
 #if BX_TCGBIOS
   call tcpa_post_part2
 #endif
 
+  sti        ;; enable interrupts
   ;; Start the boot sequence.   See the comments in int19_relocated 
   ;; for why we use INT 18h instead of INT 19h here.
   int  #0x18
@@ -10119,7 +11071,7 @@ nmi:
   iret
 
 int75_handler:
-  out  0xf0, al         // clear irq13 
+  out  0xf0, al         // clear irq13
   call eoi_both_pics    // clear interrupt
   int  2                // legacy nmi call
   iret
@@ -10218,7 +11170,7 @@ db 0x00
 int14_handler:
   push ds
   pusha
-  mov  ax, #0x0000
+  xor  ax, ax
   mov  ds, ax
   call _int14_function
   popa
@@ -10323,26 +11275,7 @@ int09_handler:
   jz  int09_finish
 
   in  al, #0x60             ;;read key from keyboard controller
-  //test al, #0x80            ;;look for key release
-  //jnz  int09_process_key    ;; dont pass releases to intercept?
-
-  ;; check for extended key
-  cmp  al, #0xe0
-  jne int09_call_int15_4f
-  
-  push ds
-  xor  ax, ax
-  mov  ds, ax
-  mov  al, BYTE [0x496]     ;; mf2_state |= 0x01
-  or   al, #0x01
-  mov  BYTE [0x496], al
-  pop  ds
-  
-  in  al, #0x60             ;;read another key from keyboard controller
-
   sti
-
-int09_call_int15_4f:
   push  ds
   pusha
 #ifdef BX_CALL_INT15_4F
@@ -10352,8 +11285,27 @@ int09_call_int15_4f:
   jnc  int09_done
 #endif
 
+  ;; check for extended key
+  cmp  al, #0xe0
+  jne int09_check_pause
+  xor  ax, ax
+  mov  ds, ax
+  mov  al, BYTE [0x496]     ;; mf2_state |= 0x02
+  or   al, #0x02
+  mov  BYTE [0x496], al
+  jmp int09_done
+
+int09_check_pause: ;; check for pause key
+  cmp  al, #0xe1
+  jne int09_process_key
+  xor  ax, ax
+  mov  ds, ax
+  mov  al, BYTE [0x496]     ;; mf2_state |= 0x01
+  or   al, #0x01
+  mov  BYTE [0x496], al
+  jmp int09_done
 
-//int09_process_key:
+int09_process_key:
   mov   bx, #0xf000
   mov   ds, bx
   call  _int09_function
@@ -10371,8 +11323,6 @@ int09_finish:
   iret
 
 
-
-
 ;----------------------------------------
 ;- INT 13h Diskette Service Entry Point -
 ;----------------------------------------
@@ -10411,7 +11361,7 @@ int0e_loop2:
   je int0e_loop2
 int0e_normal:
   push ds
-  mov  ax, #0x0000 ;; segment 0000
+  xor  ax, ax ;; segment 0000
   mov  ds, ax
   call eoi_master_pic
   mov  al, 0x043e
@@ -10448,7 +11398,7 @@ db  0x08
 int17_handler:
   push ds
   pusha
-  mov  ax, #0x0000
+  xor  ax, ax
   mov  ds, ax
   call _int17_function
   popa
@@ -10638,11 +11588,11 @@ int1a_callfunction:
 ;;
 int70_handler:
   push ds
-  pusha
+  pushad
   xor  ax, ax
   mov  ds, ax
   call _int70_function
-  popa
+  popad
   pop  ds
   iret
 
@@ -10700,7 +11650,7 @@ int08_store_ticks:
 
 
 .org 0xff00
-.ascii "(c) 2002 MandrakeSoft S.A. Written by Kevin Lawton & the Bochs team."
+.ascii BIOS_COPYRIGHT_STRING
 
 ;------------------------------------------------
 ;- IRET Instruction for Dummy Interrupt Handler -
@@ -10722,7 +11672,7 @@ dummy_iret_handler:
 #ifdef HVMTEST
   jmp 0xd000:0x0003;
 #else
-  jmp 0xf000:post
+   jmp 0xf000:post
 #endif
 
 .org 0xfff5 ; ASCII Date ROM was built - 8 characters in MM/DD/YY
@@ -10735,10 +11685,10 @@ db 0x00   ; filler
 .org 0xfa6e ;; Character Font for 320x200 & 640x200 Graphics (lower 128 characters)
 ASM_END
 /*
- * This font comes from the fntcol16.zip package (c) by  Joseph Gil 
+ * This font comes from the fntcol16.zip package (c) by  Joseph Gil
  * found at ftp://ftp.simtel.net/pub/simtelnet/msdos/screen/fntcol16.zip
  * This font is public domain
- */ 
+ */
 static Bit8u vgafont8[128*8]=
 {
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -10874,15 +11824,6 @@ static Bit8u vgafont8[128*8]=
 #ifdef HVMASSIST
 ASM_START
 
-// space for addresses in 32bit BIOS area; currently 256/4 entries
-// are allocated
-.org 0xcb00
-jmptable:
-db 0x5F, 0x5F, 0x5F, 0x4A, 0x4D, 0x50, 0x54 ;; ___JMPT
-dw 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ;;  64 bytes
-dw 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ;; 128 bytes
-dw 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 ;; 192 bytes
-
 //
 // MP Tables
 // just carve out some blank space for HVMLOADER to write the MP tables to
@@ -10914,328 +11855,10 @@ db 0,0,0,0,0,0,0,0 ; 24 bytes
 db 0,0,0,0,0,0,0   ; 31 bytes
 ASM_END
 
-#else // !HVMASSIST
-
+#endif // HVMASSIST
 ASM_START
-.org 0xcc00
+.org 0xcff0
+bios_table_area_end:
 // bcc-generated data will be placed here
-
-// For documentation of this config structure, look on developer.intel.com and
-// search for multiprocessor specification.  Note that when you change anything
-// you must update the checksum (a pain!).  It would be better to construct this
-// with C structures, or at least fill in the checksum automatically.
-//
-// Maybe this structs could be moved elsewhere than d000
-
-#if (BX_SMP_PROCESSORS==1)
-  // no structure necessary.
-#elif (BX_SMP_PROCESSORS==2)
-// define the Intel MP Configuration Structure for 2 processors at
-// APIC ID 0,1.  I/O APIC at ID=2.
-.align 16
-mp_config_table:
-  db 0x50, 0x43, 0x4d, 0x50  ;; "PCMP" signature
-  dw (mp_config_end-mp_config_table)  ;; table length
-  db 4 ;; spec rev
-  db 0x65 ;; checksum
-  .ascii "BOCHSCPU"     ;; OEM id = "BOCHSCPU"
-  db 0x30, 0x2e, 0x31, 0x20 ;; vendor id = "0.1         "
-  db 0x20, 0x20, 0x20, 0x20 
-  db 0x20, 0x20, 0x20, 0x20
-  dw 0,0 ;; oem table ptr
-  dw 0 ;; oem table size
-  dw 20 ;; entry count
-  dw 0x0000, 0xfee0 ;; memory mapped address of local APIC
-  dw 0 ;; extended table length
-  db 0 ;; extended table checksum
-  db 0 ;; reserved
-mp_config_proc0:
-  db 0 ;; entry type=processor
-  db 0 ;; local APIC id
-  db 0x11 ;; local APIC version number
-  db 3 ;; cpu flags: enabled, bootstrap processor
-  db 0,6,0,0 ;; cpu signature
-  dw 0x201,0 ;; feature flags
-  dw 0,0 ;; reserved
-  dw 0,0 ;; reserved
-mp_config_proc1:
-  db 0 ;; entry type=processor
-  db 1 ;; local APIC id
-  db 0x11 ;; local APIC version number
-  db 1 ;; cpu flags: enabled
-  db 0,6,0,0 ;; cpu signature
-  dw 0x201,0 ;; feature flags
-  dw 0,0 ;; reserved
-  dw 0,0 ;; reserved
-mp_config_isa_bus:
-  db 1 ;; entry type=bus
-  db 0 ;; bus ID
-  db 0x49, 0x53, 0x41, 0x20, 0x20, 0x20  ;; bus type="ISA   "
-mp_config_ioapic:
-  db 2 ;; entry type=I/O APIC
-  db 2 ;; apic id=2. linux will set.
-  db 0x11 ;; I/O APIC version number
-  db 1 ;; flags=1=enabled
-  dw 0x0000, 0xfec0 ;; memory mapped address of I/O APIC
-mp_config_irqs:
-  db 3 ;; entry type=I/O interrupt
-  db 0 ;; interrupt type=vectored interrupt
-  db 0,0 ;; flags po=0, el=0 (linux uses as default)
-  db 0 ;; source bus ID is ISA
-  db 0 ;; source bus IRQ
-  db 2 ;; destination I/O APIC ID
-  db 0 ;; destination I/O APIC interrrupt in
-  ;; repeat pattern for interrupts 0-15
-  db 3,0,0,0,0,1,2,1
-  db 3,0,0,0,0,2,2,2
-  db 3,0,0,0,0,3,2,3
-  db 3,0,0,0,0,4,2,4
-  db 3,0,0,0,0,5,2,5
-  db 3,0,0,0,0,6,2,6
-  db 3,0,0,0,0,7,2,7
-  db 3,0,0,0,0,8,2,8
-  db 3,0,0,0,0,9,2,9
-  db 3,0,0,0,0,10,2,10
-  db 3,0,0,0,0,11,2,11
-  db 3,0,0,0,0,12,2,12
-  db 3,0,0,0,0,13,2,13
-  db 3,0,0,0,0,14,2,14
-  db 3,0,0,0,0,15,2,15
-#elif (BX_SMP_PROCESSORS==4)
-// define the Intel MP Configuration Structure for 4 processors at
-// APIC ID 0,1,2,3.  I/O APIC at ID=4.
-.align 16
-mp_config_table:
-  db 0x50, 0x43, 0x4d, 0x50  ;; "PCMP" signature
-  dw (mp_config_end-mp_config_table)  ;; table length
-  db 4 ;; spec rev
-  db 0xdd ;; checksum
-  .ascii "BOCHSCPU"     ;; OEM id = "BOCHSCPU"
-  db 0x30, 0x2e, 0x31, 0x20 ;; vendor id = "0.1         "
-  db 0x20, 0x20, 0x20, 0x20 
-  db 0x20, 0x20, 0x20, 0x20
-  dw 0,0 ;; oem table ptr
-  dw 0 ;; oem table size
-  dw 22 ;; entry count
-  dw 0x0000, 0xfee0 ;; memory mapped address of local APIC
-  dw 0 ;; extended table length
-  db 0 ;; extended table checksum
-  db 0 ;; reserved
-mp_config_proc0:
-  db 0 ;; entry type=processor
-  db 0 ;; local APIC id
-  db 0x11 ;; local APIC version number
-  db 3 ;; cpu flags: enabled, bootstrap processor
-  db 0,6,0,0 ;; cpu signature
-  dw 0x201,0 ;; feature flags
-  dw 0,0 ;; reserved
-  dw 0,0 ;; reserved
-mp_config_proc1:
-  db 0 ;; entry type=processor
-  db 1 ;; local APIC id
-  db 0x11 ;; local APIC version number
-  db 1 ;; cpu flags: enabled
-  db 0,6,0,0 ;; cpu signature
-  dw 0x201,0 ;; feature flags
-  dw 0,0 ;; reserved
-  dw 0,0 ;; reserved
-mp_config_proc2:
-  db 0 ;; entry type=processor
-  db 2 ;; local APIC id
-  db 0x11 ;; local APIC version number
-  db 1 ;; cpu flags: enabled
-  db 0,6,0,0 ;; cpu signature
-  dw 0x201,0 ;; feature flags
-  dw 0,0 ;; reserved
-  dw 0,0 ;; reserved
-mp_config_proc3:
-  db 0 ;; entry type=processor
-  db 3 ;; local APIC id
-  db 0x11 ;; local APIC version number
-  db 1 ;; cpu flags: enabled
-  db 0,6,0,0 ;; cpu signature
-  dw 0x201,0 ;; feature flags
-  dw 0,0 ;; reserved
-  dw 0,0 ;; reserved
-mp_config_isa_bus:
-  db 1 ;; entry type=bus
-  db 0 ;; bus ID
-  db 0x49, 0x53, 0x41, 0x20, 0x20, 0x20  ;; bus type="ISA   "
-mp_config_ioapic:
-  db 2 ;; entry type=I/O APIC
-  db 4 ;; apic id=4. linux will set.
-  db 0x11 ;; I/O APIC version number
-  db 1 ;; flags=1=enabled
-  dw 0x0000, 0xfec0 ;; memory mapped address of I/O APIC
-mp_config_irqs:
-  db 3 ;; entry type=I/O interrupt
-  db 0 ;; interrupt type=vectored interrupt
-  db 0,0 ;; flags po=0, el=0 (linux uses as default)
-  db 0 ;; source bus ID is ISA
-  db 0 ;; source bus IRQ
-  db 4 ;; destination I/O APIC ID
-  db 0 ;; destination I/O APIC interrrupt in
-  ;; repeat pattern for interrupts 0-15
-  db 3,0,0,0,0,1,4,1
-  db 3,0,0,0,0,2,4,2
-  db 3,0,0,0,0,3,4,3
-  db 3,0,0,0,0,4,4,4
-  db 3,0,0,0,0,5,4,5
-  db 3,0,0,0,0,6,4,6
-  db 3,0,0,0,0,7,4,7
-  db 3,0,0,0,0,8,4,8
-  db 3,0,0,0,0,9,4,9
-  db 3,0,0,0,0,10,4,10
-  db 3,0,0,0,0,11,4,11
-  db 3,0,0,0,0,12,4,12
-  db 3,0,0,0,0,13,4,13
-  db 3,0,0,0,0,14,4,14
-  db 3,0,0,0,0,15,4,15
-#elif (BX_SMP_PROCESSORS==8)
-// define the Intel MP Configuration Structure for 8 processors at
-// APIC ID 0,1,2,3,4,5,6,7.  I/O APIC at ID=8.
-.align 16
-mp_config_table:
-  db 0x50, 0x43, 0x4d, 0x50  ;; "PCMP" signature
-  dw (mp_config_end-mp_config_table)  ;; table length
-  db 4 ;; spec rev
-  db 0xc3 ;; checksum
-  .ascii "BOCHSCPU"     ;; OEM id = "BOCHSCPU"
-  db 0x30, 0x2e, 0x31, 0x20 ;; vendor id = "0.1         "
-  db 0x20, 0x20, 0x20, 0x20 
-  db 0x20, 0x20, 0x20, 0x20
-  dw 0,0 ;; oem table ptr
-  dw 0 ;; oem table size
-  dw 26 ;; entry count
-  dw 0x0000, 0xfee0 ;; memory mapped address of local APIC
-  dw 0 ;; extended table length
-  db 0 ;; extended table checksum
-  db 0 ;; reserved
-mp_config_proc0:
-  db 0 ;; entry type=processor
-  db 0 ;; local APIC id
-  db 0x11 ;; local APIC version number
-  db 3 ;; cpu flags: enabled, bootstrap processor
-  db 0,6,0,0 ;; cpu signature
-  dw 0x201,0 ;; feature flags
-  dw 0,0 ;; reserved
-  dw 0,0 ;; reserved
-mp_config_proc1:
-  db 0 ;; entry type=processor
-  db 1 ;; local APIC id
-  db 0x11 ;; local APIC version number
-  db 1 ;; cpu flags: enabled
-  db 0,6,0,0 ;; cpu signature
-  dw 0x201,0 ;; feature flags
-  dw 0,0 ;; reserved
-  dw 0,0 ;; reserved
-mp_config_proc2:
-  db 0 ;; entry type=processor
-  db 2 ;; local APIC id
-  db 0x11 ;; local APIC version number
-  db 1 ;; cpu flags: enabled
-  db 0,6,0,0 ;; cpu signature
-  dw 0x201,0 ;; feature flags
-  dw 0,0 ;; reserved
-  dw 0,0 ;; reserved
-mp_config_proc3:
-  db 0 ;; entry type=processor
-  db 3 ;; local APIC id
-  db 0x11 ;; local APIC version number
-  db 1 ;; cpu flags: enabled
-  db 0,6,0,0 ;; cpu signature
-  dw 0x201,0 ;; feature flags
-  dw 0,0 ;; reserved
-  dw 0,0 ;; reserved
-mp_config_proc4:
-  db 0 ;; entry type=processor
-  db 4 ;; local APIC id
-  db 0x11 ;; local APIC version number
-  db 1 ;; cpu flags: enabled
-  db 0,6,0,0 ;; cpu signature
-  dw 0x201,0 ;; feature flags
-  dw 0,0 ;; reserved
-  dw 0,0 ;; reserved
-mp_config_proc5:
-  db 0 ;; entry type=processor
-  db 5 ;; local APIC id
-  db 0x11 ;; local APIC version number
-  db 1 ;; cpu flags: enabled
-  db 0,6,0,0 ;; cpu signature
-  dw 0x201,0 ;; feature flags
-  dw 0,0 ;; reserved
-  dw 0,0 ;; reserved
-mp_config_proc6:
-  db 0 ;; entry type=processor
-  db 6 ;; local APIC id
-  db 0x11 ;; local APIC version number
-  db 1 ;; cpu flags: enabled
-  db 0,6,0,0 ;; cpu signature
-  dw 0x201,0 ;; feature flags
-  dw 0,0 ;; reserved
-  dw 0,0 ;; reserved
-mp_config_proc7:
-  db 0 ;; entry type=processor
-  db 7 ;; local APIC id
-  db 0x11 ;; local APIC version number
-  db 1 ;; cpu flags: enabled
-  db 0,6,0,0 ;; cpu signature
-  dw 0x201,0 ;; feature flags
-  dw 0,0 ;; reserved
-  dw 0,0 ;; reserved
-mp_config_isa_bus:
-  db 1 ;; entry type=bus
-  db 0 ;; bus ID
-  db 0x49, 0x53, 0x41, 0x20, 0x20, 0x20  ;; bus type="ISA   "
-mp_config_ioapic:
-  db 2 ;; entry type=I/O APIC
-  db 8 ;; apic id=8
-  db 0x11 ;; I/O APIC version number
-  db 1 ;; flags=1=enabled
-  dw 0x0000, 0xfec0 ;; memory mapped address of I/O APIC
-mp_config_irqs:
-  db 3 ;; entry type=I/O interrupt
-  db 0 ;; interrupt type=vectored interrupt
-  db 0,0 ;; flags po=0, el=0 (linux uses as default)
-  db 0 ;; source bus ID is ISA
-  db 0 ;; source bus IRQ
-  db 8 ;; destination I/O APIC ID
-  db 0 ;; destination I/O APIC interrrupt in
-  ;; repeat pattern for interrupts 0-15
-  db 3,0,0,0,0,1,8,1
-  db 3,0,0,0,0,2,8,2
-  db 3,0,0,0,0,3,8,3
-  db 3,0,0,0,0,4,8,4
-  db 3,0,0,0,0,5,8,5
-  db 3,0,0,0,0,6,8,6
-  db 3,0,0,0,0,7,8,7
-  db 3,0,0,0,0,8,8,8
-  db 3,0,0,0,0,9,8,9
-  db 3,0,0,0,0,10,8,10
-  db 3,0,0,0,0,11,8,11
-  db 3,0,0,0,0,12,8,12
-  db 3,0,0,0,0,13,8,13
-  db 3,0,0,0,0,14,8,14
-  db 3,0,0,0,0,15,8,15
-#else
-#  error Sorry, rombios only has configurations for 1, 2, 4 or 8 processors.
-#endif  // if (BX_SMP_PROCESSORS==...)
-
-mp_config_end:   // this label used to find length of mp structure
- db 0
-
-#if (BX_SMP_PROCESSORS>1)
-.align 16
-mp_floating_pointer_structure:
-db 0x5f, 0x4d, 0x50, 0x5f   ; "_MP_" signature
-dw mp_config_table, 0xf ;; pointer to MP configuration table
-db 1     ;; length of this struct in 16-bit byte chunks
-db 4     ;; MP spec revision
-db 0xc1  ;; checksum
-db 0     ;; MP feature byte 1.  value 0 means look at the config table
-db 0,0,0,0     ;; MP feature bytes 2-5.
-#endif
-
 ASM_END
 
-#endif // HVMASSIST
diff --git a/tools/firmware/rombios/rombios.h b/tools/firmware/rombios/rombios.h
new file mode 100644 (file)
index 0000000..93d12a4
--- /dev/null
@@ -0,0 +1,70 @@
+/////////////////////////////////////////////////////////////////////////
+// $Id: rombios.h,v 1.8 2008/12/04 18:48:33 sshwarts Exp $
+/////////////////////////////////////////////////////////////////////////
+//
+//  Copyright (C) 2006 Volker Ruppert
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+
+/* define it to include QEMU specific code */
+//#define BX_QEMU
+#define LEGACY
+
+#ifndef LEGACY
+#  define BX_ROMBIOS32     1
+#else
+#  define BX_ROMBIOS32     0
+#endif
+#define DEBUG_ROMBIOS    0
+
+#define PANIC_PORT  0x400
+#define PANIC_PORT2 0x401
+#define INFO_PORT   0x402
+#define DEBUG_PORT  0x403
+
+#define BIOS_PRINTF_HALT     1
+#define BIOS_PRINTF_SCREEN   2
+#define BIOS_PRINTF_INFO     4
+#define BIOS_PRINTF_DEBUG    8
+#define BIOS_PRINTF_ALL      (BIOS_PRINTF_SCREEN | BIOS_PRINTF_INFO)
+#define BIOS_PRINTF_DEBHALT  (BIOS_PRINTF_SCREEN | BIOS_PRINTF_INFO | BIOS_PRINTF_HALT)
+
+#define printf(format, p...)  bios_printf(BIOS_PRINTF_SCREEN, format, ##p)
+
+// Defines the output macros.
+// BX_DEBUG goes to INFO port until we can easily choose debug info on a
+// per-device basis. Debug info are sent only in debug mode
+#if DEBUG_ROMBIOS
+#  define BX_DEBUG(format, p...)  bios_printf(BIOS_PRINTF_INFO, format, ##p)
+#else
+#  define BX_DEBUG(format, p...)
+#endif
+#define BX_INFO(format, p...)   bios_printf(BIOS_PRINTF_INFO, format, ##p)
+#define BX_PANIC(format, p...)  bios_printf(BIOS_PRINTF_DEBHALT, format, ##p)
+
+#define ACPI_DATA_SIZE    0x00010000L
+#define PM_IO_BASE        0xb000
+#define SMB_IO_BASE       0xb100
+
+  // Define the application NAME
+#if defined(HVMASSIST)
+#  define BX_APPNAME "HVMAssist"
+#elif defined(BX_QEMU)
+#  define BX_APPNAME "QEMU"
+#elif defined(PLEX86)
+#  define BX_APPNAME "Plex86"
+#else
+#  define BX_APPNAME "Bochs"
+#endif
index 9adba404fc78cf4664eabfa22c64f55819c8f8c3..c7ec261081bce5209cb5a29c619c0524cb1bde72 100644 (file)
   Support for TCPA ACPI logging
  ******************************************************************/
 
-/*
- * Extend the ACPI log with the given entry by copying the
- * entry data into the log.
- * Input
- *  Pointer to the structure to be copied into the log
- *
- * Output:
- *  lower 16 bits of return code contain entry number
- *  if entry number is '0', then upper 16 bits contain error code.
- */
-Bit32u tcpa_extend_acpi_log(entry_ptr)
-    Bit32u entry_ptr;
-{
-       ASM_START
-       DoUpcall(IDX_TCPA_EXTEND_ACPI_LOG)
-       ASM_END
-}
-
-
-/*
-   initialize the TCPA ACPI subsystem; find the ACPI tables and determine
-   where the TCPA table is.
- */
- void
-tcpa_acpi_init()
-{
-       ASM_START
-       DoUpcall(IDX_TCPA_ACPI_INIT)
-       ASM_END
-}
-
-
-/*
- * Add measurement to log about call of int 19h
- */
- void
-tcpa_calling_int19h()
-{
-       ASM_START
-       DoUpcall(IDX_TCPA_CALLING_INT19H)
-       ASM_END
-}
-
-/*
- * Add measurement to log about retuning from int 19h
- */
- void
-tcpa_returned_int19h()
-{
-       ASM_START
-       DoUpcall(IDX_TCPA_RETURNED_INT19H)
-       ASM_END
-}
-
-/*
- * Add event separators for PCRs 0 to 7; specs 8.2.3
- */
- void
-tcpa_add_event_separators()
-{
-       ASM_START
-       DoUpcall(IDX_TCPA_ADD_EVENT_SEPARATORS)
-       ASM_END
-}
-
-
-/*
- * Add a wake event to the log
- */
- void
-tcpa_wake_event()
-{
-       ASM_START
-       DoUpcall(IDX_TCPA_WAKE_EVENT)
-       ASM_END
-}
-
-
-/*
- * Add measurement to the log about option rom scan
- * 10.4.3 : action 14
- */
- void
-tcpa_start_option_rom_scan()
-{
-       ASM_START
-       DoUpcall(IDX_TCPA_START_OPTION_ROM_SCAN)
-       ASM_END
-}
-
-
-/*
- * Add measurement to the log about an option rom
- */
- void
-tcpa_option_rom(seg)
-    Bit32u seg;
-{
-       ASM_START
-       DoUpcall(IDX_TCPA_OPTION_ROM)
-       ASM_END
-}
-
-/*
- * Add a measurement regarding the boot device (CDRom, Floppy, HDD) to
- * the list of measurements.
- */
-void
- tcpa_add_bootdevice(bootcd, bootdrv)
-  Bit32u bootcd;
-  Bit32u bootdrv;
-{
-       ASM_START
-       DoUpcall(IDX_TCPA_ADD_BOOTDEVICE)
-       ASM_END
-}
-
-/*
- * Add a measurement to the log in support of 8.2.5.3
- * Creates two log entries
- *
- * Input parameter:
- *  seg    : segment where the IPL data are located
- */
- void
- tcpa_ipl(bootcd,seg,off,count)
-    Bit32u bootcd;
-    Bit32u seg;
-    Bit32u off;
-    Bit32u count;
-{
-       ASM_START
-       DoUpcall(IDX_TCPA_IPL)
-       ASM_END
-}
-
-
-Bit32u
-tcpa_initialize_tpm(physpres)
-  Bit32u physpres;
-{
-       ASM_START
-       DoUpcall(IDX_TCPA_INITIALIZE_TPM)
-       ASM_END
-}
-
-void
-tcpa_measure_post(from, to)
-   Bit32u from;
-   Bit32u to;
-{
-       ASM_START
-       DoUpcall(IDX_TCPA_MEASURE_POST)
-       ASM_END
-}
-
 ASM_START
 MACRO POST_MEASURE
        push word #0x000f
@@ -208,18 +52,6 @@ tcpa_do_measure_POSTs()
        ASM_END
 }
 
-Bit32u
-TCGInterruptHandler(regs_ptr, es, ds, flags_ptr)
-   Bit32u regs_ptr;
-   Bit16u es;
-   Bit16u ds;
-   Bit32u flags_ptr;
-{
-       ASM_START
-       DoUpcall(IDX_TCGINTERRUPTHANDLER)
-       ASM_END
-}
-
 /*
  * C-dispatcher for the TCG BIOS functions
  */
index 87f7414a4b59361e5eec5d2c0137b839d07ece7f..3fc786d851b976d05d8cad9545d379d3e0b84bf4 100644 (file)
@@ -38,8 +38,6 @@
 #include "vbe.h"
 #include "vbetables.h"
 
-#define VBE_TOTAL_VIDEO_MEMORY_DIV_64K (VBE_DISPI_TOTAL_VIDEO_MEMORY_MB*1024/64)
-
 // The current OEM Software Revision of this VBE Bios
 #define VBE_OEM_SOFTWARE_REV 0x0002;
 
@@ -821,7 +819,8 @@ Bit16u *AX;Bit16u ES;Bit16u DI;
         vbe_info_block.VideoModePtr_Off= DI + 34;
 
         // VBE Total Memory (in 64b blocks)
-        vbe_info_block.TotalMemory = VBE_TOTAL_VIDEO_MEMORY_DIV_64K;
+        outw(VBE_DISPI_IOPORT_INDEX, VBE_DISPI_INDEX_VIDEO_MEMORY_64K);
+        vbe_info_block.TotalMemory = inw(VBE_DISPI_IOPORT_DATA);
 
         if (vbe2_info)
        {
@@ -846,7 +845,8 @@ Bit16u *AX;Bit16u ES;Bit16u DI;
         do
         {
                 if ((cur_info->info.XResolution <= dispi_get_max_xres()) &&
-                    (cur_info->info.BitsPerPixel <= dispi_get_max_bpp())) {
+                    (cur_info->info.BitsPerPixel <= dispi_get_max_bpp()) &&
+                    (cur_info->info.XResolution * cur_info->info.YResolution * cur_info->info.BitsPerPixel <= vbe_info_block.TotalMemory << 19 )) {
 #ifdef DEBUG
                   printf("VBE found mode %x => %x\n", cur_info->mode,cur_mode);
 #endif
@@ -855,7 +855,7 @@ Bit16u *AX;Bit16u ES;Bit16u DI;
                   cur_ptr+=2;
                 } else {
 #ifdef DEBUG
-                  printf("VBE mode %x (xres=%x / bpp=%02x) not supported by display\n", cur_info->mode,cur_info->info.XResolution,cur_info->info.BitsPerPixel);
+                  printf("VBE mode %x (xres=%x / bpp=%02x) not supported \n", cur_info->mode,cur_info->info.XResolution,cur_info->info.BitsPerPixel);
 #endif
                 }
                 cur_info++;
@@ -913,7 +913,13 @@ Bit16u *AX;Bit16u CX; Bit16u ES;Bit16u DI;
                   info.WinFuncPtr = 0xC0000000UL;
                   *(Bit16u *)&(info.WinFuncPtr) = (Bit16u)(dispi_set_bank_farcall);
                 }
-                
+                outw(VBE_DISPI_IOPORT_INDEX,VBE_DISPI_INDEX_LFB_ADDRESS_H);
+                info.PhysBasePtr = inw(VBE_DISPI_IOPORT_DATA);
+                info.PhysBasePtr = info.PhysBasePtr << 16;
+#if 0                                  
+                outw(VBE_DISPI_IOPORT_INDEX,VBE_DISPI_INDEX_LFB_ADDRESS_L);
+                info.PhysBasePtr |= inw(VBE_DISPI_IOPORT_DATA);
+#endif                                                         
                 result = 0x4f;
         }
         else
index 60434ac7d79599092c66efad6d803b012ec65a27..3b78582780f91e027a8edf21e9e1cb4927668d03 100644 (file)
@@ -275,39 +275,41 @@ typedef struct ModeInfoListItem
 //        like 0xE0000000
 
 
-  #define VBE_DISPI_BANK_ADDRESS          0xA0000
-  #define VBE_DISPI_BANK_SIZE_KB          64
+  #define VBE_DISPI_BANK_ADDRESS           0xA0000
+  #define VBE_DISPI_BANK_SIZE_KB           64
   
-  #define VBE_DISPI_MAX_XRES              1024
-  #define VBE_DISPI_MAX_YRES              768
+  #define VBE_DISPI_MAX_XRES               2560
+  #define VBE_DISPI_MAX_YRES               1600
   
-  #define VBE_DISPI_IOPORT_INDEX          0x01CE
-  #define VBE_DISPI_IOPORT_DATA           0x01CF
+  #define VBE_DISPI_IOPORT_INDEX           0x01CE
+  #define VBE_DISPI_IOPORT_DATA            0x01CF
   
-  #define VBE_DISPI_INDEX_ID              0x0
-  #define VBE_DISPI_INDEX_XRES            0x1
-  #define VBE_DISPI_INDEX_YRES            0x2
-  #define VBE_DISPI_INDEX_BPP             0x3
-  #define VBE_DISPI_INDEX_ENABLE          0x4
-  #define VBE_DISPI_INDEX_BANK            0x5
-  #define VBE_DISPI_INDEX_VIRT_WIDTH      0x6
-  #define VBE_DISPI_INDEX_VIRT_HEIGHT     0x7
-  #define VBE_DISPI_INDEX_X_OFFSET        0x8
-  #define VBE_DISPI_INDEX_Y_OFFSET        0x9
-      
-  #define VBE_DISPI_ID0                   0xB0C0
-  #define VBE_DISPI_ID1                   0xB0C1
-  #define VBE_DISPI_ID2                   0xB0C2
-  #define VBE_DISPI_ID3                   0xB0C3
-  #define VBE_DISPI_ID4                   0xB0C4
-  
-  #define VBE_DISPI_DISABLED              0x00
-  #define VBE_DISPI_ENABLED               0x01
-  #define VBE_DISPI_GETCAPS               0x02
-  #define VBE_DISPI_8BIT_DAC              0x20
-  #define VBE_DISPI_LFB_ENABLED           0x40
-  #define VBE_DISPI_NOCLEARMEM            0x80
-  
-  #define VBE_DISPI_LFB_PHYSICAL_ADDRESS  0xE0000000
+  #define VBE_DISPI_INDEX_ID               0x0
+  #define VBE_DISPI_INDEX_XRES             0x1
+  #define VBE_DISPI_INDEX_YRES             0x2
+  #define VBE_DISPI_INDEX_BPP              0x3
+  #define VBE_DISPI_INDEX_ENABLE           0x4
+  #define VBE_DISPI_INDEX_BANK             0x5
+  #define VBE_DISPI_INDEX_VIRT_WIDTH       0x6
+  #define VBE_DISPI_INDEX_VIRT_HEIGHT      0x7
+  #define VBE_DISPI_INDEX_X_OFFSET         0x8
+  #define VBE_DISPI_INDEX_Y_OFFSET         0x9
+  #define VBE_DISPI_INDEX_VIDEO_MEMORY_64K 0xa
+  #define VBE_DISPI_INDEX_LFB_ADDRESS_H    0xb
+  #define VBE_DISPI_INDEX_LFB_ADDRESS_L    0xc
+
+  #define VBE_DISPI_LFB_PHYSICAL_ADDRESS   0xE0000000
+  #define VBE_DISPI_ID0                    0xB0C0
+  #define VBE_DISPI_ID1                    0xB0C1
+  #define VBE_DISPI_ID2                    0xB0C2
+  #define VBE_DISPI_ID3                    0xB0C3
+  #define VBE_DISPI_ID4                    0xB0C4
+
+  #define VBE_DISPI_DISABLED               0x00
+  #define VBE_DISPI_ENABLED                0x01
+  #define VBE_DISPI_GETCAPS                0x02
+  #define VBE_DISPI_8BIT_DAC               0x20
+  #define VBE_DISPI_LFB_ENABLED            0x40
+  #define VBE_DISPI_NOCLEARMEM             0x80
 
 #endif
index 7014a16e77ed6262ca69361d89ccd8ba706aba38..64f3f0dfd25f46f28d6d72a1d2c99900cf0a6da4 100644 (file)
@@ -2,7 +2,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 
-#define VBE_DISPI_TOTAL_VIDEO_MEMORY_MB 8
+#define VBE_DISPI_TOTAL_VIDEO_MEMORY_MB 16
 
 typedef struct {
     int width;
@@ -42,19 +42,40 @@ ModeInfo modes[] = {
 { 1600, 1200, 24                      , 0x11F},
 
       /* BOCHS/PLE, 86 'own' mode numbers */
-{ 320, 200, 32                        , 0x140},
-{ 640, 400, 32                        , 0x141},
-{ 640, 480, 32                        , 0x142},
-{ 800, 600, 32                        , 0x143},
-{ 1024, 768, 32                       , 0x144},
-{ 1280, 1024, 32                      , 0x145},
-{ 320, 200, 8                           , 0x146},
-{ 1600, 1200, 32                      , 0x147},
-{ 1152, 864, 8                      , 0x148},
+{ 320, 200, 32                       , 0x140},
+{ 640, 400, 32                       , 0x141},
+{ 640, 480, 32                       , 0x142},
+{ 800, 600, 32                       , 0x143},
+{ 1024, 768, 32                      , 0x144},
+{ 1280, 1024, 32                     , 0x145},
+{ 320, 200, 8                        , 0x146},
+{ 1600, 1200, 32                     , 0x147},
+{ 1152, 864, 8                       , 0x148},
 { 1152, 864, 15                      , 0x149},
 { 1152, 864, 16                      , 0x14a},
 { 1152, 864, 24                      , 0x14b},
 { 1152, 864, 32                      , 0x14c},
+{ 1280, 800, 16                      , 0x178},
+{ 1280, 800, 24                      , 0x179},
+{ 1280, 800, 32                      , 0x17a},
+{ 1280, 960, 16                      , 0x17b},
+{ 1280, 960, 24                      , 0x17c},
+{ 1280, 960, 32                      , 0x17d},
+{ 1440, 900, 16                      , 0x17e},
+{ 1440, 900, 24                      , 0x17f},
+{ 1440, 900, 32                      , 0x180},
+{ 1400, 1050, 16                     , 0x181},
+{ 1400, 1050, 24                     , 0x182},
+{ 1400, 1050, 32                     , 0x183},
+{ 1680, 1050, 16                     , 0x184},
+{ 1680, 1050, 24                     , 0x185},
+{ 1680, 1050, 32                     , 0x186},
+{ 1920, 1200, 16                     , 0x187},
+{ 1920, 1200, 24                     , 0x188},
+{ 1920, 1200, 32                     , 0x189},
+{ 2560, 1600, 16                     , 0x18a},
+{ 2560, 1600, 24                     , 0x18b},
+{ 2560, 1600, 32                     , 0x18c},
 { 0, },
 };
 
index 3fd9f2f680909ada5e1ccbb82902cdd740ce8074..a9dbe008d31b65a522ae0e7cac0e19c2fa7674e9 100644 (file)
@@ -3811,9 +3811,9 @@ void printf(s)
         for (i=0; i<format_width; i++) {
           nibble = (arg >> (4 * digit)) & 0x000f;
           if (nibble <= 9)
-            outb(0x0500, nibble + '0');
+            outb(0xe9, nibble + '0');
           else
-            outb(0x0500, (nibble - 10) + 'A');
+            outb(0xe9, (nibble - 10) + 'A');
           digit--;
           }
         in_format = 0;
@@ -3823,7 +3823,7 @@ void printf(s)
       //  }
       }
     else {
-      outb(0x0500, c);
+      outb(0xe9, c);
       }
     s ++;
     }
index 8c085b929b8dc495dd5a875c65ca67ad8757975d..c03fc8aca84b0b57660a1745f828cbf949a27e62 100644 (file)
@@ -16,7 +16,6 @@ CFLAGS   += $(INCLUDES) -I./include -I$(XEN_LIBXC) -I$(XEN_INCLUDE)
 # Get gcc to generate the dependencies for us.
 CFLAGS   += -Wp,-MD,.$(@F).d
 LDFLAGS  += -L.
-DEPS     = .*.d
 
 LIB_OBJS := $(patsubst %.c,%.o,$(SRCS))
 PIC_OBJS := $(patsubst %.c,%.opic,$(SRCS))
index 7c38525ba87b64987d628e88a7ce9fe64fbfbca4..8b404214c242a996d1499256113d862ba85d1774 100644 (file)
@@ -7,9 +7,6 @@ LIBFLASK_ROOT = $(XEN_ROOT)/tools/flask/libflask
 
 PROFILE=#-pg
 BASECFLAGS=-Wall -g -Werror
-# Make gcc generate dependencies.
-BASECFLAGS += -Wp,-MD,.$(@F).d
-PROG_DEP = .*.d
 BASECFLAGS+= $(PROFILE)
 #BASECFLAGS+= -I$(XEN_ROOT)/tools
 BASECFLAGS+= $(CFLAGS_libxenctrl)
@@ -39,7 +36,7 @@ $(CLIENTS_OBJS): $(CLIENTS_SRCS)
 clean: 
        rm -f *.o *.opic *.so
        rm -f $(CLIENTS)
-       $(RM) $(PROG_DEP)
+       $(RM) $(DEPS)
 
 .PHONY: print-dir
 print-dir:
@@ -54,7 +51,7 @@ install: all
        $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
        $(INSTALL_PROG) $(CLIENTS) $(DESTDIR)$(SBINDIR)
 
--include $(PROG_DEP)
+-include $(DEPS)
 
 # never delete any intermediate files.
 .SECONDARY:
index 62920fc68ea02a22ae5f2a756bbc61cde42debbd..85651cf1fb6163b95763b6b948e214313fdd462e 100644 (file)
@@ -74,7 +74,7 @@ allow dom0_t iomem_t:mmu {map_read map_write};
 allow dom0_t pirq_t:event {vector};
 allow dom0_t xen_t:mmu {memorymap};
 
-allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust};
+allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust updatemp};
 allow dom0_t dom0_t:grant {query setup};
 allow dom0_t dom0_t:domain {scheduler getdomaininfo getvcpuinfo getvcpuaffinity};
 
@@ -112,6 +112,7 @@ allow domU_t evchnU-0_t:event {send};
 
 allow dom0_t dom0_t:event {send};
 allow dom0_t domU_t:grant {copy};
+allow domU_t domU_t:grant {copy};
 
 manage_domain(dom0_t, domU_t)
 
index dc5fc446ce1254b1e4be2666c74d6fb520faadd7..db6ce6f9cf49aa4ef543b0d29ce7492b5f243172 100644 (file)
@@ -13,14 +13,10 @@ CFLAGS   += $(CFLAGS_libxenstore)
 CFLAGS   += $(INCLUDES) -I.
 CFLAGS   += -D_GNU_SOURCE
 
-# Get gcc to generate the dependencies for us.
-CFLAGS   += -Wp,-MD,.$(@F).d
-DEPS      = .*.d
-
 LIBS      := -L. -L.. -L../lib
 LIBS      += $(LDFLAGS_libxenctrl)
 LIBS      += $(LDFLAGS_libxenstore)
-LIBS      += -lpthread -lrt 
+LIBS      += -lrt 
 
 OBJS     := fs-xenbus.o fs-ops.o
 
index f0d2758627adf11acbb9ea4c03d445cf1d577dca..721b2dc0a5d1fd0d6b2f86657c6b2a3a660bc2c6 100644 (file)
 #undef NDEBUG
+#include <unistd.h>
 #include <stdio.h>
 #include <string.h>
 #include <assert.h>
 #include <malloc.h>
-#include <pthread.h>
 #include <xenctrl.h>
 #include <aio.h>
 #include <sys/mman.h>
 #include <sys/select.h>
+#include <sys/socket.h>
 #include <xen/io/ring.h>
+#include <xc_private.h>
+#include <err.h>
+#include "sys-queue.h"
 #include "fs-backend.h"
+#include "fs-debug.h"
 
 struct xs_handle *xsh = NULL;
 static struct fs_export *fs_exports = NULL;
 static int export_id = 0;
 static int mount_id = 0;
+static int pipefds[2];
+static LIST_HEAD(mount_requests_head, fs_mount) mount_requests_head;
 
-static void dispatch_response(struct fs_mount *mount, int priv_req_id)
+static void free_mount_request(struct fs_mount *mount);
+
+static void dispatch_response(struct fs_request *request)
 {
     int i;
     struct fs_op *op;
-    struct fs_request *req = &mount->requests[priv_req_id];
 
     for(i=0;;i++)
     {
         op = fsops[i];
         /* We should dispatch a response before reaching the end of the array */
         assert(op != NULL);
-        if(op->type == req->req_shadow.type)
+        if(op->type == request->req_shadow.type)
         {
-            printf("Found op for type=%d\n", op->type);
+            FS_DEBUG("Found op for type=%d\n", op->type);
             /* There needs to be a response handler */
             assert(op->response_handler != NULL);
-            op->response_handler(mount, req);
+            op->response_handler(request->mount, request);
             break;
         }
     }
 
-    req->active = 0;
-    add_id_to_freelist(priv_req_id, mount->freelist);
+    request->active = 0;
+    add_id_to_freelist(request->id, request->mount->freelist);
 }
 
-static void handle_aio_events(struct fs_mount *mount)
+static void handle_aio_event(struct fs_request *request)
 {
-    int fd, ret, count, i, notify;
-    evtchn_port_t port;
-    /* AIO control block for the evtchn file destriptor */
-    struct aiocb evtchn_cb;
-    const struct aiocb * cb_list[mount->nr_entries];
-    int request_ids[mount->nr_entries];
-
-    /* Prepare the AIO control block for evtchn */ 
-    fd = xc_evtchn_fd(mount->evth); 
-    bzero(&evtchn_cb, sizeof(struct aiocb));
-    evtchn_cb.aio_fildes = fd;
-    evtchn_cb.aio_nbytes = sizeof(port);
-    evtchn_cb.aio_buf = &port;
-    assert(aio_read(&evtchn_cb) == 0);
-
-wait_again:   
-    /* Create list of active AIO requests */
-    count = 0;
-    for(i=0; i<mount->nr_entries; i++)
-        if(mount->requests[i].active)
-        {
-            cb_list[count] = &mount->requests[i].aiocb;
-            request_ids[count] = i;
-            count++;
-        }
-    /* Add the event channel at the end of the list. Event channel needs to be
-     * handled last as it exits this function. */
-    cb_list[count] = &evtchn_cb;
-    request_ids[count] = -1;
-    count++;
-
-    /* Block till an AIO requset finishes, or we get an event */ 
-    while(1) {
-       int ret = aio_suspend(cb_list, count, NULL);
-       if (!ret)
-           break;
-       assert(errno == EINTR);
+    int ret, notify;
+
+    FS_DEBUG("handle_aio_event: mount %s request %d\n", request->mount->frontend, request->id);
+    if (request->active < 0) {
+        request->mount->nr_entries++;
+        if (!request->mount->nr_entries)
+            free_mount_request(request->mount);
+        return;
     }
-    for(i=0; i<count; i++)
-        if(aio_error(cb_list[i]) != EINPROGRESS)
-        {
-            if(request_ids[i] >= 0)
-                dispatch_response(mount, request_ids[i]);
-            else
-                goto read_event_channel;
-        }
-    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&mount->ring, notify);
-    printf("Pushed responces and notify=%d\n", notify);
-    if(notify)
-        xc_evtchn_notify(mount->evth, mount->local_evtchn);
-    
-    goto wait_again;
 
-read_event_channel:    
-    assert(aio_return(&evtchn_cb) == sizeof(evtchn_port_t)); 
-    assert(xc_evtchn_unmask(mount->evth, mount->local_evtchn) >= 0);
-}
+    ret = aio_error(&request->aiocb);
+    if(ret != EINPROGRESS && ret != ECANCELED)
+        dispatch_response(request);
 
+    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&request->mount->ring, notify);
+    FS_DEBUG("Pushed responses and notify=%d\n", notify);
+    if(notify)
+        xc_evtchn_notify(request->mount->evth, request->mount->local_evtchn);
+}
 
 static void allocate_request_array(struct fs_mount *mount)
 {
@@ -116,6 +84,7 @@ static void allocate_request_array(struct fs_mount *mount)
     for(i=0; i< nr_entries; i++)
     {
         requests[i].active = 0; 
+        requests[i].mount = mount; 
         add_id_to_freelist(i, freelist);
     }
     mount->requests = requests;
@@ -123,86 +92,102 @@ static void allocate_request_array(struct fs_mount *mount)
 }
 
 
-static void *handle_mount(void *data)
+static void handle_mount(struct fs_mount *mount)
 {
     int more, notify;
-    struct fs_mount *mount = (struct fs_mount *)data;
-    
-    printf("Starting a thread for mount: %d\n", mount->mount_id);
-    allocate_request_array(mount);
-
-    for(;;)
-    {
-        int nr_consumed=0;
-        RING_IDX cons, rp;
-        struct fsif_request *req;
+    int nr_consumed=0;
+    RING_IDX cons, rp;
+    struct fsif_request *req;
 
-        handle_aio_events(mount);
 moretodo:
-        rp = mount->ring.sring->req_prod;
-        xen_rmb(); /* Ensure we see queued requests up to 'rp'. */
+    rp = mount->ring.sring->req_prod;
+    xen_rmb(); /* Ensure we see queued requests up to 'rp'. */
 
-        while ((cons = mount->ring.req_cons) != rp)
+    while ((cons = mount->ring.req_cons) != rp)
+    {
+        int i;
+        struct fs_op *op;
+
+        FS_DEBUG("Got a request at %d (of %d)\n", 
+                cons, RING_SIZE(&mount->ring));
+        req = RING_GET_REQUEST(&mount->ring, cons);
+        FS_DEBUG("Request type=%d\n", req->type); 
+        for(i=0;;i++)
         {
-            int i;
-            struct fs_op *op;
-
-            printf("Got a request at %d (of %d)\n", 
-                    cons, RING_SIZE(&mount->ring));
-            req = RING_GET_REQUEST(&mount->ring, cons);
-            printf("Request type=%d\n", req->type); 
-            for(i=0;;i++)
+            op = fsops[i];
+            if(op == NULL)
             {
-                op = fsops[i];
-                if(op == NULL)
-                {
-                    /* We've reached the end of the array, no appropirate
-                     * handler found. Warn, ignore and continue. */
-                    printf("WARN: Unknown request type: %d\n", req->type);
-                    mount->ring.req_cons++; 
-                    break;
-                }
-                if(op->type == req->type)
-                {
-                    /* There needs to be a dispatch handler */
-                    assert(op->dispatch_handler != NULL);
-                    op->dispatch_handler(mount, req);
-                    break;
-                }
-             }
-
-            nr_consumed++;
+                /* We've reached the end of the array, no appropriate
+                 * handler found. Warn, ignore and continue. */
+                FS_DEBUG("WARN: Unknown request type: %d\n", req->type);
+                mount->ring.req_cons++; 
+                break;
+            }
+            if(op->type == req->type)
+            {
+                /* There needs to be a dispatch handler */
+                assert(op->dispatch_handler != NULL);
+                op->dispatch_handler(mount, req);
+                break;
+            }
         }
-        printf("Backend consumed: %d requests\n", nr_consumed);
-        RING_FINAL_CHECK_FOR_REQUESTS(&mount->ring, more);
-        if(more) goto moretodo;
-
-        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&mount->ring, notify);
-        printf("Pushed responces and notify=%d\n", notify);
-        if(notify)
-            xc_evtchn_notify(mount->evth, mount->local_evtchn);
+
+        nr_consumed++;
     }
-    printf("Destroying thread for mount: %d\n", mount->mount_id);
-    xc_gnttab_munmap(mount->gnth, mount->ring.sring, 1);
+    FS_DEBUG("Backend consumed: %d requests\n", nr_consumed);
+    RING_FINAL_CHECK_FOR_REQUESTS(&mount->ring, more);
+    if(more) goto moretodo;
+
+    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&mount->ring, notify);
+    FS_DEBUG("Pushed responces and notify=%d\n", notify);
+    if(notify)
+        xc_evtchn_notify(mount->evth, mount->local_evtchn);
+}
+
+static void terminate_mount_request(struct fs_mount *mount) {
+    int count = 0, i;
+
+    FS_DEBUG("terminate_mount_request %s\n", mount->frontend);
+    xenbus_write_backend_state(mount, STATE_CLOSING);
+
+    for(i=0; i<mount->nr_entries; i++)
+        if(mount->requests[i].active) {
+            mount->requests[i].active = -1;
+            aio_cancel(mount->requests[i].aiocb.aio_fildes, &mount->requests[i].aiocb);
+            count--;
+        }
+    mount->nr_entries = count;
+
+    while (!xenbus_frontend_state_changed(mount, STATE_CLOSING));
+    xenbus_write_backend_state(mount, STATE_CLOSED);
+
+    xc_gnttab_munmap(mount->gnth, mount->ring.sring, mount->shared_ring_size);
     xc_gnttab_close(mount->gnth);
     xc_evtchn_unbind(mount->evth, mount->local_evtchn);
     xc_evtchn_close(mount->evth);
+
+    if (!count)
+        free_mount_request(mount);
+}
+
+static void free_mount_request(struct fs_mount *mount) {
+    FS_DEBUG("free_mount_request %s\n", mount->frontend);
     free(mount->frontend);
-    pthread_exit(NULL);
+    free(mount->requests);
+    free(mount->freelist);
+    LIST_REMOVE (mount, entries);
+    free(mount);
 }
 
 static void handle_connection(int frontend_dom_id, int export_id, char *frontend)
 {
     struct fs_mount *mount;
     struct fs_export *export;
-    int evt_port;
-    pthread_t handling_thread;
     struct fsif_sring *sring;
     uint32_t dom_ids[MAX_RING_SIZE];
     int i;
 
-    printf("Handling connection from dom=%d, for export=%d\n", 
+    FS_DEBUG("Handling connection from dom=%d, for export=%d\n", 
             frontend_dom_id, export_id);
     /* Try to find the export on the list */
     export = fs_exports;
@@ -214,7 +199,7 @@ static void handle_connection(int frontend_dom_id, int export_id, char *frontend
     }
     if(!export)
     {
-        printf("Could not find the export (the id is unknown).\n");
+        FS_DEBUG("Could not find the export (the id is unknown).\n");
         return;
     }
 
@@ -223,7 +208,7 @@ static void handle_connection(int frontend_dom_id, int export_id, char *frontend
     mount->export = export;
     mount->mount_id = mount_id++;
     xenbus_read_mount_request(mount, frontend);
-    printf("Frontend found at: %s (gref=%d, evtchn=%d)\n", 
+    FS_DEBUG("Frontend found at: %s (gref=%d, evtchn=%d)\n", 
             mount->frontend, mount->grefs[0], mount->remote_evtchn);
     xenbus_write_backend_node(mount);
     mount->evth = -1;
@@ -249,18 +234,24 @@ static void handle_connection(int frontend_dom_id, int export_id, char *frontend
     mount->nr_entries = mount->ring.nr_ents; 
     for (i = 0; i < MAX_FDS; i++)
         mount->fds[i] = -1;
-    xenbus_write_backend_ready(mount);
 
-    pthread_create(&handling_thread, NULL, &handle_mount, mount);
+    LIST_INSERT_HEAD(&mount_requests_head, mount, entries);
+    xenbus_watch_frontend_state(mount);
+    xenbus_write_backend_state(mount, STATE_READY);
+    
+    allocate_request_array(mount);
 }
 
 static void await_connections(void)
 {
-    int fd, ret, dom_id, export_id; 
+    int fd, max_fd, ret, dom_id, export_id; 
     fd_set fds;
     char **watch_paths;
     unsigned int len;
     char d;
+    struct fs_mount *pointer;
+
+    LIST_INIT (&mount_requests_head);
 
     assert(xsh != NULL);
     fd = xenbus_get_watch_fd(); 
@@ -268,28 +259,97 @@ static void await_connections(void)
     do {
        FD_ZERO(&fds);
        FD_SET(fd, &fds);
-        ret = select(fd+1, &fds, NULL, NULL, NULL);
-        assert(ret == 1);
-        watch_paths = xs_read_watch(xsh, &len);
-        assert(len == 2);
-        assert(strcmp(watch_paths[1], "conn-watch") == 0);
-        dom_id = -1;
-        export_id = -1;
-       d = 0;
-        printf("Path changed %s\n", watch_paths[0]);
-        sscanf(watch_paths[0], WATCH_NODE"/%d/%d/fronten%c", 
-                &dom_id, &export_id, &d);
-        if((dom_id >= 0) && (export_id >= 0) && d == 'd') {
-           char *frontend = xs_read(xsh, XBT_NULL, watch_paths[0], NULL);
-           if (frontend) {
-               handle_connection(dom_id, export_id, frontend);
-               xs_rm(xsh, XBT_NULL, watch_paths[0]);
-           }
-       }
-next_select:        
-        printf("Awaiting next connection.\n");
-        /* TODO - we need to figure out what to free */
-       free(watch_paths);
+       FD_SET(pipefds[0], &fds);
+        max_fd = fd > pipefds[0] ? fd : pipefds[0];
+        LIST_FOREACH(pointer, &mount_requests_head, entries) {
+            int tfd = xc_evtchn_fd(pointer->evth);
+            FD_SET(tfd, &fds);
+            if (tfd > max_fd) max_fd = tfd;
+        }
+        ret = select(max_fd+1, &fds, NULL, NULL, NULL);
+        if (ret < 0) {
+            if (errno == EINTR) continue;
+            /* try to recover */
+            else if (errno == EBADF) {
+                struct timeval timeout;
+                memset(&timeout, 0x00, sizeof(timeout));
+                FD_ZERO(&fds);
+                FD_SET(fd, &fds);
+                FD_SET(pipefds[0], &fds);
+                max_fd = fd > pipefds[0] ? fd : pipefds[0];
+                ret = select(max_fd + 1, &fds, NULL, NULL, &timeout);
+                if (ret < 0)
+                    err(1, "select: unrecoverable error occurred: %d\n", errno);
+
+                /* trying to find the bogus fd among the open event channels */
+                LIST_FOREACH(pointer, &mount_requests_head, entries) {
+                    int tfd = xc_evtchn_fd(pointer->evth);
+                    memset(&timeout, 0x00, sizeof(timeout));
+                    FD_ZERO(&fds);
+                    FD_SET(tfd, &fds);
+                    ret = select(tfd + 1, &fds, NULL, NULL, &timeout);
+                    if (ret < 0) {
+                        FS_DEBUG("fd %d is bogus, closing the related connection\n", tfd);
+                        pointer->evth = fd;
+                        terminate_mount_request(pointer);
+                        continue;
+                    }
+                }
+                continue;
+            } else
+                err(1, "select: unrecoverable error occurred: %d\n", errno);
+        }
+        if (FD_ISSET(fd, &fds)) {
+            watch_paths = xs_read_watch(xsh, &len);
+            if (!strcmp(watch_paths[XS_WATCH_TOKEN], "conn-watch")) {
+                dom_id = -1;
+                export_id = -1;
+                d = 0;
+                FS_DEBUG("Path changed %s\n", watch_paths[0]);
+                sscanf(watch_paths[XS_WATCH_PATH], WATCH_NODE"/%d/%d/fronten%c", 
+                        &dom_id, &export_id, &d);
+                if((dom_id >= 0) && (export_id >= 0) && d == 'd') {
+                    char *frontend = xs_read(xsh, XBT_NULL, watch_paths[XS_WATCH_PATH], NULL);
+                    if (frontend) {
+                        handle_connection(dom_id, export_id, frontend);
+                        xs_rm(xsh, XBT_NULL, watch_paths[XS_WATCH_PATH]);
+                    }
+                }
+            } else if (!strcmp(watch_paths[XS_WATCH_TOKEN], "frontend-state")) {
+                LIST_FOREACH(pointer, &mount_requests_head, entries) {
+                    if (!strncmp(pointer->frontend, watch_paths[XS_WATCH_PATH], strlen(pointer->frontend))) {
+                        char *state = xenbus_read_frontend_state(pointer);
+                        if (!state || strcmp(state, STATE_READY)) {
+                            xenbus_unwatch_frontend_state(pointer);
+                            terminate_mount_request(pointer);
+                        }
+                        free(state);
+                        break;
+                    }
+                }
+            } else {
+                FS_DEBUG("xenstore watch event unrecognized\n");
+            }
+            FS_DEBUG("Awaiting next connection.\n");
+            /* TODO - we need to figure out what to free */
+            free(watch_paths);
+        }
+        if (FD_ISSET(pipefds[0], &fds)) {
+            struct fs_request *request;
+            if (read_exact(pipefds[0], &request, sizeof(struct fs_request *)) < 0)
+                err(1, "read request failed\n");
+            handle_aio_event(request); 
+        }
+        LIST_FOREACH(pointer, &mount_requests_head, entries) {
+            if (FD_ISSET(xc_evtchn_fd(pointer->evth), &fds)) {
+                evtchn_port_t port;
+                port = xc_evtchn_pending(pointer->evth);
+                if (port != -1) {
+                    handle_mount(pointer);
+                    xc_evtchn_unmask(pointer->evth, port);
+                }
+            }
+        }
     } while (1);
 }
 
@@ -312,10 +372,29 @@ static struct fs_export* create_export(char *name, char *export_path)
     return curr_export;
 }
 
+static void aio_signal_handler(int signo, siginfo_t *info, void *context)
+{
+    struct fs_request *request = (struct fs_request*) info->si_value.sival_ptr;
+    int saved_errno = errno;
+    if (write_exact(pipefds[1], &request, sizeof(struct fs_request *)) < 0)
+        err(1, "write request filed\n");
+    errno = saved_errno;
+}
 
 int main(void)
 {
     struct fs_export *export;
+    struct sigaction act;
+    sigset_t enable;
+
+    sigemptyset(&enable);
+    sigaddset(&enable, SIGUSR2);
+    pthread_sigmask(SIG_UNBLOCK, &enable, NULL);
+
+    sigfillset(&act.sa_mask);
+    act.sa_flags = SA_SIGINFO; /* do not restart syscalls to interrupt select(); use sa_sigaction */
+    act.sa_sigaction = aio_signal_handler;
+    sigaction(SIGUSR2, &act, NULL);
 
     /* Open the connection to XenStore first */
     xsh = xs_domain_open();
@@ -328,6 +407,9 @@ int main(void)
     export = create_export("default", "/exports");
     xenbus_register_export(export);
 
+    if (socketpair(PF_UNIX,SOCK_STREAM, 0, pipefds) == -1)
+        err(1, "failed to create pipe\n");
+
     await_connections();
     /* Close the connection to XenStore when we are finished with everything */
     xs_daemon_close(xsh);
index b2a6be6f4a9746e7905a8309af540d8d88a1ed17..504569027e4c1cf3ecfe6362a75404c4d1d0b5ad 100644 (file)
@@ -7,6 +7,7 @@
 #include <xen/event_channel.h>
 #include <xen/io/ring.h>
 #include <xen/io/fsif.h>
+#include "sys-queue.h"
 
 #define ROOT_NODE           "backend/vfs"
 #define EXPORTS_SUBNODE     "exports"
@@ -25,6 +26,8 @@ struct fs_export
 
 struct fs_request
 {
+    struct fs_mount *mount;
+    int id;
     int active;
     void *page;                         /* Pointer to mapped grant */
     int count;
@@ -50,6 +53,7 @@ struct fs_mount
     struct fs_request *requests;
     unsigned short *freelist;
     int fds[MAX_FDS];
+    LIST_ENTRY(fs_mount) entries;
 };
 
 
@@ -61,7 +65,11 @@ int xenbus_register_export(struct fs_export *export);
 int xenbus_get_watch_fd(void);
 void xenbus_read_mount_request(struct fs_mount *mount, char *frontend);
 void xenbus_write_backend_node(struct fs_mount *mount);
-void xenbus_write_backend_ready(struct fs_mount *mount);
+void xenbus_write_backend_state(struct fs_mount *mount, const char *state);
+int xenbus_frontend_state_changed(struct fs_mount *mount, const char *oldstate);
+void xenbus_watch_frontend_state(struct fs_mount *mount);
+void xenbus_unwatch_frontend_state(struct fs_mount *mount);
+char* xenbus_read_frontend_state(struct fs_mount *mount);
 
 /* File operations, implemented in fs-ops.c */
 struct fs_op
diff --git a/tools/fs-back/fs-debug.h b/tools/fs-back/fs-debug.h
new file mode 100644 (file)
index 0000000..9fe53e5
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef __FS_DEBUG__
+#define __FS_DEBUG__
+
+// #define DEBUG 1
+
+#ifdef DEBUG
+#define FS_DEBUG(fmt, ...) do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define FS_DEBUG(fmt, ...) do { } while (0)
+#endif
+
+#endif /*__FS_DEBUG__*/
index 424b0547794b57c00dbe50135085a9889b944b03..6abd3543ba2ee248eac861992bc25dcd2960965e 100644 (file)
@@ -14,6 +14,7 @@
 #include <sys/mount.h>
 #include <unistd.h>
 #include "fs-backend.h"
+#include "fs-debug.h"
 
 /* For debugging only */
 #include <sys/time.h>
 
 #define BUFFER_SIZE 1024
 
-
 static unsigned short get_request(struct fs_mount *mount, struct fsif_request *req)
 {
     unsigned short id = get_id_from_freelist(mount->freelist); 
 
-    printf("Private Request id: %d\n", id);
+    FS_DEBUG("Private Request id: %d\n", id);
     memcpy(&mount->requests[id].req_shadow, req, sizeof(struct fsif_request));
     mount->requests[id].active = 1;
 
@@ -49,12 +49,11 @@ static void dispatch_file_open(struct fs_mount *mount, struct fsif_request *req)
 {
     char *file_name, full_path[BUFFER_SIZE];
     int fd;
-    struct timeval tv1, tv2;
     RING_IDX rsp_idx;
     fsif_response_t *rsp;
     uint16_t req_id;
 
-    printf("Dispatching file open operation (gref=%d).\n", req->u.fopen.gref);
+    FS_DEBUG("Dispatching file open operation (gref=%d).\n", req->u.fopen.gref);
     /* Read the request, and open file */
     file_name = xc_gnttab_map_grant_ref(mount->gnth,
                                         mount->dom_id,
@@ -62,13 +61,13 @@ static void dispatch_file_open(struct fs_mount *mount, struct fsif_request *req)
                                         PROT_READ);
    
     req_id = req->id;
-    printf("File open issued for %s\n", file_name); 
+    FS_DEBUG("File open issued for %s\n", file_name); 
     assert(BUFFER_SIZE > 
            strlen(file_name) + strlen(mount->export->export_path) + 1); 
     snprintf(full_path, sizeof(full_path), "%s/%s",
            mount->export->export_path, file_name);
     assert(xc_gnttab_munmap(mount->gnth, file_name, 1) == 0);
-    printf("Issuing open for %s\n", full_path);
+    FS_DEBUG("Issuing open for %s\n", full_path);
     fd = get_fd(mount);
     if (fd >= 0) {
         int real_fd = open(full_path, O_RDWR);
@@ -77,7 +76,7 @@ static void dispatch_file_open(struct fs_mount *mount, struct fsif_request *req)
         else
         {
             mount->fds[fd] = real_fd;
-            printf("Got FD: %d for real %d\n", fd, real_fd);
+            FS_DEBUG("Got FD: %d for real %d\n", fd, real_fd);
         }
     }
     /* We can advance the request consumer index, from here on, the request
@@ -87,7 +86,7 @@ static void dispatch_file_open(struct fs_mount *mount, struct fsif_request *req)
 
     /* Get a response from the ring */
     rsp_idx = mount->ring.rsp_prod_pvt++;
-    printf("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
+    FS_DEBUG("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
     rsp = RING_GET_RESPONSE(&mount->ring, rsp_idx);
     rsp->id = req_id; 
     rsp->ret_val = (uint64_t)fd;
@@ -100,7 +99,7 @@ static void dispatch_file_close(struct fs_mount *mount, struct fsif_request *req
     fsif_response_t *rsp;
     uint16_t req_id;
 
-    printf("Dispatching file close operation (fd=%d).\n", req->u.fclose.fd);
+    FS_DEBUG("Dispatching file close operation (fd=%d).\n", req->u.fclose.fd);
    
     req_id = req->id;
     if (req->u.fclose.fd < MAX_FDS) {
@@ -109,7 +108,7 @@ static void dispatch_file_close(struct fs_mount *mount, struct fsif_request *req
         mount->fds[req->u.fclose.fd] = -1;
     } else
         ret = -1;
-    printf("Got ret: %d\n", ret);
+    FS_DEBUG("Got ret: %d\n", ret);
     /* We can advance the request consumer index, from here on, the request
      * should not be used (it may be overrinden by a response) */
     mount->ring.req_cons++;
@@ -117,7 +116,7 @@ static void dispatch_file_close(struct fs_mount *mount, struct fsif_request *req
 
     /* Get a response from the ring */
     rsp_idx = mount->ring.rsp_prod_pvt++;
-    printf("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
+    FS_DEBUG("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
     rsp = RING_GET_RESPONSE(&mount->ring, rsp_idx);
     rsp->id = req_id; 
     rsp->ret_val = (uint64_t)ret;
@@ -127,7 +126,7 @@ static void dispatch_file_close(struct fs_mount *mount, struct fsif_request *req
 static void dispatch_file_read(struct fs_mount *mount, struct fsif_request *req)
 {
     void *buf;
-    int fd, i, count;
+    int fd, count;
     uint16_t req_id;
     unsigned short priv_id;
     struct fs_request *priv_req;
@@ -143,7 +142,7 @@ static void dispatch_file_read(struct fs_mount *mount, struct fsif_request *req)
                                           PROT_WRITE);
    
     req_id = req->id;
-    printf("File read issued for FD=%d (len=%"PRIu64", offest=%"PRIu64")\n", 
+    FS_DEBUG("File read issued for FD=%d (len=%"PRIu64", offest=%"PRIu64")\n", 
             req->u.fread.fd, req->u.fread.len, req->u.fread.offset); 
 
     if (req->u.fread.fd < MAX_FDS)
@@ -152,10 +151,11 @@ static void dispatch_file_read(struct fs_mount *mount, struct fsif_request *req)
         fd = -1;
 
     priv_id = get_request(mount, req);
-    printf("Private id is: %d\n", priv_id);
+    FS_DEBUG("Private id is: %d\n", priv_id);
     priv_req = &mount->requests[priv_id];
     priv_req->page = buf;
     priv_req->count = count;
+    priv_req->id = priv_id;
 
     /* Dispatch AIO read request */
     bzero(&priv_req->aiocb, sizeof(struct aiocb));
@@ -163,9 +163,11 @@ static void dispatch_file_read(struct fs_mount *mount, struct fsif_request *req)
     priv_req->aiocb.aio_nbytes = req->u.fread.len;
     priv_req->aiocb.aio_offset = req->u.fread.offset;
     priv_req->aiocb.aio_buf = buf;
+    priv_req->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
+    priv_req->aiocb.aio_sigevent.sigev_signo = SIGUSR2;
+    priv_req->aiocb.aio_sigevent.sigev_value.sival_ptr = priv_req;
     assert(aio_read(&priv_req->aiocb) >= 0);
 
-out: 
     /* We can advance the request consumer index, from here on, the request
      * should not be used (it may be overrinden by a response) */
     mount->ring.req_cons++;
@@ -185,7 +187,7 @@ static void end_file_read(struct fs_mount *mount, struct fs_request *priv_req)
     /* Get a response from the ring */
     rsp_idx = mount->ring.rsp_prod_pvt++;
     req_id = priv_req->req_shadow.id; 
-    printf("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
+    FS_DEBUG("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
     rsp = RING_GET_RESPONSE(&mount->ring, rsp_idx);
     rsp->id = req_id; 
     rsp->ret_val = (uint64_t)aio_return(&priv_req->aiocb);
@@ -194,7 +196,7 @@ static void end_file_read(struct fs_mount *mount, struct fs_request *priv_req)
 static void dispatch_file_write(struct fs_mount *mount, struct fsif_request *req)
 {
     void *buf;
-    int fd, count, i;
+    int fd, count;
     uint16_t req_id;
     unsigned short priv_id;
     struct fs_request *priv_req;
@@ -210,7 +212,7 @@ static void dispatch_file_write(struct fs_mount *mount, struct fsif_request *req
                                           PROT_READ);
    
     req_id = req->id;
-    printf("File write issued for FD=%d (len=%"PRIu64", offest=%"PRIu64")\n", 
+    FS_DEBUG("File write issued for FD=%d (len=%"PRIu64", offest=%"PRIu64")\n", 
             req->u.fwrite.fd, req->u.fwrite.len, req->u.fwrite.offset); 
    
     if (req->u.fwrite.fd < MAX_FDS)
@@ -219,10 +221,11 @@ static void dispatch_file_write(struct fs_mount *mount, struct fsif_request *req
         fd = -1;
 
     priv_id = get_request(mount, req);
-    printf("Private id is: %d\n", priv_id);
+    FS_DEBUG("Private id is: %d\n", priv_id);
     priv_req = &mount->requests[priv_id];
     priv_req->page = buf;
     priv_req->count = count;
+    priv_req->id = priv_id;
 
     /* Dispatch AIO write request */
     bzero(&priv_req->aiocb, sizeof(struct aiocb));
@@ -230,6 +233,9 @@ static void dispatch_file_write(struct fs_mount *mount, struct fsif_request *req
     priv_req->aiocb.aio_nbytes = req->u.fwrite.len;
     priv_req->aiocb.aio_offset = req->u.fwrite.offset;
     priv_req->aiocb.aio_buf = buf;
+    priv_req->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
+    priv_req->aiocb.aio_sigevent.sigev_signo = SIGUSR2;
+    priv_req->aiocb.aio_sigevent.sigev_value.sival_ptr = priv_req;
     assert(aio_write(&priv_req->aiocb) >= 0);
 
      
@@ -252,7 +258,7 @@ static void end_file_write(struct fs_mount *mount, struct fs_request *priv_req)
     /* Get a response from the ring */
     rsp_idx = mount->ring.rsp_prod_pvt++;
     req_id = priv_req->req_shadow.id; 
-    printf("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
+    FS_DEBUG("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
     rsp = RING_GET_RESPONSE(&mount->ring, rsp_idx);
     rsp->id = req_id; 
     rsp->ret_val = (uint64_t)aio_return(&priv_req->aiocb);
@@ -260,7 +266,6 @@ static void end_file_write(struct fs_mount *mount, struct fs_request *priv_req)
 
 static void dispatch_stat(struct fs_mount *mount, struct fsif_request *req)
 {
-    struct fsif_stat_response *buf;
     struct stat stat;
     int fd, ret;
     uint16_t req_id;
@@ -273,7 +278,7 @@ static void dispatch_stat(struct fs_mount *mount, struct fsif_request *req)
     else
         fd = -1;
 
-    printf("File stat issued for FD=%d\n", req->u.fstat.fd); 
+    FS_DEBUG("File stat issued for FD=%d\n", req->u.fstat.fd); 
    
     /* We can advance the request consumer index, from here on, the request
      * should not be used (it may be overrinden by a response) */
@@ -281,12 +286,12 @@ static void dispatch_stat(struct fs_mount *mount, struct fsif_request *req)
    
     /* Stat, and create the response */ 
     ret = fstat(fd, &stat);
-    printf("Mode=%o, uid=%d, a_time=%ld\n",
+    FS_DEBUG("Mode=%o, uid=%d, a_time=%ld\n",
             stat.st_mode, stat.st_uid, (long)stat.st_atime);
     
     /* Get a response from the ring */
     rsp_idx = mount->ring.rsp_prod_pvt++;
-    printf("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
+    FS_DEBUG("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
     rsp = RING_GET_RESPONSE(&mount->ring, rsp_idx);
     rsp->id = req_id; 
     rsp->fstat.stat_ret = (uint32_t)ret;
@@ -320,7 +325,7 @@ static void dispatch_truncate(struct fs_mount *mount, struct fsif_request *req)
 
     req_id = req->id;
     length = req->u.ftruncate.length;
-    printf("File truncate issued for FD=%d, length=%"PRId64"\n", req->u.ftruncate.fd, length); 
+    FS_DEBUG("File truncate issued for FD=%d, length=%"PRId64"\n", req->u.ftruncate.fd, length); 
    
     if (req->u.ftruncate.fd < MAX_FDS)
         fd = mount->fds[req->u.ftruncate.fd];
@@ -336,7 +341,7 @@ static void dispatch_truncate(struct fs_mount *mount, struct fsif_request *req)
 
     /* Get a response from the ring */
     rsp_idx = mount->ring.rsp_prod_pvt++;
-    printf("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
+    FS_DEBUG("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
     rsp = RING_GET_RESPONSE(&mount->ring, rsp_idx);
     rsp->id = req_id; 
     rsp->ret_val = (uint64_t)ret;
@@ -350,7 +355,7 @@ static void dispatch_remove(struct fs_mount *mount, struct fsif_request *req)
     fsif_response_t *rsp;
     uint16_t req_id;
 
-    printf("Dispatching remove operation (gref=%d).\n", req->u.fremove.gref);
+    FS_DEBUG("Dispatching remove operation (gref=%d).\n", req->u.fremove.gref);
     /* Read the request, and open file */
     file_name = xc_gnttab_map_grant_ref(mount->gnth,
                                         mount->dom_id,
@@ -358,15 +363,15 @@ static void dispatch_remove(struct fs_mount *mount, struct fsif_request *req)
                                         PROT_READ);
    
     req_id = req->id;
-    printf("File remove issued for %s\n", file_name); 
+    FS_DEBUG("File remove issued for %s\n", file_name); 
     assert(BUFFER_SIZE > 
            strlen(file_name) + strlen(mount->export->export_path) + 1); 
     snprintf(full_path, sizeof(full_path), "%s/%s",
            mount->export->export_path, file_name);
     assert(xc_gnttab_munmap(mount->gnth, file_name, 1) == 0);
-    printf("Issuing remove for %s\n", full_path);
+    FS_DEBUG("Issuing remove for %s\n", full_path);
     ret = remove(full_path);
-    printf("Got ret: %d\n", ret);
+    FS_DEBUG("Got ret: %d\n", ret);
     /* We can advance the request consumer index, from here on, the request
      * should not be used (it may be overrinden by a response) */
     mount->ring.req_cons++;
@@ -374,7 +379,7 @@ static void dispatch_remove(struct fs_mount *mount, struct fsif_request *req)
 
     /* Get a response from the ring */
     rsp_idx = mount->ring.rsp_prod_pvt++;
-    printf("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
+    FS_DEBUG("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
     rsp = RING_GET_RESPONSE(&mount->ring, rsp_idx);
     rsp->id = req_id; 
     rsp->ret_val = (uint64_t)ret;
@@ -390,7 +395,7 @@ static void dispatch_rename(struct fs_mount *mount, struct fsif_request *req)
     fsif_response_t *rsp;
     uint16_t req_id;
 
-    printf("Dispatching rename operation (gref=%d).\n", req->u.fremove.gref);
+    FS_DEBUG("Dispatching rename operation (gref=%d).\n", req->u.fremove.gref);
     /* Read the request, and open file */
     buf = xc_gnttab_map_grant_ref(mount->gnth,
                                   mount->dom_id,
@@ -400,7 +405,7 @@ static void dispatch_rename(struct fs_mount *mount, struct fsif_request *req)
     req_id = req->id;
     old_file_name = buf + req->u.frename.old_name_offset;
     new_file_name = buf + req->u.frename.new_name_offset;
-    printf("File rename issued for %s -> %s (buf=%s)\n", 
+    FS_DEBUG("File rename issued for %s -> %s (buf=%s)\n", 
             old_file_name, new_file_name, buf); 
     assert(BUFFER_SIZE > 
            strlen(old_file_name) + strlen(mount->export->export_path) + 1); 
@@ -411,9 +416,9 @@ static void dispatch_rename(struct fs_mount *mount, struct fsif_request *req)
     snprintf(new_full_path, sizeof(new_full_path), "%s/%s",
            mount->export->export_path, new_file_name);
     assert(xc_gnttab_munmap(mount->gnth, buf, 1) == 0);
-    printf("Issuing rename for %s -> %s\n", old_full_path, new_full_path);
+    FS_DEBUG("Issuing rename for %s -> %s\n", old_full_path, new_full_path);
     ret = rename(old_full_path, new_full_path);
-    printf("Got ret: %d\n", ret);
+    FS_DEBUG("Got ret: %d\n", ret);
     /* We can advance the request consumer index, from here on, the request
      * should not be used (it may be overrinden by a response) */
     mount->ring.req_cons++;
@@ -421,7 +426,7 @@ static void dispatch_rename(struct fs_mount *mount, struct fsif_request *req)
 
     /* Get a response from the ring */
     rsp_idx = mount->ring.rsp_prod_pvt++;
-    printf("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
+    FS_DEBUG("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
     rsp = RING_GET_RESPONSE(&mount->ring, rsp_idx);
     rsp->id = req_id; 
     rsp->ret_val = (uint64_t)ret;
@@ -438,7 +443,7 @@ static void dispatch_create(struct fs_mount *mount, struct fsif_request *req)
     fsif_response_t *rsp;
     uint16_t req_id;
 
-    printf("Dispatching file create operation (gref=%d).\n", req->u.fcreate.gref);
+    FS_DEBUG("Dispatching file create operation (gref=%d).\n", req->u.fcreate.gref);
     /* Read the request, and create file/directory */
     mode = req->u.fcreate.mode;
     directory = req->u.fcreate.directory;
@@ -448,7 +453,7 @@ static void dispatch_create(struct fs_mount *mount, struct fsif_request *req)
                                         PROT_READ);
    
     req_id = req->id;
-    printf("File create issued for %s\n", file_name); 
+    FS_DEBUG("File create issued for %s\n", file_name); 
     assert(BUFFER_SIZE > 
            strlen(file_name) + strlen(mount->export->export_path) + 1); 
     snprintf(full_path, sizeof(full_path), "%s/%s",
@@ -460,12 +465,12 @@ static void dispatch_create(struct fs_mount *mount, struct fsif_request *req)
 
     if(directory)
     {
-        printf("Issuing create for directory: %s\n", full_path);
+        FS_DEBUG("Issuing create for directory: %s\n", full_path);
         ret = mkdir(full_path, mode);
     }
     else
     {
-        printf("Issuing create for file: %s\n", full_path);
+        FS_DEBUG("Issuing create for file: %s\n", full_path);
         ret = get_fd(mount);
         if (ret >= 0) {
             int real_fd = creat(full_path, mode); 
@@ -474,15 +479,15 @@ static void dispatch_create(struct fs_mount *mount, struct fsif_request *req)
             else
             {
                 mount->fds[ret] = real_fd;
-                printf("Got FD: %d for real %d\n", ret, real_fd);
+                FS_DEBUG("Got FD: %d for real %d\n", ret, real_fd);
             }
         }
     }
-    printf("Got ret %d (errno=%d)\n", ret, errno);
+    FS_DEBUG("Got ret %d (errno=%d)\n", ret, errno);
 
     /* Get a response from the ring */
     rsp_idx = mount->ring.rsp_prod_pvt++;
-    printf("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
+    FS_DEBUG("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
     rsp = RING_GET_RESPONSE(&mount->ring, rsp_idx);
     rsp->id = req_id; 
     rsp->ret_val = (uint64_t)ret;
@@ -499,7 +504,7 @@ static void dispatch_list(struct fs_mount *mount, struct fsif_request *req)
     DIR *dir;
     struct dirent *dirent = NULL;
 
-    printf("Dispatching list operation (gref=%d).\n", req->u.flist.gref);
+    FS_DEBUG("Dispatching list operation (gref=%d).\n", req->u.flist.gref);
     /* Read the request, and list directory */
     offset = req->u.flist.offset;
     buf = file_name = xc_gnttab_map_grant_ref(mount->gnth,
@@ -508,7 +513,7 @@ static void dispatch_list(struct fs_mount *mount, struct fsif_request *req)
                                         PROT_READ | PROT_WRITE);
    
     req_id = req->id;
-    printf("Dir list issued for %s\n", file_name); 
+    FS_DEBUG("Dir list issued for %s\n", file_name); 
     assert(BUFFER_SIZE > 
            strlen(file_name) + strlen(mount->export->export_path) + 1); 
     snprintf(full_path, sizeof(full_path), "%s/%s",
@@ -552,7 +557,7 @@ error_out:
     
     /* Get a response from the ring */
     rsp_idx = mount->ring.rsp_prod_pvt++;
-    printf("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
+    FS_DEBUG("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
     rsp = RING_GET_RESPONSE(&mount->ring, rsp_idx);
     rsp->id = req_id; 
     rsp->ret_val = ret_val;
@@ -566,7 +571,7 @@ static void dispatch_chmod(struct fs_mount *mount, struct fsif_request *req)
     uint16_t req_id;
     int32_t mode;
 
-    printf("Dispatching file chmod operation (fd=%d, mode=%o).\n", 
+    FS_DEBUG("Dispatching file chmod operation (fd=%d, mode=%o).\n", 
             req->u.fchmod.fd, req->u.fchmod.mode);
     req_id = req->id;
     if (req->u.fchmod.fd < MAX_FDS)
@@ -583,7 +588,7 @@ static void dispatch_chmod(struct fs_mount *mount, struct fsif_request *req)
 
     /* Get a response from the ring */
     rsp_idx = mount->ring.rsp_prod_pvt++;
-    printf("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
+    FS_DEBUG("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
     rsp = RING_GET_RESPONSE(&mount->ring, rsp_idx);
     rsp->id = req_id; 
     rsp->ret_val = (uint64_t)ret;
@@ -598,7 +603,7 @@ static void dispatch_fs_space(struct fs_mount *mount, struct fsif_request *req)
     struct statvfs stat;
     int64_t ret;
 
-    printf("Dispatching fs space operation (gref=%d).\n", req->u.fspace.gref);
+    FS_DEBUG("Dispatching fs space operation (gref=%d).\n", req->u.fspace.gref);
     /* Read the request, and open file */
     file_name = xc_gnttab_map_grant_ref(mount->gnth,
                                         mount->dom_id,
@@ -606,13 +611,13 @@ static void dispatch_fs_space(struct fs_mount *mount, struct fsif_request *req)
                                         PROT_READ);
    
     req_id = req->id;
-    printf("Fs space issued for %s\n", file_name); 
+    FS_DEBUG("Fs space issued for %s\n", file_name); 
     assert(BUFFER_SIZE > 
            strlen(file_name) + strlen(mount->export->export_path) + 1); 
     snprintf(full_path, sizeof(full_path), "%s/%s",
            mount->export->export_path, file_name);
     assert(xc_gnttab_munmap(mount->gnth, file_name, 1) == 0);
-    printf("Issuing fs space for %s\n", full_path);
+    FS_DEBUG("Issuing fs space for %s\n", full_path);
     ret = statvfs(full_path, &stat);
     if(ret >= 0)
         ret = stat.f_bsize * stat.f_bfree;
@@ -624,7 +629,7 @@ static void dispatch_fs_space(struct fs_mount *mount, struct fsif_request *req)
 
     /* Get a response from the ring */
     rsp_idx = mount->ring.rsp_prod_pvt++;
-    printf("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
+    FS_DEBUG("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
     rsp = RING_GET_RESPONSE(&mount->ring, rsp_idx);
     rsp->id = req_id; 
     rsp->ret_val = (uint64_t)ret;
@@ -643,15 +648,19 @@ static void dispatch_file_sync(struct fs_mount *mount, struct fsif_request *req)
     else
         fd = -1;
 
-    printf("File sync issued for FD=%d\n", req->u.fsync.fd); 
+    FS_DEBUG("File sync issued for FD=%d\n", req->u.fsync.fd); 
    
     priv_id = get_request(mount, req);
-    printf("Private id is: %d\n", priv_id);
+    FS_DEBUG("Private id is: %d\n", priv_id);
     priv_req = &mount->requests[priv_id];
+    priv_req->id = priv_id;
 
     /* Dispatch AIO read request */
     bzero(&priv_req->aiocb, sizeof(struct aiocb));
     priv_req->aiocb.aio_fildes = fd;
+    priv_req->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
+    priv_req->aiocb.aio_sigevent.sigev_signo = SIGUSR2;
+    priv_req->aiocb.aio_sigevent.sigev_value.sival_ptr = priv_req;
     assert(aio_fsync(O_SYNC, &priv_req->aiocb) >= 0);
 
      
@@ -669,7 +678,7 @@ static void end_file_sync(struct fs_mount *mount, struct fs_request *priv_req)
     /* Get a response from the ring */
     rsp_idx = mount->ring.rsp_prod_pvt++;
     req_id = priv_req->req_shadow.id; 
-    printf("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
+    FS_DEBUG("Writing response at: idx=%d, id=%d\n", rsp_idx, req_id);
     rsp = RING_GET_RESPONSE(&mount->ring, rsp_idx);
     rsp->id = req_id; 
     rsp->ret_val = (uint64_t)aio_return(&priv_req->aiocb);
index 6a86e245e0e28c00869688ba5ec90187f9af00d0..cc47dbcfd624e88b645da18d2bdf8fd9552bd0f5 100644 (file)
@@ -4,10 +4,12 @@
 #include <stdarg.h>
 #include <string.h>
 #include <assert.h>
+#include <sys/select.h>
 #include <xenctrl.h>
 #include <xs.h>
 #include <xen/io/fsif.h>
 #include "fs-backend.h"
+#include "fs-debug.h"
 
 
 static bool xenbus_printf(struct xs_handle *xsh,
@@ -25,7 +27,7 @@ static bool xenbus_printf(struct xs_handle *xsh,
     snprintf(fullpath, sizeof(fullpath), "%s/%s", node, path);
     vsnprintf(val, sizeof(val), fmt, args);
     va_end(args);
-    printf("xenbus_printf (%s) <= %s.\n", fullpath, val);    
+    FS_DEBUG("xenbus_printf (%s) <= %s.\n", fullpath, val);    
 
     return xs_write(xsh, xbt, fullpath, val, strlen(val));
 }
@@ -57,19 +59,19 @@ int xenbus_register_export(struct fs_export *export)
     assert(xsh != NULL);
     if(xsh == NULL)
     {
-        printf("Could not open connection to xenbus deamon.\n");
+        FS_DEBUG("Could not open connection to xenbus deamon.\n");
         goto error_exit;
     }
-    printf("Connection to the xenbus deamon opened successfully.\n");
+    FS_DEBUG("Connection to the xenbus deamon opened successfully.\n");
 
     /* Start transaction */
     xst = xs_transaction_start(xsh);
     if(xst == 0)
     {
-        printf("Could not start a transaction.\n");
+        FS_DEBUG("Could not start a transaction.\n");
         goto error_exit;
     }
-    printf("XS transaction is %d\n", xst); 
+    FS_DEBUG("XS transaction is %d\n", xst); 
  
     /* Create node string */
     snprintf(node, sizeof(node), "%s/%d", EXPORTS_NODE, export->export_id); 
@@ -78,7 +80,7 @@ int xenbus_register_export(struct fs_export *export)
 
     if(!xenbus_printf(xsh, xst, node, "name", "%s", export->name))
     {
-        printf("Could not write the export node.\n");
+        FS_DEBUG("Could not write the export node.\n");
         goto error_exit;
     }
 
@@ -87,7 +89,7 @@ int xenbus_register_export(struct fs_export *export)
     perms.perms = XS_PERM_READ;
     if(!xs_set_permissions(xsh, xst, EXPORTS_NODE, &perms, 1))
     {
-        printf("Could not set permissions on the export node.\n");
+        FS_DEBUG("Could not set permissions on the export node.\n");
         goto error_exit;
     }
 
@@ -166,7 +168,7 @@ void xenbus_write_backend_node(struct fs_mount *mount)
 
     assert(xsh != NULL);
     self_id = get_self_id();
-    printf("Our own dom_id=%d\n", self_id);
+    FS_DEBUG("Our own dom_id=%d\n", self_id);
     snprintf(node, sizeof(node), "%s/backend", mount->frontend);
     snprintf(backend_node, sizeof(backend_node), "/local/domain/%d/"ROOT_NODE"/%d",
                                 self_id, mount->mount_id);
@@ -176,7 +178,7 @@ void xenbus_write_backend_node(struct fs_mount *mount)
     xs_write(xsh, XBT_NULL, node, STATE_INITIALISED, strlen(STATE_INITIALISED));
 }
 
-void xenbus_write_backend_ready(struct fs_mount *mount)
+void xenbus_write_backend_state(struct fs_mount *mount, const char *state)
 {
     char node[1024];
     int self_id;
@@ -184,6 +186,59 @@ void xenbus_write_backend_ready(struct fs_mount *mount)
     assert(xsh != NULL);
     self_id = get_self_id();
     snprintf(node, sizeof(node), ROOT_NODE"/%d/state", mount->mount_id);
-    xs_write(xsh, XBT_NULL, node, STATE_READY, strlen(STATE_READY));
+    xs_write(xsh, XBT_NULL, node, state, strlen(state));
+}
+
+void xenbus_watch_frontend_state(struct fs_mount *mount)
+{
+    int res;
+    char statepath[1024];
+
+    assert(xsh != NULL);
+    snprintf(statepath, sizeof(statepath), "%s/state", mount->frontend);
+    res = xs_watch(xsh, statepath, "frontend-state");
+    assert(res);
+}
+
+void xenbus_unwatch_frontend_state(struct fs_mount *mount)
+{
+    int res;
+    char statepath[1024];
+
+    assert(xsh != NULL);
+    snprintf(statepath, sizeof(statepath), "%s/state", mount->frontend);
+    res = xs_unwatch(xsh, statepath, "frontend-state");
+    assert(res);
+}
+
+int xenbus_frontend_state_changed(struct fs_mount *mount, const char *oldstate)
+{
+    unsigned int len;
+    char statepath[1024];
+    char *state = NULL;
+
+    assert(xsh != NULL);
+    snprintf(statepath, sizeof(statepath), "%s/state", mount->frontend);
+    state = xs_read(xsh, XBT_NULL, statepath, &len);
+    if (state && len > 0) {
+        if (strcmp(state, oldstate)) {
+            free(state);
+            return 1;
+        } else {
+            free(state);
+            return 0;
+        }
+    } else
+        return 1;
+}
+
+char* xenbus_read_frontend_state(struct fs_mount *mount)
+{
+    unsigned int len;
+    char statepath[1024];
+
+    assert(xsh != NULL);
+    snprintf(statepath, sizeof(statepath), "%s/state", mount->frontend);
+    return xs_read(xsh, XBT_NULL, statepath, &len);
 }
 
diff --git a/tools/fs-back/sys-queue.h b/tools/fs-back/sys-queue.h
new file mode 100644 (file)
index 0000000..0b75508
--- /dev/null
@@ -0,0 +1,338 @@
+/*      $NetBSD: queue.h,v 1.45.14.1 2007/07/18 20:13:24 liamjfoy Exp $ */
+
+/*
+ * Qemu version: Copy from netbsd, removed debug code, removed some of
+ * the implementations.  Left in lists, tail queues and circular queues.
+ */
+
+/*
+ * Copyright (c) 1991, 1993
+ *      The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *      @(#)queue.h     8.5 (Berkeley) 8/20/94
+ */
+
+#ifndef _SYS_QUEUE_H_
+#define _SYS_QUEUE_H_
+
+/*
+ * This file defines three types of data structures:
+ * lists, tail queues, and circular queues.
+ *
+ * A list is headed by a single forward pointer (or an array of forward
+ * pointers for a hash table header). The elements are doubly linked
+ * so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before
+ * or after an existing element or at the head of the list. A list
+ * may only be traversed in the forward direction.
+ *
+ * A tail queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or
+ * after an existing element, at the head of the list, or at the end of
+ * the list. A tail queue may be traversed in either direction.
+ *
+ * A circle queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or after
+ * an existing element, at the head of the list, or at the end of the list.
+ * A circle queue may be traversed in either direction, but has a more
+ * complex end of list detection.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ */
+
+/*
+ * List definitions.
+ */
+#define LIST_HEAD(name, type)                                           \
+struct name {                                                           \
+        struct type *lh_first;  /* first element */                     \
+}
+
+#define LIST_HEAD_INITIALIZER(head)                                     \
+        { NULL }
+
+#define LIST_ENTRY(type)                                                \
+struct {                                                                \
+        struct type *le_next;   /* next element */                      \
+        struct type **le_prev;  /* address of previous next element */  \
+}
+
+/*
+ * List functions.
+ */
+#define LIST_INIT(head) do {                                            \
+        (head)->lh_first = NULL;                                        \
+} while (/*CONSTCOND*/0)
+
+#define LIST_INSERT_AFTER(listelm, elm, field) do {                     \
+        if (((elm)->field.le_next = (listelm)->field.le_next) != NULL)  \
+                (listelm)->field.le_next->field.le_prev =               \
+                    &(elm)->field.le_next;                              \
+        (listelm)->field.le_next = (elm);                               \
+        (elm)->field.le_prev = &(listelm)->field.le_next;               \
+} while (/*CONSTCOND*/0)
+
+#define LIST_INSERT_BEFORE(listelm, elm, field) do {                    \
+        (elm)->field.le_prev = (listelm)->field.le_prev;                \
+        (elm)->field.le_next = (listelm);                               \
+        *(listelm)->field.le_prev = (elm);                              \
+        (listelm)->field.le_prev = &(elm)->field.le_next;               \
+} while (/*CONSTCOND*/0)
+
+#define LIST_INSERT_HEAD(head, elm, field) do {                         \
+        if (((elm)->field.le_next = (head)->lh_first) != NULL)          \
+                (head)->lh_first->field.le_prev = &(elm)->field.le_next;\
+        (head)->lh_first = (elm);                                       \
+        (elm)->field.le_prev = &(head)->lh_first;                       \
+} while (/*CONSTCOND*/0)
+
+#define LIST_REMOVE(elm, field) do {                                    \
+        if ((elm)->field.le_next != NULL)                               \
+                (elm)->field.le_next->field.le_prev =                   \
+                    (elm)->field.le_prev;                               \
+        *(elm)->field.le_prev = (elm)->field.le_next;                   \
+} while (/*CONSTCOND*/0)
+
+#define LIST_FOREACH(var, head, field)                                  \
+        for ((var) = ((head)->lh_first);                                \
+                (var);                                                  \
+                (var) = ((var)->field.le_next))
+
+/*
+ * List access methods.
+ */
+#define LIST_EMPTY(head)                ((head)->lh_first == NULL)
+#define LIST_FIRST(head)                ((head)->lh_first)
+#define LIST_NEXT(elm, field)           ((elm)->field.le_next)
+
+
+/*
+ * Tail queue definitions.
+ */
+#define _TAILQ_HEAD(name, type, qual)                                   \
+struct name {                                                           \
+        qual type *tqh_first;           /* first element */             \
+        qual type *qual *tqh_last;      /* addr of last next element */ \
+}
+#define TAILQ_HEAD(name, type)  _TAILQ_HEAD(name, struct type,)
+
+#define TAILQ_HEAD_INITIALIZER(head)                                    \
+        { NULL, &(head).tqh_first }
+
+#define _TAILQ_ENTRY(type, qual)                                        \
+struct {                                                                \
+        qual type *tqe_next;            /* next element */              \
+        qual type *qual *tqe_prev;      /* address of previous next element */\
+}
+#define TAILQ_ENTRY(type)       _TAILQ_ENTRY(struct type,)
+
+/*
+ * Tail queue functions.
+ */
+#define TAILQ_INIT(head) do {                                           \
+        (head)->tqh_first = NULL;                                       \
+        (head)->tqh_last = &(head)->tqh_first;                          \
+} while (/*CONSTCOND*/0)
+
+#define TAILQ_INSERT_HEAD(head, elm, field) do {                        \
+        if (((elm)->field.tqe_next = (head)->tqh_first) != NULL)        \
+                (head)->tqh_first->field.tqe_prev =                     \
+                    &(elm)->field.tqe_next;                             \
+        else                                                            \
+                (head)->tqh_last = &(elm)->field.tqe_next;              \
+        (head)->tqh_first = (elm);                                      \
+        (elm)->field.tqe_prev = &(head)->tqh_first;                     \
+} while (/*CONSTCOND*/0)
+
+#define TAILQ_INSERT_TAIL(head, elm, field) do {                        \
+        (elm)->field.tqe_next = NULL;                                   \
+        (elm)->field.tqe_prev = (head)->tqh_last;                       \
+        *(head)->tqh_last = (elm);                                      \
+        (head)->tqh_last = &(elm)->field.tqe_next;                      \
+} while (/*CONSTCOND*/0)
+
+#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do {              \
+        if (((elm)->field.tqe_next = (listelm)->field.tqe_next) != NULL)\
+                (elm)->field.tqe_next->field.tqe_prev =                 \
+                    &(elm)->field.tqe_next;                             \
+        else                                                            \
+                (head)->tqh_last = &(elm)->field.tqe_next;              \
+        (listelm)->field.tqe_next = (elm);                              \
+        (elm)->field.tqe_prev = &(listelm)->field.tqe_next;             \
+} while (/*CONSTCOND*/0)
+
+#define TAILQ_INSERT_BEFORE(listelm, elm, field) do {                   \
+        (elm)->field.tqe_prev = (listelm)->field.tqe_prev;              \
+        (elm)->field.tqe_next = (listelm);                              \
+        *(listelm)->field.tqe_prev = (elm);                             \
+        (listelm)->field.tqe_prev = &(elm)->field.tqe_next;             \
+} while (/*CONSTCOND*/0)
+
+#define TAILQ_REMOVE(head, elm, field) do {                             \
+        if (((elm)->field.tqe_next) != NULL)                            \
+                (elm)->field.tqe_next->field.tqe_prev =                 \
+                    (elm)->field.tqe_prev;                              \
+        else                                                            \
+                (head)->tqh_last = (elm)->field.tqe_prev;               \
+        *(elm)->field.tqe_prev = (elm)->field.tqe_next;                 \
+} while (/*CONSTCOND*/0)
+
+#define TAILQ_FOREACH(var, head, field)                                 \
+        for ((var) = ((head)->tqh_first);                               \
+                (var);                                                  \
+                (var) = ((var)->field.tqe_next))
+
+#define TAILQ_FOREACH_REVERSE(var, head, headname, field)               \
+        for ((var) = (*(((struct headname *)((head)->tqh_last))->tqh_last));    \
+                (var);                                                  \
+                (var) = (*(((struct headname *)((var)->field.tqe_prev))->tqh_last)))
+
+/*
+ * Tail queue access methods.
+ */
+#define TAILQ_EMPTY(head)               ((head)->tqh_first == NULL)
+#define TAILQ_FIRST(head)               ((head)->tqh_first)
+#define TAILQ_NEXT(elm, field)          ((elm)->field.tqe_next)
+
+#define TAILQ_LAST(head, headname) \
+        (*(((struct headname *)((head)->tqh_last))->tqh_last))
+#define TAILQ_PREV(elm, headname, field) \
+        (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))
+
+
+/*
+ * Circular queue definitions.
+ */
+#define CIRCLEQ_HEAD(name, type)                                        \
+struct name {                                                           \
+        struct type *cqh_first;         /* first element */             \
+        struct type *cqh_last;          /* last element */              \
+}
+
+#define CIRCLEQ_HEAD_INITIALIZER(head)                                  \
+        { (void *)&head, (void *)&head }
+
+#define CIRCLEQ_ENTRY(type)                                             \
+struct {                                                                \
+        struct type *cqe_next;          /* next element */              \
+        struct type *cqe_prev;          /* previous element */          \
+}
+
+/*
+ * Circular queue functions.
+ */
+#define CIRCLEQ_INIT(head) do {                                         \
+        (head)->cqh_first = (void *)(head);                             \
+        (head)->cqh_last = (void *)(head);                              \
+} while (/*CONSTCOND*/0)
+
+#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do {            \
+        (elm)->field.cqe_next = (listelm)->field.cqe_next;              \
+        (elm)->field.cqe_prev = (listelm);                              \
+        if ((listelm)->field.cqe_next == (void *)(head))                \
+                (head)->cqh_last = (elm);                               \
+        else                                                            \
+                (listelm)->field.cqe_next->field.cqe_prev = (elm);      \
+        (listelm)->field.cqe_next = (elm);                              \
+} while (/*CONSTCOND*/0)
+
+#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do {           \
+        (elm)->field.cqe_next = (listelm);                              \
+        (elm)->field.cqe_prev = (listelm)->field.cqe_prev;              \
+        if ((listelm)->field.cqe_prev == (void *)(head))                \
+                (head)->cqh_first = (elm);                              \
+        else                                                            \
+                (listelm)->field.cqe_prev->field.cqe_next = (elm);      \
+        (listelm)->field.cqe_prev = (elm);                              \
+} while (/*CONSTCOND*/0)
+
+#define CIRCLEQ_INSERT_HEAD(head, elm, field) do {                      \
+        (elm)->field.cqe_next = (head)->cqh_first;                      \
+        (elm)->field.cqe_prev = (void *)(head);                         \
+        if ((head)->cqh_last == (void *)(head))                         \
+                (head)->cqh_last = (elm);                               \
+        else                                                            \
+                (head)->cqh_first->field.cqe_prev = (elm);              \
+        (head)->cqh_first = (elm);                                      \
+} while (/*CONSTCOND*/0)
+
+#define CIRCLEQ_INSERT_TAIL(head, elm, field) do {                      \
+        (elm)->field.cqe_next = (void *)(head);                         \
+        (elm)->field.cqe_prev = (head)->cqh_last;                       \
+        if ((head)->cqh_first == (void *)(head))                        \
+                (head)->cqh_first = (elm);                              \
+        else                                                            \
+                (head)->cqh_last->field.cqe_next = (elm);               \
+        (head)->cqh_last = (elm);                                       \
+} while (/*CONSTCOND*/0)
+
+#define CIRCLEQ_REMOVE(head, elm, field) do {                           \
+        if ((elm)->field.cqe_next == (void *)(head))                    \
+                (head)->cqh_last = (elm)->field.cqe_prev;               \
+        else                                                            \
+                (elm)->field.cqe_next->field.cqe_prev =                 \
+                    (elm)->field.cqe_prev;                              \
+        if ((elm)->field.cqe_prev == (void *)(head))                    \
+                (head)->cqh_first = (elm)->field.cqe_next;              \
+        else                                                            \
+                (elm)->field.cqe_prev->field.cqe_next =                 \
+                    (elm)->field.cqe_next;                              \
+} while (/*CONSTCOND*/0)
+
+#define CIRCLEQ_FOREACH(var, head, field)                               \
+        for ((var) = ((head)->cqh_first);                               \
+                (var) != (const void *)(head);                          \
+                (var) = ((var)->field.cqe_next))
+
+#define CIRCLEQ_FOREACH_REVERSE(var, head, field)                       \
+        for ((var) = ((head)->cqh_last);                                \
+                (var) != (const void *)(head);                          \
+                (var) = ((var)->field.cqe_prev))
+
+/*
+ * Circular queue access methods.
+ */
+#define CIRCLEQ_EMPTY(head)             ((head)->cqh_first == (void *)(head))
+#define CIRCLEQ_FIRST(head)             ((head)->cqh_first)
+#define CIRCLEQ_LAST(head)              ((head)->cqh_last)
+#define CIRCLEQ_NEXT(elm, field)        ((elm)->field.cqe_next)
+#define CIRCLEQ_PREV(elm, field)        ((elm)->field.cqe_prev)
+
+#define CIRCLEQ_LOOP_NEXT(head, elm, field)                             \
+        (((elm)->field.cqe_next == (void *)(head))                      \
+            ? ((head)->cqh_first)                                       \
+            : (elm->field.cqe_next))
+#define CIRCLEQ_LOOP_PREV(head, elm, field)                             \
+        (((elm)->field.cqe_prev == (void *)(head))                      \
+            ? ((head)->cqh_last)                                        \
+            : (elm->field.cqe_prev))
+
+#endif  /* !_SYS_QUEUE_H_ */
diff --git a/tools/hotplug/Linux/Makefile b/tools/hotplug/Linux/Makefile
new file mode 100644 (file)
index 0000000..c9edd0d
--- /dev/null
@@ -0,0 +1,96 @@
+XEN_ROOT = ../../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Init scripts.
+XEND_INITD = init.d/xend
+XENDOMAINS_INITD = init.d/xendomains
+XENDOMAINS_SYSCONFIG = init.d/sysconfig.xendomains
+
+# Xen configuration dir and configs to go there.
+XEN_CONFIG_DIR = /etc/xen
+
+# Xen script dir and scripts to go there.
+XEN_SCRIPT_DIR = /etc/xen/scripts
+XEN_SCRIPTS = network-bridge vif-bridge
+XEN_SCRIPTS += network-route vif-route
+XEN_SCRIPTS += network-nat vif-nat
+XEN_SCRIPTS += block
+XEN_SCRIPTS += block-enbd block-nbd
+XEN_SCRIPTS += vtpm vtpm-delete
+XEN_SCRIPTS += xen-hotplug-cleanup
+XEN_SCRIPTS += external-device-migrate
+XEN_SCRIPTS += vscsi
+XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh
+XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh vif-common.sh
+XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh vtpm-hotplug-common.sh
+XEN_SCRIPT_DATA += vtpm-migration.sh vtpm-impl
+
+XEN_HOTPLUG_DIR = /etc/hotplug
+XEN_HOTPLUG_SCRIPTS = xen-backend.agent
+
+UDEV_RULES_DIR = /etc/udev
+UDEV_RULES = xen-backend.rules xend.rules
+
+DI = $(if $(DISTDIR),$(shell readlink -f $(DISTDIR)),)
+DE = $(if $(DESTDIR),$(shell readlink -f $(DESTDIR)),)
+ifeq ($(findstring $(DI),$(DE)),$(DI))
+HOTPLUGS=install-hotplug install-udev
+else
+ifeq ($(shell [ -x /usr/bin/udevinfo ] && [ `/usr/bin/udevinfo -V | sed -e 's/^[^0-9]* \([0-9]\{1,\}\)[^0-9]\{0,\}/\1/'` -ge 059 ] && echo 1),1)
+HOTPLUGS=install-udev
+else
+HOTPLUGS=install-hotplug
+endif
+endif
+
+.PHONY: all
+all:
+
+.PHONY: build
+build:
+
+.PHONY: install
+install: all install-initd install-scripts $(HOTPLUGS)
+
+.PHONY: install-initd
+install-initd:
+       [ -d $(DESTDIR)/etc/init.d ] || $(INSTALL_DIR) $(DESTDIR)/etc/init.d
+       [ -d $(DESTDIR)/etc/sysconfig ] || $(INSTALL_DIR) $(DESTDIR)/etc/sysconfig
+       $(INSTALL_PROG) $(XEND_INITD) $(DESTDIR)/etc/init.d
+       $(INSTALL_PROG) $(XENDOMAINS_INITD) $(DESTDIR)/etc/init.d
+       $(INSTALL_PROG) $(XENDOMAINS_SYSCONFIG) $(DESTDIR)/etc/sysconfig/xendomains
+
+.PHONY: install-scripts
+install-scripts:
+       [ -d $(DESTDIR)$(XEN_SCRIPT_DIR) ] || \
+               $(INSTALL_DIR) $(DESTDIR)$(XEN_SCRIPT_DIR)
+       set -e; for i in $(XEN_SCRIPTS); \
+           do \
+           $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
+       done
+       set -e; for i in $(XEN_SCRIPT_DATA); \
+           do \
+           $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
+       done
+
+.PHONY: install-hotplug
+install-hotplug:
+       [ -d $(DESTDIR)$(XEN_HOTPLUG_DIR) ] || \
+               $(INSTALL_DIR) $(DESTDIR)$(XEN_HOTPLUG_DIR)
+       set -e; for i in $(XEN_HOTPLUG_SCRIPTS); \
+           do \
+           $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_HOTPLUG_DIR); \
+       done
+
+.PHONY: install-udev
+install-udev:
+       [ -d $(DESTDIR)$(UDEV_RULES_DIR) ] || \
+               $(INSTALL_DIR) $(DESTDIR)$(UDEV_RULES_DIR)/rules.d
+       set -e; for i in $(UDEV_RULES); \
+           do \
+           $(INSTALL_DATA) $$i $(DESTDIR)$(UDEV_RULES_DIR); \
+           ln -sf ../$$i $(DESTDIR)$(UDEV_RULES_DIR)/rules.d; \
+       done
+
+.PHONY: clean
+clean:
diff --git a/tools/hotplug/Linux/block b/tools/hotplug/Linux/block
new file mode 100644 (file)
index 0000000..8c61744
--- /dev/null
@@ -0,0 +1,381 @@
+#!/bin/bash
+
+dir=$(dirname "$0")
+. "$dir/block-common.sh"
+
+##
+# expand_dev name-or-path
+#
+# Print the argument as an absolute device path: values already starting
+# with '/' pass through unchanged; anything else gets a /dev/ prefix.
+# The result is printed without a trailing newline (echo -n).
+expand_dev() {
+  local dev
+  case $1 in
+  /*)
+    dev=$1
+    ;;
+  *)
+    dev=/dev/$1
+    ;;
+  esac
+  echo -n $dev
+}
+
+
+##
+# check_sharing device mode
+#
+# Check whether the device requested is already in use.  To use the device in
+# read-only mode, it may be in use in read-only mode, but may not be in use in
+# read-write anywhere at all.  To use the device in read-write mode, it must
+# not be in use anywhere at all.
+#
+# Prints one of
+#
+#    'local': the device may not be used because it is mounted in the current
+#             (i.e. the privileged domain) in a way incompatible with the
+#             requested mode;
+#    'guest': the device may not be used because it already mounted by a guest
+#             in a way incompatible with the requested mode; or
+#    'ok':    the device may be used.
+#
+check_sharing()
+{
+  local dev="$1"
+  local mode="$2"
+
+  local devmm=$(device_major_minor "$dev")
+  local file
+
+  # Build the /proc/mounts filter: for a read-write request nothing may be
+  # skipped ("^$" matches no line), while for a read-only request existing
+  # read-only mounts (options field beginning "ro," or "ro ") are tolerated.
+  if [ "$mode" = 'w' ]
+  then
+    toskip="^$"
+  else
+    toskip="^[^ ]* [^ ]* [^ ]* ro[, ]"
+  fi
+
+  # Local (privileged domain) check: compare the major:minor of every
+  # remaining mount source against the requested device.
+  for file in $(cat /proc/mounts | grep -v "$toskip" | cut -f 1 -d ' ')
+  do
+    if [ -e "$file" ]
+    then
+      local d=$(device_major_minor "$file")
+
+      if [ "$d" = "$devmm" ]
+      then
+        echo 'local'
+        return
+      fi
+    fi
+  done
+
+  # Guest check: walk every backend entry of this device type in xenstore
+  # and compare each recorded physical-device against ours.
+  local base_path="$XENBUS_BASE_PATH/$XENBUS_TYPE"
+  for dom in $(xenstore-list "$base_path")
+  do
+    for dev in $(xenstore-list "$base_path/$dom")
+    do
+      d=$(xenstore_read_default "$base_path/$dom/$dev/physical-device" "")
+
+      if [ "$d" = "$devmm" ]
+      then
+        if [ "$mode" = 'w' ]
+        then
+          # Read-write request: any user in a different VM is a conflict.
+          if ! same_vm $dom
+          then
+            echo 'guest'
+            return
+          fi
+        else
+          # Read-only request: only a read-write user in a different VM
+          # conflicts.
+          local m=$(xenstore_read "$base_path/$dom/$dev/mode")
+          m=$(canonicalise_mode "$m")
+
+          if [ "$m" = 'w' ]
+          then
+            if ! same_vm $dom
+            then
+              echo 'guest'
+              return
+            fi
+          fi
+        fi
+      fi
+    done
+  done
+
+  echo 'ok'
+}
+
+
+##
+# check_device_sharing dev mode
+#
+# Perform the sharing check for the given physical device and mode.
+# A canonicalised mode of '!' means "perform no checks at all", so we
+# return immediately in that case.  On conflict, do_ebusy does not return
+# (it exits via ebusy).
+#
+check_device_sharing()
+{
+  local dev="$1"
+  local mode=$(canonicalise_mode "$2")
+  local result
+
+  if [ "x$mode" = 'x!' ]
+  then
+    return 0
+  fi
+
+  result=$(check_sharing "$dev" "$mode")
+
+  if [ "$result" != 'ok' ]
+  then
+    do_ebusy "Device $dev is mounted " "$mode" "$result"
+  fi
+}
+
+
+##
+# check_file_sharing file dev mode
+#
+# Perform the sharing check for the given file mounted through the given
+# loopback interface, in the given mode.
+#
+check_file_sharing()
+{
+  local file="$1"
+  local dev="$2"
+  local mode="$3"
+
+  # NOTE(review): unlike check_device_sharing, result is not declared local
+  # here, so it leaks into the caller's scope -- confirm this is intentional.
+  result=$(check_sharing "$dev" "$mode")
+
+  if [ "$result" != 'ok' ]
+  then
+    do_ebusy "File $file is loopback-mounted through $dev,
+which is mounted " "$mode" "$result"
+  fi
+}
+
+
+##
+# do_ebusy prefix mode result
+#
+# Helper function for check_device_sharing and check_file_sharing, calling
+# ebusy with an error message constructed from the given prefix, mode, and
+# result from a call to check_sharing.  Releases the "block" lock first,
+# because ebusy exits the script and would otherwise leave it held.
+#
+do_ebusy()
+{
+  local prefix="$1"
+  local mode="$2"
+  local result="$3"
+
+  # Pick the wording for where the conflicting mount lives.
+  if [ "$result" = 'guest' ]
+  then
+    dom='a guest '
+    when='now'
+  else
+    dom='the privileged '
+    when='by a guest'
+  fi
+
+  # Pick the wording for the conflicting/requested access modes.
+  if [ "$mode" = 'w' ]
+  then
+    m1=''
+    m2=''
+  else
+    m1='read-write '
+    m2='read-only '
+  fi
+
+  release_lock "block"
+  ebusy \
+"${prefix}${m1}in ${dom}domain,
+and so cannot be mounted ${m2}${when}."
+}
+
+
+# Device type for this backend ('phy', 'file', or a custom type), as recorded
+# by the toolstack in xenstore.
+t=$(xenstore_read_default "$XENBUS_PATH/type" 'MISSING')
+
+case "$command" in
+  add)
+    phys=$(xenstore_read_default "$XENBUS_PATH/physical-device" 'MISSING')
+    if [ "$phys" != 'MISSING' ]
+    then
+      # Depending upon the hotplug configuration, it is possible for this
+      # script to be called twice, so just bail.
+      exit 0
+    fi
+
+    if [ -n "$t" ]
+    then
+      p=$(xenstore_read "$XENBUS_PATH/params")
+      mode=$(xenstore_read "$XENBUS_PATH/mode")
+    fi
+
+    case $t in 
+      phy)
+        dev=$(expand_dev $p)
+        FRONTEND_ID=$(xenstore_read "$XENBUS_PATH/frontend-id")
+        FRONTEND_UUID=$(xenstore_read_default \
+            "/local/domain/$FRONTEND_ID/vm" 'unknown')
+
+        # Resolve a symlinked device to its target before validating it.
+        if [ -L "$dev" ]
+        then
+          dev=$(readlink -f "$dev") || fatal "$dev link does not exist."
+        fi
+        test -e "$dev" || fatal "$dev does not exist."
+        test -b "$dev" || fatal "$dev is not a block device."
+
+        # The sharing check and the store write must be atomic with respect
+        # to other hotplug invocations, hence the lock.
+        claim_lock "block"
+        check_device_sharing "$dev" "$mode"
+       write_dev "$dev"
+        release_lock "block"
+       exit 0
+       ;;
+
+      file)
+        # Canonicalise the file, for sharing check comparison, and the mode
+        # for ease of use here.
+        file=$(readlink -f "$p") || fatal "$p does not exist."
+        test -f "$file" || fatal "$file does not exist."
+        mode=$(canonicalise_mode "$mode")
+
+        claim_lock "block"
+
+        # Refuse to attach a read-only file read-write.
+        if [ "$mode" = 'w' ] && ! stat "$file" -c %A | grep -q w
+        then
+          release_lock "block"
+          ebusy \
+"File $file is read-only, and so I will not
+mount it read-write in a guest domain."
+        fi
+
+        # Walk all loop devices: run the sharing check against the busy
+        # ones, and remember the first free one for our own use.
+        loopdev=''
+        for dev in /dev/loop*
+        do
+          if [ ! -b "$dev" ]
+          then
+            continue
+          fi
+
+          f=$(losetup "$dev" 2>/dev/null) || f=''
+
+          if [ "$f" ]
+          then
+            # $dev is in use.  Check sharing.
+            if [ "x$mode" = 'x!' ]
+            then
+              continue
+            fi
+
+            f=$(echo "$f" | sed -e 's/.*(\(.*\)).*/\1/g')
+
+            # $f is the filename, as read from losetup, but the loopback
+            # driver truncates filenames at 64 characters, so we need to go
+            # trawling through the store if it's longer than that.  Truncation
+            # is indicated by an asterisk at the end of the filename.
+            if expr index "$f" '*' >/dev/null
+            then
+              found=""
+              for dom in $(xenstore-list "$XENBUS_BASE_PATH")
+              do
+                for domdev in $(xenstore-list "$XENBUS_BASE_PATH/$dom")
+                do
+                  d=$(xenstore_read_default \
+                        "$XENBUS_BASE_PATH/$dom/$domdev/node" "")
+                  if [ "$d" = "$dev" ]
+                  then
+                    f=$(xenstore_read "$XENBUS_BASE_PATH/$dom/$domdev/params")
+                    found=1
+                    break 2
+                  fi
+                done
+              done
+
+              if [ ! "$found" ]
+              then
+                # This loopback device is in use by someone else, so skip it.
+                log debug "Loopback sharing check skips device $dev."
+                continue
+              fi
+            fi
+
+            # Canonicalise the filename for the comparison.
+
+            # I have seen this readlink fails because the filename given by
+            # losetup is only the basename.  This cannot happen when the loop
+            # device is set up through this script, because file is
+            # canonicalised above, but it may happen when loop devices are set
+            # up some other way.  This readlink may also conceivably fail if
+            # the file backing this loop device has been removed.
+
+            # For maximum safety, in the case that $f does not resolve, we
+            # assume that $file and $f are in the same directory.
+
+            # If you create a loopback filesystem, remove it and continue to
+            # run on it, and then create another file with the same name, then
+            # this check will block that -- don't do that.
+
+            # If you create loop devices through some other mechanism, use
+            # relative filenames, and then use the same filename through this
+            # script, then this check will block that -- don't do that either.
+
+            f=$(readlink -f "$f" || echo $(dirname "$file")/$(basename "$f"))
+
+
+            if [ "$f" = "$file" ]
+            then
+              check_file_sharing "$file" "$dev" "$mode"
+            fi
+          else
+            # $dev is not in use, so we'll remember it for use later; we want
+            # to finish the sharing check first.
+
+            if [ "$loopdev" = '' ]
+            then
+              loopdev="$dev"
+            fi
+          fi
+        done
+
+        if [ "$loopdev" = '' ]
+        then
+          release_lock "block"
+          fatal 'Failed to find an unused loop device'
+        fi
+
+        # Pass -r to losetup for read-only attachments, but only if this
+        # losetup advertises a read-only flag in its help text; stripping
+        # "-w" and "-!" from "-$mode" leaves "-r" only when mode is 'r'.
+        if LANG=C losetup -h 2>&1 | grep read-only >/dev/null
+        then
+          roflag="-$mode"; roflag="${roflag#-w}"; roflag="${roflag#-!}"
+        else
+          roflag=''
+        fi
+        do_or_die losetup $roflag "$loopdev" "$file"
+        xenstore_write "$XENBUS_PATH/node" "$loopdev"
+        write_dev "$loopdev"
+        release_lock "block"
+        exit 0
+       ;;
+
+      "")
+        claim_lock "block"
+        success
+        release_lock "block"
+       ;;
+    esac
+    ;;
+
+  remove)
+    case $t in 
+      phy)
+       exit 0
+       ;;
+
+      file)
+        # Detach the loop device we recorded under "node" at add time.
+        node=$(xenstore_read "$XENBUS_PATH/node")
+       losetup -d "$node"
+       exit 0
+       ;;
+
+      "")
+        exit 0
+       ;;
+    esac
+    ;;
+
+esac
+
+# If we've reached here, $t is neither phy nor file, so fire a helper script.
+[ -x /etc/xen/scripts/block-"$t" ] && \
+  /etc/xen/scripts/block-"$t" "$command" $node
diff --git a/tools/hotplug/Linux/block-common.sh b/tools/hotplug/Linux/block-common.sh
new file mode 100644 (file)
index 0000000..a0ebc9b
--- /dev/null
@@ -0,0 +1,116 @@
+#
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+
+# Pull in the shared hotplug helpers (logging, xenstore wrappers, locking).
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+
+# findCommand (from xen-hotplug-common.sh) sets $command from our arguments.
+findCommand "$@"
+
+if [ "$command" != "add" ] &&
+   [ "$command" != "remove" ]
+then
+  log err "Invalid command: $command"
+  exit 1
+fi
+
+
+# Abort immediately (via the :? expansion) if XENBUS_PATH is unset or empty.
+XENBUS_PATH="${XENBUS_PATH:?}"
+
+
+##
+# ebusy message...
+#
+# Report a busy/conflict condition: record the message under hotplug-error
+# and set hotplug-status to "busy" in the store, log it, and exit non-zero.
+# Does not return.
+ebusy()
+{
+  xenstore_write "$XENBUS_PATH/hotplug-error" "$*" \
+                 "$XENBUS_PATH/hotplug-status" busy
+  log err "$@"
+  exit 1
+}
+
+
+##
+# Print the given device's major and minor numbers, written in hex and
+# separated by a colon.  -L makes stat follow symlinks, so this works on
+# /dev symlinks as well as on the underlying node.
+device_major_minor()
+{
+  stat -L -c %t:%T "$1"
+}
+
+
+##
+# Write physical-device = MM,mm to the store, where MM and mm are the major 
+# and minor numbers of device respectively.
+#
+# @param device The device from which major and minor numbers are read, which
+#               will be written into the store.
+#
+write_dev() {
+  local mm
+  
+  mm=$(device_major_minor "$1")
+  if [ -z $mm ]
+  then
+    fatal "Backend device does not exist"
+  fi
+  xenstore_write "$XENBUS_PATH/physical-device" "$mm"
+
+  success
+}
+
+
+##
+# canonicalise_mode mode
+#
+# Takes the given mode, which may be r, w, ro, rw, w!, or rw!, or variations
+# thereof, and canonicalises them to one of
+#
+#   'r': perform checks for a new read-only mount;
+#   'w': perform checks for a read-write mount; or
+#   '!': perform no checks at all.
+#
+# `expr index STRING CHAR` exits non-zero when CHAR does not occur in
+# STRING, so each negated test below means "mode does not contain CHAR".
+#
+canonicalise_mode()
+{
+  local mode="$1"
+
+  if ! expr index "$mode" 'w' >/dev/null
+  then
+    echo 'r'
+  elif ! expr index "$mode" '!' >/dev/null
+  then
+    echo 'w'
+  else
+    echo '!'
+  fi
+}
+
+
+##
+# same_vm domid
+#
+# Succeed iff the given domain belongs to the same VM as the frontend we
+# are working for.  $FRONTEND_UUID is a global set by the calling script
+# (see the phy case in the block script) before the sharing checks run.
+same_vm()
+{
+  local otherdom="$1"
+  # Note that othervm can be MISSING here, because Xend will be racing with
+  # the hotplug scripts -- the entries in /local/domain can be removed by
+  # Xend before the hotplug scripts have removed the entry in
+  # /local/domain/0/backend/.  In this case, we want to pretend that the
+  # VM is the same as FRONTEND_UUID, because that way the 'sharing' will be
+  # allowed.
+  local othervm=$(xenstore_read_default "/local/domain/$otherdom/vm"         \
+                  "$FRONTEND_UUID")
+
+  [ "$FRONTEND_UUID" = "$othervm" ]
+}
+
diff --git a/tools/hotplug/Linux/block-enbd b/tools/hotplug/Linux/block-enbd
new file mode 100644 (file)
index 0000000..67faa84
--- /dev/null
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Usage: block-enbd [bind server ctl_port |unbind node]
+#
+# The node argument to unbind is the name of the device node we are to
+# unbind.
+#
+# This assumes you're running a correctly configured server at the other end!
+
+dir=$(dirname "$0")
+. "$dir/block-common.sh"
+
+# add: probe /dev/nd* in order and bind the first device the client accepts
+# ($2 = server, $3 = control port per the usage comment above); record it in
+# the store via write_dev.  remove: detach the named node ($2).
+# NOTE(review): this is the enbd variant but invokes `nbd-client` -- confirm
+# that is the intended client binary here.
+case "$command" in
+  add)
+    for dev in /dev/nd*; do
+      if nbd-client $2:$3 $dev; then
+        write_dev $dev
+        exit 0
+      fi
+    done
+    exit 1
+    ;;
+  remove)
+    nbd-client -d $2
+    exit 0
+    ;;
+esac
diff --git a/tools/hotplug/Linux/block-nbd b/tools/hotplug/Linux/block-nbd
new file mode 100644 (file)
index 0000000..b29b315
--- /dev/null
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Usage: block-nbd [bind server ctl_port |unbind node]
+#
+# The node argument to unbind is the name of the device node we are to
+# unbind.
+#
+# This assumes you're running a correctly configured server at the other end!
+
+dir=$(dirname "$0")
+. "$dir/block-common.sh"
+
+# add: probe /dev/nbd* in order and bind the first device nbd-client accepts
+# ($2 = server, $3 = control port per the usage comment above); record it in
+# the store via write_dev.  remove: detach the named node ($2).
+case "$command" in
+  add)
+    for dev in /dev/nbd*; do
+      if nbd-client $2 $3 $dev; then
+        write_dev $dev
+        exit 0
+      fi
+    done
+    exit 1
+    ;;
+  remove)
+    nbd-client -d $2
+    exit 0
+    ;;
+esac
diff --git a/tools/hotplug/Linux/external-device-migrate b/tools/hotplug/Linux/external-device-migrate
new file mode 100644 (file)
index 0000000..a411348
--- /dev/null
@@ -0,0 +1,98 @@
+#!/bin/bash
+
+# Copyright (c) 2005 IBM Corporation
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+set -x
+
+# This script is called by XenD for migration of external devices
+# It does not handle the migration of those devices itself, but
+# passes the requests on to further applications
+# It handles the low-level command line parsing and some of the
+# synchronization
+
+dir=$(dirname "$0")
+. "$dir/logging.sh"
+
+
+# Print the command-line help for this script to stdout.
+function ext_dev_migrate_usage() {
+cat <<EOF
+Pass the following command line parameters to the script:
+
+-step <n>              : n-th migration step
+-host <host>           : the destination host
+-domname <domain name> : name of the domain that is migrating
+-type <device type>    : the type of device that is migrating
+-subtype <dev. subtype>: the subtype of the device
+-recover               : indicates recovery request; an error
+                         occurred during migration
+-help                  : display this help screen
+EOF
+}
+
+# Parse the command line parameters. The following parameters must be
+# passed as the first ones in the sequence:
+#  -step       [required]
+#  -host       [required]
+#  -domname    [required]
+#  -type       [required]
+#  -subtype    [optional]
+#  -recover    [optional]
+# The remaining ones will be passed to the called function.
+#
+# Sources "$dir/<type><subtype>-migration.sh" and then invokes either
+# <type>_recover or <type>_migration_step from it, forwarding the
+# leftover arguments.
+function evaluate_params()
+{
+       local step host domname typ recover filename func stype
+       stype=""
+       while [ $# -ge 1 ]; do
+               case "$1" in
+               -step)          step=$2; shift; shift;;
+               -host)          host=$2; shift; shift;;
+               -domname)       domname=$2; shift; shift;;
+               -type)          typ=$2; shift; shift;;
+               -subtype)       stype=$2; shift; shift;;
+               -recover)       recover=1; shift;;
+               -help)          ext_dev_migrate_usage; exit 0;;
+               *)              break;;
+               esac
+       done
+
+       if [ "$step"    = "" -o \
+            "$host"    = "" -o \
+            "$typ"     = "" -o \
+            "$domname" = "" ]; then
+               echo "Error: Parameter(s) missing (-step/-host/-type/-domname)" 1>&2
+               echo "" 1>&2
+               echo "$0 -help for usage." 1>&2
+               exit 1
+       fi
+
+       # NOTE(review): $filename is unquoted in the -r test, and the eval
+       # below word-splits $host/$domname/$* -- breaks on values containing
+       # whitespace; confirm whether callers can ever pass such values.
+       filename="$dir/$typ$stype-migration.sh"
+       if [ ! -r $filename ]; then
+               echo "Error: Could not find script '$filename'"
+               return
+       fi
+       . "$filename"
+
+       if [ "$recover" = "1" ]; then
+               func="$typ"_recover
+               eval $func $host $domname $step $*
+       else
+               func="$typ"_migration_step
+               eval $func $host $domname $step $*
+       fi
+}
+
+evaluate_params "$@"
diff --git a/tools/hotplug/Linux/init.d/sysconfig.xendomains b/tools/hotplug/Linux/init.d/sysconfig.xendomains
new file mode 100644 (file)
index 0000000..e93b1a4
--- /dev/null
@@ -0,0 +1,137 @@
+## Path: System/xen
+## Description: xen domain start/stop on boot
+## Type: string
+## Default: 
+#
+# The xendomains script can send SysRq requests to domains on shutdown.
+# If you don't want to MIGRATE, SAVE, or SHUTDOWN, this may be a possibility
+# to do a quick and dirty shutdown ("s e i u o") or at least sync the disks
+# of the domains ("s").
+#
+XENDOMAINS_SYSRQ=""
+
+## Type: integer 
+## Default: 100000
+#
+# If XENDOMAINS_SYSRQ is set, this variable determines how long to wait
+# (in microseconds) after each SysRq, so the domain has a chance to react.
+# If you want to a quick'n'dirty shutdown via SysRq, you may want to set
+# it to a relatively high value (1200000).
+#
+XENDOMAINS_USLEEP=100000
+
+## Type: integer
+## Default: 5000000
+#
+# When creating a guest domain, it is sensible to allow a little time for it
+# to get started before creating another domain or proceeding through the
+# boot process.  Without this, the booting guests will thrash the disk as they
+# start up.  This timeout (in microseconds) specifies the delay after guest
+# domain creation.
+#
+XENDOMAINS_CREATE_USLEEP=5000000
+
+## Type: string
+## Default: ""
+#
+# Set this to a non-empty string if you want to migrate virtual machines
+# on shutdown. The string will be passed to the xm migrate DOMID command
+# as is: It should contain the target IP address of the physical machine
+# to migrate to and optionally parameters like --live. Leave empty if
+# you don't want to try virtual machine relocation on shutdown.
+# If migration succeeds, neither SAVE nor SHUTDOWN will be executed for
+# that domain.
+#
+XENDOMAINS_MIGRATE=""
+
+## Type: string
+## Default: /var/lib/xen/save
+#
+# Directory to save running domains to when the system (dom0) is
+# shut down. Will also be used to restore domains from if XENDOMAINS_RESTORE
+# is set (see below). Leave empty to disable domain saving on shutdown 
+# (e.g. because you rather shut domains down).
+# If domain saving does succeed, SHUTDOWN will not be executed.
+#
+XENDOMAINS_SAVE=/var/lib/xen/save
+
+## Type: string
+## Default: "--halt --wait"
+#
+# If neither MIGRATE nor SAVE were enabled or if they failed, you can
+# try to shut down a domain by sending it a shutdown request. To do this,
+# set this to "--halt --wait". Omit the "--wait" flag to avoid waiting
+# for the domain to be really down. Leave empty to skip domain shutdown.
+#
+XENDOMAINS_SHUTDOWN="--halt --wait"
+
+## Type: string
+## Default: "--all --halt --wait"
+#
+# After we have gone over all virtual machines (resp. all automatically
+# started ones, see XENDOMAINS_AUTO_ONLY below) in a loop and sent SysRq,
+# migrated, saved and/or shutdown according to the settings above, we
+# might want to shutdown the virtual machines that are still running
+# for some reason or another. To do this, set this variable to
+# "--all --halt --wait", it will be passed to xm shutdown.
+# Leave it empty not to do anything special here.
+# (Note: This will hit all virtual machines, even if XENDOMAINS_AUTO_ONLY
+# is set.)
+# 
+XENDOMAINS_SHUTDOWN_ALL="--all --halt --wait"
+
+## Type: boolean
+## Default: true
+#
+# This variable determines whether saved domains from XENDOMAINS_SAVE
+# will be restored on system startup. 
+#
+XENDOMAINS_RESTORE=true
+
+## Type: string
+## Default: /etc/xen/auto
+#
+# This variable sets the directory where domains configurations
+# are stored that should be started on system startup automatically.
+# Leave empty if you don't want to start domains automatically
+# (or just don't place any xen domain config files in that dir).
+# Note that the script tries to be clever if both RESTORE and AUTO are 
+# set: It will first restore saved domains and then only start domains
+# in AUTO which are not running yet. 
+# Note that the name matching is somewhat fuzzy.
+#
+XENDOMAINS_AUTO=/etc/xen/auto
+
+## Type: boolean
+## Default: false
+# 
+# If this variable is set to "true", only the domains started via config 
+# files in XENDOMAINS_AUTO will be treated according to XENDOMAINS_SYSRQ,
+# XENDOMAINS_MIGRATE, XENDOMAINS_SAVE, XENDOMAINS_SHUTDOWN; otherwise
+# all running domains will be. 
+# Note that the name matching is somewhat fuzzy.
+# 
+XENDOMAINS_AUTO_ONLY=false
+
+## Type: integer
+## Default: 300
+#
+# On xendomains stop, a number of xm commands (xm migrate, save, shutdown,
+# shutdown --all) may be executed. In the worst case, these commands may
+# stall forever, which will prevent a successful shutdown of the machine.
+# If this variable is non-zero, the script will set up a watchdog timer
+# for every of these xm commands and time it out after the number of seconds
+# specified by this variable.
+# Note that SHUTDOWN_ALL will not be called if no virtual machines or only
+# zombies are still running, so you don't need to enable this timeout just
+# for the zombie case.
+# The setting should be large enough to make sure that migrate/save/shutdown
+# can succeed. If you do live migrations, keep in mind that live migration
+# of a 1GB machine over Gigabit ethernet may actually take something like
+# 100s (assuming that live migration uses 10% of the network bandwidth).
+# Depending on the virtual machine, a shutdown may also require a significant
+# amount of time. So better setup this variable to a huge number and hope the
+# watchdog never fires.
+#
+XENDOMAINS_STOP_MAXWAIT=300
+
diff --git a/tools/hotplug/Linux/init.d/xend b/tools/hotplug/Linux/init.d/xend
new file mode 100755 (executable)
index 0000000..32dfc84
--- /dev/null
@@ -0,0 +1,68 @@
+#!/bin/bash
+#
+# xend         Script to start and stop the Xen control daemon.
+#
+# Author:       Keir Fraser <keir.fraser@cl.cam.ac.uk>
+#
+# chkconfig: 2345 98 01
+# description: Starts and stops the Xen control daemon.
+### BEGIN INIT INFO
+# Provides:          xend
+# Required-Start:    $syslog $remote_fs
+# Should-Start:
+# Required-Stop:     $syslog $remote_fs
+# Should-Stop:
+# Default-Start:     3 4 5
+# Default-Stop:      0 1 2 6
+# Default-Enabled:   yes
+# Short-Description: Start/stop xend
+# Description:       Starts and stops the Xen control daemon.
+### END INIT INFO
+
+if ! grep -q "control_d" /proc/xen/capabilities ; then
+       exit 0
+fi
+
+# Wait for Xend to be up: poll `xend status` up to 10 times, one second
+# apart, printing a dot per retry.  Returns once status succeeds or the
+# retries are exhausted.
+function await_daemons_up
+{
+       i=1
+       rets=10
+       xend status
+       while [ $? -ne 0 -a $i -lt $rets ]; do
+           sleep 1
+           echo -n .
+           i=$(($i + 1))
+           xend status
+       done
+}
+
+case "$1" in
+  start)
+       # Record running state for RH-style subsys tracking, then start the
+       # daemon and wait for it to answer `xend status`.
+       touch /var/lock/subsys/xend
+       xend start
+       await_daemons_up
+       ;;
+  stop)
+       xend stop
+       rm -f /var/lock/subsys/xend
+       ;;
+  status)
+       xend status
+       ;;
+  reload)
+        xend reload
+        ;;
+  restart|force-reload)
+       xend restart
+       await_daemons_up
+       ;;
+  *)
+       # do not advertise unreasonable commands that there is no reason
+       # to use with this device
+       echo $"Usage: $0 {start|stop|status|restart|reload|force-reload}"
+       exit 1
+esac
+
+exit $?
+
+
diff --git a/tools/hotplug/Linux/init.d/xendomains b/tools/hotplug/Linux/init.d/xendomains
new file mode 100644 (file)
index 0000000..e353441
--- /dev/null
@@ -0,0 +1,554 @@
+#!/bin/bash
+#
+# /etc/init.d/xendomains
+# Start / stop domains automatically when domain 0 boots / shuts down.
+#
+# chkconfig: 345 99 00
+# description: Start / stop Xen domains.
+#
+# This script offers fairly basic functionality.  It should work on Redhat
+# but also on LSB-compliant SuSE releases and on Debian with the LSB package
+# installed.  (LSB is the Linux Standard Base)
+#
+# Based on the example in the "Designing High Quality Integrated Linux
+# Applications HOWTO" by Avi Alkalay
+# <http://www.tldp.org/HOWTO/HighQuality-Apps-HOWTO/>
+#
+### BEGIN INIT INFO
+# Provides:          xendomains
+# Required-Start:    $syslog $remote_fs xend
+# Should-Start:
+# Required-Stop:     $syslog $remote_fs xend
+# Should-Stop:
+# Default-Start:     3 4 5
+# Default-Stop:      0 1 2 6
+# Default-Enabled:   yes
+# Short-Description: Start/stop secondary xen domains
+# Description:       Start / stop domains automatically when domain 0 
+#                    boots / shuts down.
+### END INIT INFO
+
+# Correct exit code would probably be 5, but it's enough 
+# if xend complains if we're not running as privileged domain
+if ! [ -e /proc/xen/privcmd ]; then
+       exit 0
+fi
+
+LOCKFILE=/var/lock/subsys/xendomains
+XENDOM_CONFIG=/etc/sysconfig/xendomains
+
+test -r $XENDOM_CONFIG || { echo "$XENDOM_CONFIG not existing";
+       if [ "$1" = "stop" ]; then exit 0;
+       else exit 6; fi; }
+
+. $XENDOM_CONFIG
+
+# Use the SUSE rc_ init script functions;
+# emulate them on LSB, RH and other systems
+if test -e /etc/rc.status; then
+    # SUSE rc script library
+    . /etc/rc.status
+else    
+    _cmd=$1
+    declare -a _SMSG
+    if test "${_cmd}" = "status"; then
+       _SMSG=(running dead dead unused unknown)
+       _RC_UNUSED=3
+    else
+       _SMSG=(done failed failed missed failed skipped unused failed failed)
+       _RC_UNUSED=6
+    fi
+    if test -e /etc/init.d/functions; then
+       # REDHAT
+       . /etc/init.d/functions
+       echo_rc()
+       {
+           #echo -n "  [${_SMSG[${_RC_RV}]}] "
+           if test ${_RC_RV} = 0; then
+               success "  [${_SMSG[${_RC_RV}]}] "
+           else
+               failure "  [${_SMSG[${_RC_RV}]}] "
+           fi
+       }
+    elif test -e /lib/lsb/init-functions; then
+       # LSB    
+       . /lib/lsb/init-functions
+        if alias log_success_msg >/dev/null 2>/dev/null; then
+         echo_rc()
+         {
+              echo "  [${_SMSG[${_RC_RV}]}] "
+         }
+        else
+         echo_rc()
+         {
+           if test ${_RC_RV} = 0; then
+               log_success_msg "  [${_SMSG[${_RC_RV}]}] "
+           else
+               log_failure_msg "  [${_SMSG[${_RC_RV}]}] "
+           fi
+         }
+        fi
+    else    
+       # emulate it
+       echo_rc()
+       {
+           echo "  [${_SMSG[${_RC_RV}]}] "
+       }
+    fi
+    rc_reset() { _RC_RV=0; }
+    rc_failed()
+    {
+       if test -z "$1"; then 
+           _RC_RV=1;
+       elif test "$1" != "0"; then 
+           _RC_RV=$1; 
+       fi
+       return ${_RC_RV}
+    }
+    rc_check()
+    {
+       return rc_failed $?
+    }  
+    rc_status()
+    {
+       rc_failed $?
+       if test "$1" = "-r"; then _RC_RV=0; shift; fi
+       if test "$1" = "-s"; then rc_failed 5; echo_rc; rc_failed 3; shift; fi
+       if test "$1" = "-u"; then rc_failed ${_RC_UNUSED}; echo_rc; rc_failed 3; shift; fi
+       if test "$1" = "-v"; then echo_rc; shift; fi
+       if test "$1" = "-r"; then _RC_RV=0; shift; fi
+       return ${_RC_RV}
+    }
+    rc_exit() { exit ${_RC_RV}; }
+    rc_active() 
+    {
+       if test -z "$RUNLEVEL"; then read RUNLEVEL REST < <(/sbin/runlevel); fi
+       if test -e /etc/init.d/S[0-9][0-9]${1}; then return 0; fi
+       return 1
+    }
+fi
+
+# Provide a usleep fallback when the binary is absent: convert the
+# microsecond argument to (truncated) whole seconds and sleep that long.
+if ! which usleep >&/dev/null
+then
+  usleep()
+  {
+    if [ -n "$1" ]
+    then
+      sleep $(( $1 / 1000000 ))
+    fi
+  }
+fi
+
+# Reset status of this service
+rc_reset
+
+##
+# Returns 0 (success) if the given parameter names a directory, and that
+# directory is not empty.
+#
+contains_something()
+{
+  if [ -d "$1" ] && [ `/bin/ls $1 | wc -l` -gt 0 ]
+  then
+    return 0
+  else
+    return 1
+  fi
+}
+
+# read name from xen config file
+# Dry-runs `xm create` on the given config and extracts the "(name ...)"
+# field from its SXP output into the global NM.
+rdname()
+{
+    NM=$(xm create --quiet --dryrun --defconfig "$1" |
+         sed -n 's/^.*(name \(.*\))$/\1/p')
+}
+
+# Build the global NAMES as a '|'-separated list of the domain names of all
+# config files in $XENDOMAINS_AUTO (empty if the directory is empty).
+rdnames()
+{
+    NAMES=
+    if ! contains_something "$XENDOMAINS_AUTO"
+    then 
+       return
+    fi
+    for dom in $XENDOMAINS_AUTO/*; do
+       rdname $dom
+       # NOTE(review): $NAMES is unquoted in this test -- fine while names
+       # contain no whitespace; confirm.
+       if test -z $NAMES; then 
+           NAMES=$NM; 
+       else
+           NAMES="$NAMES|$NM"
+       fi
+    done
+}
+
+# Parse one line of `xm list -l` SXP output, accumulating into the globals
+# $name and $id; "(domain" resets both.  Returns 0 once both are known.
+# NOTE(review): on bash >= 3.2 a quoted right-hand side of =~ is matched
+# as a literal string, so "\(domain" would only match a literal backslash;
+# verify the target bash version, or these patterns may never match.
+parseln()
+{
+    if [[ "$1" =~ "\(domain" ]]; then
+        name=;id=
+    else if [[ "$1" =~ "\(name" ]]; then
+        name=$(echo $1 | sed -e 's/^.*(name \(.*\))$/\1/')
+    else if [[ "$1" =~ "\(domid" ]]; then
+        id=$(echo $1 | sed -e 's/^.*(domid \(.*\))$/\1/')
+    fi; fi; fi
+
+    [ -n "$name" -a -n "$id" ] && return 0 || return 1
+}
+
+# Succeed (return 0) iff the domain configured in file $1 is currently
+# running, by matching its config name (via rdname/$NM) against the names
+# reported by `xm list -l`; dom0 (id 0) is skipped.
+is_running()
+{
+    rdname $1
+    RC=1
+    name=;id=
+    while read LN; do
+       parseln "$LN" || continue
+       if test $id = 0; then continue; fi
+       case $name in 
+           ($NM)
+               RC=0
+               ;;
+       esac
+    done < <(xm list -l | grep '(\(domain\|domid\|name\)')
+    return $RC
+}
+
+# start: phase 1 restores any saved domain images from $XENDOMAINS_SAVE
+# (when XENDOMAINS_RESTORE is enabled), phase 2 creates every domain with a
+# config in $XENDOMAINS_AUTO that was neither just restored nor already
+# running.  A lockfile guards against double starts.
+start() 
+{
+    if [ -f $LOCKFILE ]; then 
+       echo -e "xendomains already running (lockfile exists)"
+       return; 
+    fi
+
+    saved_domains=" "
+    if [ "$XENDOMAINS_RESTORE" = "true" ] &&
+       contains_something "$XENDOMAINS_SAVE"
+    then
+       mkdir -p $(dirname "$LOCKFILE")
+       touch $LOCKFILE
+       echo -n "Restoring Xen domains:"
+       saved_domains=`ls $XENDOMAINS_SAVE`
+        for dom in $XENDOMAINS_SAVE/*; do
+            if [ -f $dom ] ; then
+                # Only restore files carrying the save-image magic header.
+                HEADER=`head -c 16 $dom | head -n 1 2> /dev/null`
+                if [ $HEADER = "LinuxGuestRecord" ]; then
+                    echo -n " ${dom##*/}"
+                    XMR=`xm restore $dom 2>&1 1>/dev/null`
+                    #xm restore $dom
+                    if [ $? -ne 0 ]; then
+                        echo -e "\nAn error occurred while restoring domain ${dom##*/}:\n$XMR"
+                        rc_failed $?
+                        echo -e '!'
+                    else
+                        # mv $dom ${dom%/*}/.${dom##*/}
+                        rm $dom
+                    fi
+                fi
+            fi
+        done
+       echo -e
+    fi
+
+    if contains_something "$XENDOMAINS_AUTO"
+    then
+       touch $LOCKFILE
+       echo -n "Starting auto Xen domains:"
+       # We expect config scripts for auto starting domains to be in
+       # XENDOMAINS_AUTO - they could just be symlinks to files elsewhere
+
+       # Create all domains with config files in XENDOMAINS_AUTO.
+       # TODO: We should record which domain name belongs 
+       # so we have the option to selectively shut down / migrate later
+       # If a domain statefile from $XENDOMAINS_SAVE matches a domain name
+       # in $XENDOMAINS_AUTO, do not try to start that domain; if it didn't 
+       # restore correctly it requires administrative attention.
+       for dom in $XENDOMAINS_AUTO/*; do
+           echo -n " ${dom##*/}"
+           shortdom=$(echo $dom | sed -n 's/^.*\/\(.*\)$/\1/p')
+           echo $saved_domains | grep -w $shortdom > /dev/null
+           if [ $? -eq 0 ] || is_running $dom; then
+               echo -n "(skip)"
+           else
+               XMC=`xm create --quiet --defconfig $dom`
+               if [ $? -ne 0 ]; then
+                   echo -e "\nAn error occurred while creating domain ${dom##*/}: $XMC\n"
+                   rc_failed $?
+                   echo -e '!'
+               else
+                   usleep $XENDOMAINS_CREATE_USLEEP
+               fi
+           fi
+       done
+    fi
+}
+
+# all_zombies: return 0 iff every running domain (except Domain-0) is a
+# dying/zombie domain.  NOTE(review): assumes parseln (defined earlier in
+# this script) also sets $state to the xm state string — confirm there.
+all_zombies()
+{
+    name=;id=
+    while read LN; do
+       parseln "$LN" || continue
+       if test $id = 0; then continue; fi
+       # "-b---d" / "-----d": blocked-dying or dying, i.e. a zombie.
+       if test "$state" != "-b---d" -a "$state" != "-----d"; then
+           return 1;
+       fi
+    done < <(xm list -l | grep '(\(domain\|domid\|name\)')
+    return 0
+}
+
+# Wait for max $XENDOMAINS_STOP_MAXWAIT for xm $1 to finish;
+# if it has not exited by that time kill it, so the init script will
+# succeed within a finite amount of time; if $2 is nonnull, it will
+# kill the command as well as soon as no domain (except for zombies)
+# are left (used for shutdown --all). Third parameter, if any, suppresses
+# output of dots per working state (formatting issues)
+# Runs in the background; the caller kills us once xm succeeds on its own.
+watchdog_xm()
+{
+    # MAXWAIT of 0/empty means "no watchdog": exit this background shell.
+    if test -z "$XENDOMAINS_STOP_MAXWAIT" -o "$XENDOMAINS_STOP_MAXWAIT" = "0"; then
+       exit
+    fi
+
+    usleep 20000
+    for no in `seq 0 $XENDOMAINS_STOP_MAXWAIT`; do
+       # exit if xm save/migrate/shutdown is finished
+       PSAX=`ps axlw | grep "xm $1" | grep -v grep`
+       if test -z "$PSAX"; then exit; fi
+       if ! test -n "$3"; then echo -n '.'; fi
+       sleep 1
+       # go to kill immediately if there's only zombies left
+       if all_zombies && test -n "$2"; then break; fi
+    done
+    sleep 1
+    # ps axlw columns: F UID PID PPID ... — so PSPID is the xm pid.
+    read PSF PSUID PSPID PSPPID < <(echo "$PSAX")
+    # kill xm $1
+    kill $PSPID >/dev/null 2>&1
+    
+    echo -e .
+}
+
+# stop: take every running domain down, trying (in order) sysrq keys,
+# migration, save-to-disk, and polite shutdown, each guarded by a
+# watchdog_xm background timer.  Finally removes $LOCKFILE.
+stop()
+{
+    # Silence stderr for the duration; fd 3 keeps the original for restore.
+    exec 3>&2 2> /dev/null
+    
+    # Collect list of domains to shut down
+    if test "$XENDOMAINS_AUTO_ONLY" = "true"; then
+       rdnames
+    fi
+    echo -n "Shutting down Xen domains:"
+    name=;id=
+    while read LN; do
+       parseln "$LN" || continue
+       if test $id = 0; then continue; fi
+       echo -n " $name"
+       if test "$XENDOMAINS_AUTO_ONLY" = "true"; then
+           # eval so $NAMES (built by rdnames) expands into case patterns.
+           eval "
+           case \"\$name\" in
+               ($NAMES)
+                   # nothing
+                   ;;
+               (*)
+                   echo -e '(skip)'
+                   continue
+                   ;;
+           esac
+           "
+       fi
+       # XENDOMAINS_SYSRQ should be something like just "s" 
+       # or "s e i u" or even "s e s i u o"
+       # for the latter, you should set XENDOMAINS_USLEEP to 1200000 or so
+       if test -n "$XENDOMAINS_SYSRQ"; then
+           for sysrq in $XENDOMAINS_SYSRQ; do
+               echo -n "(SR-$sysrq)"
+               XMR=`xm sysrq $id $sysrq 2>&1 1>/dev/null`
+               if test $? -ne 0; then
+                   echo -e "\nAn error occurred while doing sysrq on domain:\n$XMR\n"
+                   rc_failed $?
+                   echo -n '!'
+               fi
+               # usleep just ignores empty arg
+               usleep $XENDOMAINS_USLEEP
+           done
+       fi
+       # Zombies cannot be migrated/saved/shut down; skip them.
+       if test "$state" = "-b---d" -o "$state" = "-----d"; then
+           echo -n "(zomb)"
+           continue
+       fi
+       if test -n "$XENDOMAINS_MIGRATE"; then
+           echo -n "(migr)"
+           watchdog_xm migrate &
+           WDOG_PID=$!
+           XMR=`xm migrate $id $XENDOMAINS_MIGRATE 2>&1 1>/dev/null`
+           if test $? -ne 0; then
+               echo -e "\nAn error occurred while migrating domain:\n$XMR\n"
+               rc_failed $?
+               echo -e '!'
+
+               kill $WDOG_PID >/dev/null 2>&1
+           else
+               kill $WDOG_PID >/dev/null 2>&1
+               
+               echo -e .
+               usleep 1000
+               # Migration succeeded: domain is gone, skip save/shutdown.
+               continue
+           fi
+       fi
+       if test -n "$XENDOMAINS_SAVE"; then
+           echo -n "(save)"
+           watchdog_xm save &
+           WDOG_PID=$!
+           mkdir -p "$XENDOMAINS_SAVE"
+           XMR=`xm save $id $XENDOMAINS_SAVE/$name 2>&1 1>/dev/null`
+           if test $? -ne 0; then
+               echo -e "\nAn error occurred while saving domain:\n$XMR\n"
+               rc_failed $?
+               echo -e '!'
+               kill $WDOG_PID >/dev/null 2>&1
+           else
+               kill $WDOG_PID >/dev/null 2>&1
+               echo -e .
+               usleep 1000
+               continue
+           fi
+       fi
+       if test -n "$XENDOMAINS_SHUTDOWN"; then
+           # XENDOMAINS_SHUTDOWN should be "--halt --wait"
+           echo -n "(shut)"
+           watchdog_xm shutdown &
+           WDOG_PID=$!
+           XMR=`xm shutdown $id $XENDOMAINS_SHUTDOWN 2>&1 1>/dev/null`
+           if test $? -ne 0; then
+               echo -e "\nAn error occurred while shutting down domain:\n$XMR\n"
+               rc_failed $?
+               echo -e '!'
+           fi
+           kill $WDOG_PID >/dev/null 2>&1
+       fi
+    done < <(xm list -l | grep '(\(domain\|domid\|name\)')
+
+    # NB. this shuts down ALL Xen domains (politely), not just the ones in
+    # AUTODIR/*
+    # This is because it's easier to do ;-) but arguably if this script is run
+    # on system shutdown then it's also the right thing to do.
+    if ! all_zombies && test -n "$XENDOMAINS_SHUTDOWN_ALL"; then
+       # XENDOMAINS_SHUTDOWN_ALL should be "--all --halt --wait"
+       echo -n " SHUTDOWN_ALL "
+       watchdog_xm shutdown 1 false &
+       WDOG_PID=$!
+       XMR=`xm shutdown $XENDOMAINS_SHUTDOWN_ALL 2>&1 1>/dev/null`
+       if test $? -ne 0; then
+           echo -e "\nAn error occurred while shutting down all domains: $XMR\n"
+           rc_failed $?
+           echo -e '!'
+       fi
+       kill $WDOG_PID >/dev/null 2>&1
+    fi
+
+    # Unconditionally delete lock file
+    rm -f $LOCKFILE
+    
+    # Restore stderr.
+    exec 2>&3
+}
+
+# check_domain_up name
+# Succeeds iff a running domain other than Domain-0 is called $1.
+check_domain_up()
+{
+    name=;id=
+    while read LN; do
+       parseln "$LN" || continue
+       test $id = 0 && continue
+       case $name in 
+           ($1) return 0 ;;
+       esac
+    done < <(xm list -l | grep '(\(domain\|domid\|name\)')
+    return 1
+}
+
+# check_all_auto_domains_up
+# Verify every domain configured in XENDOMAINS_AUTO is currently running;
+# print missing ones and fail if any are absent.
+check_all_auto_domains_up()
+{
+    if ! contains_something "$XENDOMAINS_AUTO"
+    then
+      return 0
+    fi
+    missing=
+    for nm in $XENDOMAINS_AUTO/*; do
+       rdname $nm
+       # Echo $NM (the name we actually checked); $name is loop scratch
+       # left behind by check_domain_up.  The dead "found=0" is removed.
+       if check_domain_up "$NM"; then 
+           echo -n " $NM"
+       else 
+           missing="$missing $NM"
+       fi
+    done
+    if test -n "$missing"; then
+       echo -n " MISS AUTO:$missing"
+       return 1
+    fi
+    return 0
+}
+
+# check_all_saved_domains_up
+# Any file still present in XENDOMAINS_SAVE is a domain that was saved but
+# not successfully restored; report those as missing.
+check_all_saved_domains_up()
+{
+    contains_something "$XENDOMAINS_SAVE" || return 0
+    missing=`/bin/ls $XENDOMAINS_SAVE`
+    echo -n " MISS SAVED: " $missing
+    return 1
+}
+
+# This does NOT necessarily restart all running domains: instead it
+# stops all running domains and then boots all the domains specified in
+# AUTODIR.  If other domains have been started manually then they will
+# not get restarted.
+# Commented out to avoid confusion!
+
+restart()
+{
+    stop
+    start
+}
+
+# reload is an alias for restart; there is no lighter-weight reload here.
+reload()
+{
+    restart
+}
+
+
+# Standard init-script entry point: dispatch on the first argument.
+# rc_status / rc_failed / rc_exit come from the distro's rc helper library.
+case "$1" in
+    start)
+       start
+       rc_status
+       # Only report success verbosely if start actually took the lock.
+       if test -f $LOCKFILE; then rc_status -v; fi
+       ;;
+
+    stop)
+       stop
+       rc_status -v
+       ;;
+
+    restart)
+       restart
+       ;;
+    reload)
+       reload
+       ;;
+
+    status)
+       echo -n "Checking for xendomains:" 
+       # No lockfile => service never started: LSB status code 3.
+       if test ! -f $LOCKFILE; then 
+           rc_failed 3
+       else
+           check_all_auto_domains_up
+           rc_status
+           check_all_saved_domains_up
+           rc_status
+       fi
+       rc_status -v
+       ;;
+
+    *)
+       echo "Usage: $0 {start|stop|restart|reload|status}"
+       rc_failed 3
+       rc_status -v
+       ;;
+esac
+
+rc_exit
diff --git a/tools/hotplug/Linux/locking.sh b/tools/hotplug/Linux/locking.sh
new file mode 100644 (file)
index 0000000..6ff58e7
--- /dev/null
@@ -0,0 +1,98 @@
+#
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+#
+# Serialisation
+#
+
+# Lock tuning: spin LOCK_SPINNING_RETRIES times, then back off
+# LOCK_SLEEPTIME seconds per try; steal after LOCK_RETRIES total attempts.
+LOCK_SLEEPTIME=1
+LOCK_SPINNING_RETRIES=5
+LOCK_RETRIES=100
+LOCK_BASEDIR=/var/run/xen-hotplug
+
+
+# claim_lock name — take the named lock (a directory under LOCK_BASEDIR,
+# relying on mkdir's atomicity).  Blocks, and may steal a stale lock.
+claim_lock()
+{
+  local lockdir="$LOCK_BASEDIR/$1"
+  mkdir -p "$LOCK_BASEDIR"
+  _claim_lock "$lockdir"
+}
+
+
+# release_lock name — drop a lock taken by claim_lock.
+release_lock()
+{
+  _release_lock "$LOCK_BASEDIR/$1"
+}
+
+
+# _claim_lock lockdir
+# Try to atomically create $lockdir.  On success, arm an ERR trap so the
+# lock is released if the calling script errors out, and record ownership.
+# While contending: if the owner changes we reset the retry counter (the
+# lock is live, not stale); after LOCK_RETRIES attempts we steal it.
+_claim_lock()
+{
+  local lockdir="$1"
+  local owner=$(_lock_owner "$lockdir")
+  local retries=0
+
+  while [ $retries -lt $LOCK_RETRIES ]
+  do
+    mkdir "$lockdir" 2>/dev/null && trap "release_lock $1; sigerr" ERR &&
+      _update_lock_info "$lockdir" && return
+
+    local new_owner=$(_lock_owner "$lockdir")
+    if [ "$new_owner" != "$owner" ]
+    then
+      owner="$new_owner"
+      retries=0
+    fi
+
+    # Spin briefly first; fall back to sleeping once spinning is exhausted.
+    if [ $retries -gt $LOCK_SPINNING_RETRIES ]
+    then
+      sleep $LOCK_SLEEPTIME
+    else
+      sleep 0
+    fi
+    retries=$(($retries + 1))
+  done
+  _steal_lock "$lockdir"
+}
+
+
+# _release_lock lockdir — disarm the release trap and remove the lock dir.
+_release_lock()
+{
+  trap sigerr ERR
+  rm -rf "$1" 2>/dev/null || true
+}
+
+
+# _steal_lock lockdir — forcibly take a lock that appears abandoned:
+# log the previous owner, remove the dir, then claim it normally.
+_steal_lock()
+{
+  local lockdir="$1"
+  local owner=$(cat "$lockdir/owner" 2>/dev/null || echo "unknown")
+  log err "Forced to steal lock on $lockdir from $owner!"
+  _release_lock "$lockdir"
+  _claim_lock "$lockdir"
+}
+
+
+# _lock_owner lockdir — print the "pid: script" owner record, or "unknown".
+_lock_owner()
+{
+  local ownerfile="$1/owner"
+  cat "$ownerfile" 2>/dev/null || echo "unknown"
+}
+
+
+# _update_lock_info lockdir — record our pid and script name so a later
+# thief can log who held the lock.
+_update_lock_info()
+{
+  local lockdir="$1"
+  echo "$$: $0" >"$lockdir/owner"
+}
diff --git a/tools/hotplug/Linux/logging.sh b/tools/hotplug/Linux/logging.sh
new file mode 100644 (file)
index 0000000..c1bc699
--- /dev/null
@@ -0,0 +1,22 @@
+#
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+# log level message...
+# Send a message to syslog at daemon.<level>; if logger is unavailable or
+# fails, fall back to writing the message to stderr.
+log() {
+  local level="$1"
+  shift
+  logger -p "daemon.$level" -- "$0:" "$@" || echo "$0 $@" >&2
+}
diff --git a/tools/hotplug/Linux/network-bridge b/tools/hotplug/Linux/network-bridge
new file mode 100644 (file)
index 0000000..9d7be4e
--- /dev/null
@@ -0,0 +1,310 @@
+#!/bin/bash
+#============================================================================
+# Default Xen network start/stop script.
+# Xend calls a network script when it starts.
+# The script name to use is defined in /etc/xen/xend-config.sxp
+# in the network-script field.
+#
+# This script creates a bridge (default ${netdev}), adds a device
+# (defaults to the device on the default gateway route) to it, copies
+# the IP addresses from the device to the bridge and adjusts the routes
+# accordingly.
+#
+# If all goes well, this should ensure that networking stays up.
+# However, some configurations are upset by this, especially
+# NFS roots. If the bridged setup does not meet your needs,
+# configure a different script, for example using routing instead.
+#
+# Usage:
+#
+# network-bridge (start|stop|status) {VAR=VAL}*
+#
+# Vars:
+#
+# bridge     The bridge to use (default ${netdev}).
+# netdev     The interface to add to the bridge (default gateway device).
+# antispoof  Whether to use iptables to prevent spoofing (default no).
+#
+# Internal Vars:
+# pdev="p${netdev}"
+# tdev=tmpbridge
+#
+# start:
+# Creates the bridge as tdev
+# Copies the IP and MAC addresses from pdev to bridge
+# Renames netdev to be pdev 
+# Renames tdev to bridge
+# Enslaves pdev to bridge
+#
+# stop:
+# Removes pdev from the bridge
+# Transfers addresses, routes from bridge to pdev
+# Renames bridge to tdev
+# Renames pdev to netdev 
+# Deletes tdev
+#
+# status:
+# Print addresses, interfaces, routes
+#
+#============================================================================
+
+
+dir=$(dirname "$0")
+. "$dir/xen-script-common.sh"
+. "$dir/xen-network-common.sh"
+
+findCommand "$@"
+evalVariables "$@"
+
+is_network_root () {
+    local rootfs=$(awk '{ if ($1 !~ /^[ \t]*#/ && $2 == "/") { print $3; }}' /etc/mtab)
+    local rootopts=$(awk '{ if ($1 !~ /^[ \t]*#/ && $2 == "/") { print $4; }}' /etc/mtab)
+
+    [[ "$rootfs" =~ "^nfs" ]] || [[ "$rootopts" =~ "_netdev" ]] && has_nfsroot=1 || has_nfsroot=0
+    if [ $has_nfsroot -eq 1 ]; then
+        local bparms=$(cat /proc/cmdline)
+        for p in $bparms; do
+            local ipaddr=$(echo $p | awk /nfsroot=/'{ print substr($1,9,index($1,":")-9) }')
+            if [ "$ipaddr" != "" ]; then
+                local nfsdev=$(ip route get $ipaddr | awk /$ipaddr/'{ print $3 }')
+                [[ "$nfsdev" == "$netdev" ]] && return 0 || return 1
+            fi
+        done
+    fi
+    return 1
+}
+
+find_alt_device () {
+    local interf=$1
+    local prefix=${interf%[[:digit:]]}
+    local ifs=$(ip link show | grep " $prefix" |\
+                gawk '{ printf ("%s",substr($2,1,length($2)-1)) }' |\
+                sed s/$interf//)
+    echo "$ifs"
+}
+
+# Default netdev to the device carrying the default route.
+netdev=${netdev:-$(ip route list 0.0.0.0/0  | \
+                   sed 's/.*dev \([a-z]\+[0-9]\+\).*$/\1/')}
+# Never bridge the device a network root is mounted over; pick a sibling
+# interface instead, or bail out if there is none.
+if is_network_root ; then
+    altdevs=$(find_alt_device $netdev)
+    for netdev in $altdevs; do break; done
+    if [ -z "$netdev" ]; then
+        [ -x /usr/bin/logger ] && /usr/bin/logger "network-bridge: bridging not supported on network root; not starting"
+        exit
+    fi
+fi
+netdev=${netdev:-eth0}
+bridge=${bridge:-${netdev}}
+antispoof=${antispoof:-no}
+
+# pdev: the physical device's name after renaming; tdev: scratch bridge name.
+pdev="p${netdev}"
+tdev=tmpbridge
+
+get_ip_info() {
+    addr_pfx=`ip addr show dev $1 | egrep '^ *inet' | sed -e 's/ *inet //' -e 's/ .*//'`
+    gateway=`ip route show dev $1 | fgrep default | sed 's/default via //'`
+}
+    
+# do_ifup dev — bring $dev up with ifup; if that fails, replay the address
+# and gateway previously captured by get_ip_info() by hand.
+do_ifup() {
+    if ! ifup $1 ; then
+        if [ -n "$addr_pfx" ] ; then
+            # use the info from get_ip_info()
+            ip addr flush $1
+            ip addr add ${addr_pfx} dev $1
+            ip link set dev $1 up
+            [ -n "$gateway" ] && ip route add default via ${gateway}
+        fi
+    fi
+}
+
+# Usage: transfer_addrs src dst
+# Copy all IP addresses (including aliases) from device $src to device $dst.
+transfer_addrs () {
+    local src=$1
+    local dst=$2
+    # Don't bother if $dst already has IP addresses.
+    if ip addr show dev ${dst} | egrep -q '^ *inet ' ; then
+        return
+    fi
+    # Address lines start with 'inet' and have the device in them.
+    # Replace 'inet' with 'ip addr add' and change the device name $src
+    # to 'dev $src'.  The edited lines are then executed via "sh -e".
+    ip addr show dev ${src} | egrep '^ *inet ' | sed -e "
+s/inet/ip addr add/
+s@\([0-9]\+\.[0-9]\+\.[0-9]\+\.[0-9]\+/[0-9]\+\)@\1@
+s/${src}/dev ${dst} label ${dst}/
+s/secondary//
+" | sh -e
+    # Remove automatic routes on destination device
+    ip route list | sed -ne "
+/dev ${dst}\( \|$\)/ {
+  s/^/ip route del /
+  p
+}" | sh -e
+}
+
+# Usage: transfer_routes src dst
+# Get all IP routes to device $src, delete them, and
+# add the same routes to device $dst.
+# The original routes have to be deleted, otherwise adding them
+# for $dst fails (duplicate routes).
+transfer_routes () {
+    local src=$1
+    local dst=$2
+    # List all routes and grep the ones with $src in.
+    # Stick 'ip route del' on the front to delete.
+    # Change $src to $dst and use 'ip route add' to add.
+    # (sed hold space keeps the original line so each route yields both
+    # a delete and an add command, executed by "sh -e".)
+    ip route list | sed -ne "
+/dev ${src}\( \|$\)/ {
+  h
+  s/^/ip route del /
+  P
+  g
+  s/${src}/${dst}/
+  s/^/ip route add /
+  P
+  d
+}" | sh -e
+}
+
+
+##
+# link_exists interface
+#
+# Returns 0 if the interface named exists (whether up or down), 1 otherwise.
+#
+link_exists()
+{
+    if ip link show "$1" >/dev/null 2>/dev/null
+    then
+        return 0
+    else
+        return 1
+    fi
+}
+
+# Set the default forwarding policy for $dev to drop.
+# Allow forwarding to the bridge.
+# NOTE: flushes the whole FORWARD chain first, discarding existing rules.
+antispoofing () {
+    iptables -P FORWARD DROP
+    iptables -F FORWARD
+    iptables -A FORWARD -m physdev --physdev-in ${pdev} -j ACCEPT
+}
+
+# Usage: show_status dev bridge
+# Print ifconfig and routes.
+show_status () {
+    local dev=$1
+    local bridge=$2
+    
+    echo '============================================================'
+    ip addr show ${dev}
+    ip addr show ${bridge}
+    echo ' '
+    brctl show ${bridge}
+    echo ' '
+    ip route list
+    echo ' '
+    route -n
+    echo '============================================================'
+}
+
+# op_start: build the bridge by the rename dance described in the header:
+# create a temporary bridge, move addresses over, rename netdev -> pdev and
+# the temporary bridge -> bridge, then enslave pdev to the bridge.
+op_start () {
+    # bridge=null is the documented way to disable this script.
+    if [ "${bridge}" = "null" ] ; then
+       return
+    fi
+
+    if link_exists "$pdev"; then
+        # The device is already up.
+        return
+    fi
+
+    create_bridge ${tdev}
+
+    preiftransfer ${netdev}
+    transfer_addrs ${netdev} ${tdev}
+    if ! ifdown ${netdev}; then
+       # If ifdown fails, remember the IP details.
+       get_ip_info ${netdev}
+       ip link set ${netdev} down
+       ip addr flush ${netdev}
+    fi
+    ip link set ${netdev} name ${pdev}
+    ip link set ${tdev} name ${bridge}
+
+    setup_bridge_port ${pdev}
+
+    add_to_bridge2 ${bridge} ${pdev}
+    do_ifup ${bridge}
+
+    if [ ${antispoof} = 'yes' ] ; then
+       antispoofing
+    fi
+}
+
+# op_stop: undo op_start — move addresses and routes back from the bridge
+# to the physical device, restore the original device name, and delete the
+# bridge.
+op_stop () {
+    if [ "${bridge}" = "null" ]; then
+       return
+    fi
+    if ! link_exists "$bridge"; then
+       return
+    fi
+
+    transfer_addrs ${bridge} ${pdev}
+    if ! ifdown ${bridge}; then
+       get_ip_info ${bridge}
+    fi
+    ip link set ${pdev} down
+    ip addr flush ${bridge}
+
+    brctl delif ${bridge} ${pdev}
+    ip link set ${bridge} down
+
+    # Rename back: bridge -> tdev (then deleted), pdev -> original netdev.
+    ip link set ${bridge} name ${tdev}
+    ip link set ${pdev} name ${netdev}
+    do_ifup ${netdev}
+
+    brctl delbr ${tdev}
+}
+
+# adds $dev to $bridge but waits for $dev to be in running state first
+add_to_bridge2() {
+    local bridge=$1
+    local dev=$2
+    local maxtries=10
+
+    echo -n "Waiting for ${dev} to negotiate link."
+    ip link set ${dev} up
+    for i in `seq ${maxtries}` ; do
+       if ifconfig ${dev} | grep -q RUNNING ; then
+           break
+       else
+           echo -n '.'
+           sleep 1
+       fi
+    done
+
+    if [ ${i} -eq ${maxtries} ] ; then echo -n '(link isnt in running state)' ; fi
+    echo
+
+    add_to_bridge ${bridge} ${dev}
+}
+
+# Dispatch on $command, which findCommand extracted from the arguments.
+case "$command" in
+    start)
+       op_start
+       ;;
+    
+    stop)
+       op_stop
+       ;;
+
+    status)
+       show_status ${netdev} ${bridge}
+       ;;
+
+    *)
+       echo "Unknown command: $command" >&2
+       echo 'Valid commands are: start, stop, status' >&2
+       exit 1
+esac
diff --git a/tools/hotplug/Linux/network-nat b/tools/hotplug/Linux/network-nat
new file mode 100644 (file)
index 0000000..d9c62c6
--- /dev/null
@@ -0,0 +1,119 @@
+#!/bin/bash -x
+#============================================================================
+# Default Xen network start/stop script when using NAT.
+# Xend calls a network script when it starts.
+# The script name to use is defined in /etc/xen/xend-config.sxp
+# in the network-script field.
+#
+# Usage:
+#
+# network-nat (start|stop|status) {VAR=VAL}*
+#
+# Vars:
+#
+# netdev     The gateway interface (default eth0).
+# antispoof  Whether to use iptables to prevent spoofing (default no).
+# dhcp       Whether to alter the local DHCP configuration (default no).
+#
+#============================================================================
+
+dir=$(dirname "$0")
+. "$dir/xen-script-common.sh"
+. "$dir/xen-network-common.sh"
+
+# findCommand/evalVariables parse "command" and VAR=VAL pairs from argv.
+findCommand "$@"
+evalVariables "$@"
+
+netdev=${netdev:-eth0}
+# antispoofing not yet implemented
+antispoof=${antispoof:-no}
+
+# turn on dhcp feature by default if dhcpd is installed
+if [ -f /etc/dhcpd.conf ]
+then
+       dhcp=${dhcp:-yes}
+else
+       dhcp=${dhcp:-no}
+fi
+
+
+# If we are going to touch the DHCP setup, locate dhcpd's config and init
+# files up front (helpers come from xen-network-common.sh) and fail early.
+if [ "$dhcp" != 'no' ]
+then
+  dhcpd_conf_file=$(find_dhcpd_conf_file)
+  dhcpd_init_file=$(find_dhcpd_init_file)
+  if [ -z "$dhcpd_conf_file" ] || [ -z "$dhcpd_init_file" ]
+  then
+    echo 'Failed to find dhcpd configuration or init file.' >&2
+    exit 1
+  fi
+fi
+
+
+# dhcp_start: make sure dhcpd serves the NAT 10.0.0.0/16 subnet, then
+# restart it to pick up the change.
+function dhcp_start()
+{
+  if ! grep -q "subnet 10.0.0.0" "$dhcpd_conf_file"
+  then
+    echo >>"$dhcpd_conf_file" "subnet 10.0.0.0 netmask 255.255.0.0 {}"
+  fi
+
+  "$dhcpd_init_file" restart
+}
+
+
+# dhcp_stop: remove the NAT subnet stanza again (keeping the original file
+# untouched if nothing would change), then restart dhcpd.
+function dhcp_stop()
+{
+  local tmpfile=$(mktemp)
+  grep -v "subnet 10.0.0.0" "$dhcpd_conf_file" >"$tmpfile"
+  # diff succeeds when the files are identical — nothing to strip.
+  if diff "$tmpfile" "$dhcpd_conf_file" >&/dev/null
+  then
+    rm "$tmpfile"
+  else
+    mv "$tmpfile" "$dhcpd_conf_file"
+  fi
+
+  "$dhcpd_init_file" restart
+}
+
+
+# op_start: enable IPv4 forwarding and masquerade guest traffic leaving
+# through $netdev; optionally add the DHCP subnet.
+op_start() {
+       echo 1 >/proc/sys/net/ipv4/ip_forward
+       iptables -t nat -A POSTROUTING -o ${netdev} -j MASQUERADE
+        [ "$dhcp" != 'no' ] && dhcp_start
+}
+
+
+# op_stop: remove the masquerade rule (forwarding is left enabled) and
+# undo any DHCP change.
+op_stop() {
+        [ "$dhcp" != 'no' ] && dhcp_stop
+       iptables -t nat -D POSTROUTING -o ${netdev} -j MASQUERADE
+}
+
+
+# Print interfaces and routing tables for "status".
+show_status() {
+    echo '============================================================'
+    ifconfig
+    echo ' '
+    ip route list
+    echo ' '
+    route -n
+    echo '============================================================'
+
+}
+
+case "$command" in
+    start)
+        op_start
+        ;;
+    
+    stop)
+        op_stop
+        ;;
+
+    status)
+        show_status
+       ;;
+
+    *)
+       echo "Unknown command: $command" >&2
+       echo 'Valid commands are: start, stop, status' >&2
+       exit 1
+esac
diff --git a/tools/hotplug/Linux/network-route b/tools/hotplug/Linux/network-route
new file mode 100644 (file)
index 0000000..574441e
--- /dev/null
@@ -0,0 +1,27 @@
+#!/bin/bash
+#============================================================================
+# Default Xen network start/stop script.
+# Xend calls a network script when it starts.
+# The script name to use is defined in /etc/xen/xend-config.sxp
+# in the network-script field.
+#
+# Usage:
+#
+# network-route (start|stop|status) {VAR=VAL}*
+#
+# Vars:
+#
+# netdev     The gateway interface (default eth0).
+# antispoof  Whether to use iptables to prevent spoofing (default yes).
+#
+#============================================================================
+
+dir=$(dirname "$0")
+. "$dir/xen-script-common.sh"
+
+evalVariables "$@"
+
+# NOTE(review): the header documents eth0 as the default, but $vifnum is
+# only set if passed as vifnum=N on the command line; without it this
+# falls back to the bare string "eth" — confirm callers always pass vifnum.
+netdev=${netdev:-eth${vifnum}}
+
+# Enable IPv4 forwarding and proxy ARP on the gateway device.
+echo 1 >/proc/sys/net/ipv4/ip_forward
+echo 1 >/proc/sys/net/ipv4/conf/${netdev}/proxy_arp
diff --git a/tools/hotplug/Linux/vif-bridge b/tools/hotplug/Linux/vif-bridge
new file mode 100644 (file)
index 0000000..1b698d7
--- /dev/null
@@ -0,0 +1,100 @@
+#!/bin/bash
+#============================================================================
+# /etc/xen/vif-bridge
+#
+# Script for configuring a vif in bridged mode.
+# The hotplugging system will call this script if it is specified either in
+# the device configuration given to Xend, or the default Xend configuration
+# in /etc/xen/xend-config.sxp.  If the script is specified in neither of those
+# places, then this script is the default.
+#
+# Usage:
+# vif-bridge (add|remove|online|offline)
+#
+# Environment vars:
+# vif         vif interface name (required).
+# XENBUS_PATH path to this device's details in the XenStore (required).
+#
+# Read from the store:
+# bridge  bridge to add the vif to (optional).  Defaults to searching for the
+#         bridge itself.
+# ip      list of IP networks for the vif, space-separated (optional).
+#
+# up:
+# Enslaves the vif interface to the bridge and adds iptables rules
+# for its ip addresses (if any).
+#
+# down:
+# Removes the vif interface from the bridge and removes the iptables
+# rules for its ip addresses (if any).
+#============================================================================
+
+dir=$(dirname "$0")
+. "$dir/vif-common.sh"
+
+# Bridge to use: explicit parameter, then the XenStore entry, then the
+# first bridge brctl reports.
+bridge=${bridge:-}
+bridge=$(xenstore_read_default "$XENBUS_PATH/bridge" "$bridge")
+
+if [ -z "$bridge" ]
+then
+  # brctl's output is columnar; take the first name from its second line.
+  bridge=$(brctl show | cut -d "
+" -f 2 | cut -f 1)
+
+  if [ -z "$bridge" ]
+  then
+     fatal "Could not find bridge, and none was specified"
+  fi
+else
+  #
+  # Old style bridge setup with netloop, used to have a bridge name
+  # of xenbrX, enslaving pethX and vif0.X, and then configuring
+  # eth0.
+  #
+  # New style bridge setup does not use netloop, so the bridge name
+  # is ethX and the physical device is enslaved pethX
+  #
+  # So if...
+  #
+  #   - User asks for xenbrX
+  #   - AND xenbrX doesn't exist
+  #   - AND there is a ethX device which is a bridge
+  #
+  # ..then we translate xenbrX to ethX
+  #
+  # This lets old config files work without modification
+  #
+  if [ ! -e "/sys/class/net/$bridge" ] && [ -z "${bridge##xenbr*}" ]
+  then
+     if [ -e "/sys/class/net/eth${bridge#xenbr}/bridge" ]
+     then
+        bridge="eth${bridge#xenbr}"
+     fi
+  fi
+fi
+
+# The chosen bridge must actually exist before we try to enslave the vif.
+RET=0
+ip link show $bridge 1>/dev/null 2>&1 || RET=1
+if [ "$RET" -eq 1 ]
+then
+    fatal "Could not find bridge device $bridge"
+fi
+
+case "$command" in
+    online)
+       setup_bridge_port "$vif"
+       add_to_bridge "$bridge" "$vif"
+        ;;
+
+    offline)
+        do_without_error brctl delif "$bridge" "$vif"
+        do_without_error ifconfig "$vif" down
+        ;;
+esac
+
+# Install/remove the per-vif iptables rules (from vif-common.sh).
+handle_iptable
+
+log debug "Successful vif-bridge $command for $vif, bridge $bridge."
+if [ "$command" == "online" ]
+then
+  success
+fi
diff --git a/tools/hotplug/Linux/vif-common.sh b/tools/hotplug/Linux/vif-common.sh
new file mode 100644 (file)
index 0000000..ee67ee2
--- /dev/null
@@ -0,0 +1,151 @@
+#
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+. "$dir/xen-network-common.sh"
+
+findCommand "$@"
+
+# Only the four hotplug events are valid for vif scripts.
+if [ "$command" != "online" ]  &&
+   [ "$command" != "offline" ] &&
+   [ "$command" != "add" ]     &&
+   [ "$command" != "remove" ]
+then
+  log err "Invalid command: $command"
+  exit 1
+fi
+
+# add/remove need no vif configuration work; succeed immediately.
+case "$command" in
+    add | remove)
+        exit 0
+        ;;
+esac
+
+
+# Parameters may be read from the environment, the command line arguments, and
+# the store, with overriding in that order.  The environment is given by the
+# driver, the command line is given by the Xend global configuration, and
+# store details are given by the per-domain or per-device configuration.
+
+evalVariables "$@"
+
+ip=${ip:-}
+ip=$(xenstore_read_default "$XENBUS_PATH/ip" "$ip")
+
+# Check presence of compulsory args.
+XENBUS_PATH="${XENBUS_PATH:?}"
+vif="${vif:?}"
+
+
+# If the guest config asked for a custom vif name, rename the interface
+# (only when bringing it online and the target name is still free).
+vifname=$(xenstore_read_default "$XENBUS_PATH/vifname" "")
+if [ "$vifname" ]
+then
+  if [ "$command" == "online" ] && ! ip link show "$vifname" >&/dev/null
+  then
+    do_or_die ip link set "$vif" name "$vifname"
+  fi
+  vif="$vifname"
+fi
+
+
+# frob_iptable [extra iptables match args...]
+# Add (online) or delete (otherwise) a FORWARD ACCEPT rule for packets
+# entering through this vif.  Deletion failures are ignored; addition
+# failures are logged since they can break guest networking.
+frob_iptable()
+{
+  if [ "$command" == "online" ]
+  then
+    local c="-A"
+  else
+    local c="-D"
+  fi
+
+  iptables "$c" FORWARD -m physdev --physdev-in "$vif" "$@" -j ACCEPT \
+    2>/dev/null ||
+    [ "$c" == "-D" ] ||
+    log err \
+     "iptables $c FORWARD -m physdev --physdev-in $vif $@ -j ACCEPT failed.
+If you are using iptables, this may affect networking for guest domains."
+}
+
+
+##
+# Add or remove the appropriate entries in the iptables.  With antispoofing
+# turned on, we have to explicitly allow packets to the interface, regardless
+# of the ip setting.  If ip is set, then we additionally restrict the packets
+# to those coming from the specified networks, though we allow DHCP requests
+# as well.
+#
+handle_iptable()
+{
+  # Check for a working iptables installation.  Checking for the iptables
+  # binary is not sufficient, because the user may not have the appropriate
+  # modules installed.  If iptables is not working, then there's no need to do
+  # anything with it, so we can just return.
+  if ! iptables -L -n >&/dev/null
+  then
+    return
+  fi
+
+  if [ "$ip" != "" ]
+  then
+      # Restrict the vif to its configured source networks...
+      local addr
+      for addr in $ip
+      do
+        frob_iptable -s "$addr"
+      done
+
+      # Always allow the domain to talk to a DHCP server.
+      frob_iptable -p udp --sport 68 --dport 67
+  else
+      # No IP addresses have been specified, so allow anything.
+      frob_iptable
+  fi
+}
+
+
+##
+# ip_of interface
+#
+# Print the IP address currently in use at the given interface, or nothing if
+# the interface is not up.
+#
+ip_of()
+{
+  # First inet line for $1, address only (strip the /prefix); prints
+  # nothing when the interface has no IPv4 address.
+  ip addr show "$1" | awk "/^.*inet.*$1\$/{print \$2}" | sed -n '1 s,/.*,,p'
+}
+
+
+##
+# dom0_ip
+#
+# Print the IP address of the interface in dom0 through which we are routing.
+# This is the IP address on the interface specified as "netdev" as a parameter
+# to these scripts, or eth0 by default.  This function will call fatal if no
+# such interface could be found.
+#
+# dom0_ip
+# Print the IP address of the dom0 interface we route through: $netdev if
+# given as a parameter, eth0 otherwise.  Calls fatal when that interface
+# has no address.
+dom0_ip()
+{
+  local nd=${netdev:-eth0}
+  local result=$(ip_of "$nd")
+  if [ -z "$result" ]
+  then
+      # Name $nd (the interface actually probed); $netdev may be unset,
+      # which previously produced a confusing empty name in the message.
+      fatal
+"$nd is not up.  Bring it up or specify another interface with " \
+"netdev=<if> as a parameter to $0."
+  fi
+  echo "$result"
+}
diff --git a/tools/hotplug/Linux/vif-nat b/tools/hotplug/Linux/vif-nat
new file mode 100644 (file)
index 0000000..75bdf5c
--- /dev/null
@@ -0,0 +1,192 @@
+#!/bin/bash
+#============================================================================
+# /etc/xen/vif-nat
+#
+# Script for configuring a vif in routed-nat mode.
+# The hotplugging system will call this script if it is specified either in
+# the device configuration given to Xend, or the default Xend configuration
+# in /etc/xen/xend-config.sxp.  If the script is specified in neither of those
+# places, then vif-bridge is the default.
+#
+# Usage:
+# vif-nat (add|remove|online|offline)
+#
+# Environment vars:
+# vif         vif interface name (required).
+# XENBUS_PATH path to this device's details in the XenStore (required).
+#
+# Parameters:
+# dhcp        Whether to alter the local DHCP configuration to include this
+#             new host (default no).
+#
+# Read from the store:
+# ip      list of IP networks for the vif, space-separated (default given in
+#         this script).
+#============================================================================
+
+
+dir=$(dirname "$0")
+. "$dir/vif-common.sh"
+
+# turn on dhcp feature by default if dhcpd is installed
+if [ -f /etc/dhcpd.conf ]
+then
+	dhcp=${dhcp:-yes}
+else
+	dhcp=${dhcp:-no}
+fi
+
+# When DHCP integration is enabled we need all three pieces of the local
+# dhcpd installation; bail out early if any of them is missing.
+if [ "$dhcp" != 'no' ]
+then
+  dhcpd_conf_file=$(find_dhcpd_conf_file)
+  dhcpd_init_file=$(find_dhcpd_init_file)
+  dhcpd_arg_file=$(find_dhcpd_arg_file)
+  if [ -z "$dhcpd_conf_file" ] || [ -z "$dhcpd_init_file" ] || [ -z "$dhcpd_arg_file" ]
+  then
+    echo 'Failed to find dhcpd configuration or init or args file.' >&2
+    exit 1
+  fi
+fi
+
+
+domid=$(xenstore_read "$XENBUS_PATH/frontend-id")
+vifid=$(xenstore_read "$XENBUS_PATH/handle")
+# The xenstore handle is 0-based; use a 1-based index for addressing below.
+vifid=$(( $vifid + 1 ))
+
+
+ip_from_dom()
+{
+  # Synthesise a default guest address from the domain and vif ids:
+  # 10.<domid/256>.<domid%256>.<vifid>/16.
+  echo "10.$(( domid / 256 )).$(( domid % 256 )).$vifid/16"
+}
+
+
+routing_ip()
+{
+  # Router address for the vif: last octet of $1 plus 127.  awk's numeric
+  # coercion makes a trailing "/bits" suffix harmless.
+  echo "$1" | awk -F. '{print $1"."$2"."$3"."$4 + 127}'
+}
+
+
+##
+# dotted_quad number
+#
+# Print the 32-bit integer $1 as a dotted-quad IP address.  Each octet is
+# extracted with shell arithmetic (mask and shift); no external commands.
+#
+dotted_quad()
+{
+ echo\
+ $(( ($1 & 0xFF000000) >> 24))\
+.$(( ($1 & 0x00FF0000) >> 16))\
+.$(( ($1 & 0x0000FF00) >> 8 ))\
+.$((  $1 & 0x000000FF       ))
+}
+
+
+# No addresses supplied: fall back to the 10.x.y.z scheme derived from the
+# domain and vif ids.
+if [ "$ip" = "" ]
+then
+  ip=$(ip_from_dom)
+fi
+
+router_ip=$(routing_ip "$ip")
+
+# Split the given IP/bits pair.
+vif_ip=`echo ${ip} | awk -F/ '{print $1}'`
+
+# Derive a DHCP hostname from the domain name, mapping characters that are
+# not valid in hostnames to '-'.
+hostname=$(xenstore_read "$XENBUS_PATH/domain" | tr -- '_.:/+' '-----')
+if [ "$vifid" != "1" ]
+then
+  hostname="$hostname-$vifid"
+fi
+
+dhcparg_remove_entry()
+{
+  # Strip this vif from dhcpd's interface-argument file, rewriting the
+  # file only when the substitution actually changed something.
+  local scratch=$(mktemp)
+  sed -e "s/$vif //" "$dhcpd_arg_file" >"$scratch"
+  if ! diff "$scratch" "$dhcpd_arg_file" >/dev/null
+  then
+    mv "$scratch" "$dhcpd_arg_file"
+  else
+    rm "$scratch"
+  fi
+}
+
+dhcparg_add_entry()
+{
+  # Start from a clean slate so the vif is never listed twice.
+  dhcparg_remove_entry
+  local tmpfile=$(mktemp)
+  # handle Red Hat, SUSE, and Debian styles, with or without quotes
+  # (DHCPDARGS / DHCPD_INTERFACE / INTERFACES respectively); each sed pass
+  # appends "$vif " inside the variable's quoted value.
+  sed -e 's/^DHCPDARGS="*\([^"]*\)"*/DHCPDARGS="\1'"$vif "'"/' \
+     "$dhcpd_arg_file" >"$tmpfile" && mv "$tmpfile" "$dhcpd_arg_file"
+  sed -e 's/^DHCPD_INTERFACE="*\([^"]*\)"*/DHCPD_INTERFACE="\1'"$vif "'"/' \
+     "$dhcpd_arg_file" >"$tmpfile" && mv "$tmpfile" "$dhcpd_arg_file"
+  sed -e 's/^INTERFACES="*\([^"]*\)"*/INTERFACES="\1'"$vif "'"/' \
+     "$dhcpd_arg_file" >"$tmpfile" && mv "$tmpfile" "$dhcpd_arg_file"
+  # tmpfile only remains if the last sed failed before its mv ran.
+  rm -f "$tmpfile"
+}
+
+dhcp_remove_entry()
+{
+  # Drop this guest's host stanza from dhcpd.conf, rewriting the file only
+  # when the stanza was present, then retire the dhcpd argument entry too.
+  local scratch=$(mktemp)
+  grep -v "host $hostname" "$dhcpd_conf_file" >"$scratch"
+  if ! diff "$scratch" "$dhcpd_conf_file" >/dev/null
+  then
+    mv "$scratch" "$dhcpd_conf_file"
+  else
+    rm "$scratch"
+  fi
+  dhcparg_remove_entry
+}
+
+
+##
+# dhcp_up
+#
+# Register the guest with the local dhcpd: write its host stanza (MAC,
+# fixed address, router, hostname), add the vif to dhcpd's interface list
+# and restart the daemon so it picks the changes up.  All configuration
+# edits happen under the vif-nat-dhcp lock.
+#
+dhcp_up()
+{
+  claim_lock "vif-nat-dhcp"
+  dhcp_remove_entry
+  mac=$(xenstore_read "$XENBUS_PATH/mac")
+  echo >>"$dhcpd_conf_file" \
+"host $hostname { hardware ethernet $mac; fixed-address $vif_ip; option routers $router_ip; option host-name \"$hostname\"; }"
+  dhcparg_add_entry
+  release_lock "vif-nat-dhcp"
+  "$dhcpd_init_file" restart || true
+}
+
+
+##
+# dhcp_down
+#
+# Remove the guest's dhcpd configuration (under the vif-nat-dhcp lock) and
+# restart the daemon.
+#
+dhcp_down()
+{
+  claim_lock "vif-nat-dhcp"
+  dhcp_remove_entry
+  release_lock "vif-nat-dhcp"
+  "$dhcpd_init_file" restart || true # We need to ignore failure because
+                                     # ISC dhcpd 3 borks if there is nothing
+                                     # for it to do, which is the case if
+                                     # the outgoing interface is not
+                                     # configured to offer leases and there
+                                     # are no vifs.
+}
+
+
+case "$command" in
+    online)
+        # Idempotency: if a route for this vif already exists, the
+        # interface was configured by an earlier invocation.
+        if ip route | grep -q "dev $vif"
+        then
+          log debug "$vif already up"
+          exit 0
+        fi
+
+        do_or_die ip link set "$vif" up arp on
+        do_or_die ip addr add "$router_ip" dev "$vif"
+        do_or_die ip route add "$vif_ip" dev "$vif" src "$router_ip"
+        echo 1 >/proc/sys/net/ipv4/conf/${vif}/proxy_arp
+        # Use an if-statement rather than '[ ... ] && dhcp_up': with
+        # dhcp=no the &&-list returns non-zero, which aborts the script
+        # under set -e before handle_iptable/success can run.
+        if [ "$dhcp" != 'no' ]
+        then
+          dhcp_up
+        fi
+        ;;
+    offline)
+        if [ "$dhcp" != 'no' ]
+        then
+          dhcp_down
+        fi
+        do_without_error ifconfig "$vif" down
+        ;;
+esac
+
+
+handle_iptable
+
+log debug "Successful vif-nat $command for $vif."
+if [ "$command" = "online" ]
+then
+  success
+fi
diff --git a/tools/hotplug/Linux/vif-route b/tools/hotplug/Linux/vif-route
new file mode 100644 (file)
index 0000000..f5fd88e
--- /dev/null
@@ -0,0 +1,56 @@
+#!/bin/bash
+#============================================================================
+# /etc/xen/vif-route
+#
+# Script for configuring a vif in routed mode.
+# The hotplugging system will call this script if it is specified either in
+# the device configuration given to Xend, or the default Xend configuration
+# in /etc/xen/xend-config.sxp.  If the script is specified in neither of those
+# places, then vif-bridge is the default.
+#
+# Usage:
+# vif-route (add|remove|online|offline)
+#
+# Environment vars:
+# vif         vif interface name (required).
+# XENBUS_PATH path to this device's details in the XenStore (required).
+#
+# Read from the store:
+# ip      list of IP networks for the vif, space-separated (default given in
+#         this script).
+#============================================================================
+
+dir=$(dirname "$0")
+. "$dir/vif-common.sh"
+
+# Address on the dom0 interface we route through (netdev param, eth0 default).
+main_ip=$(dom0_ip)
+
+case "$command" in
+    online)
+        # Bring the vif up with dom0's own address and answer ARP on the
+        # guest's behalf via proxy_arp.
+        ifconfig ${vif} ${main_ip} netmask 255.255.255.255 up
+        echo 1 >/proc/sys/net/ipv4/conf/${vif}/proxy_arp
+        ipcmd='add'
+        cmdprefix=''
+        ;;
+    offline)
+        do_without_error ifdown ${vif}
+        ipcmd='del'
+        # Route removal is best-effort on teardown.
+        cmdprefix='do_without_error'
+        ;;
+esac
+
+if [ "${ip}" ] ; then
+    # If we've been given a list of IP addresses, then add routes from dom0 to
+    # the guest using those addresses.
+    for addr in ${ip} ; do
+      ${cmdprefix} ip route ${ipcmd} ${addr} dev ${vif} src ${main_ip}
+    done 
+fi
+
+handle_iptable
+
+log debug "Successful vif-route $command for $vif."
+if [ "$command" = "online" ]
+then
+  success
+fi
diff --git a/tools/hotplug/Linux/vscsi b/tools/hotplug/Linux/vscsi
new file mode 100644 (file)
index 0000000..5ac2614
--- /dev/null
@@ -0,0 +1,22 @@
+#!/bin/sh
+#
+# Copyright (c) 2007, FUJITSU Limited
+# Based on the block scripts code.
+#
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+
+# Parse the hotplug action (add/remove/...) into $command.
+findCommand "$@"
+
+case "$command" in
+	add)
+		# Nothing to configure for vscsi; just report readiness.
+		success
+		;;
+	remove)
+		# TODO
+		exit 0
+		;;
+esac
+
+exit 0
diff --git a/tools/hotplug/Linux/vtpm b/tools/hotplug/Linux/vtpm
new file mode 100644 (file)
index 0000000..38a4532
--- /dev/null
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+dir=$(dirname "$0")
+. "$dir/vtpm-hotplug-common.sh"
+
+# Set by the vtpm_* helpers when a manager operation fails.
+vtpm_fatal_error=0
+
+case "$command" in
+  add)
+    vtpm_create_instance
+  ;;
+  remove)
+    vtpm_remove_instance
+  ;;
+esac
+
+# Report the outcome back to xend via the xenstore hotplug-status node.
+if [ $vtpm_fatal_error -eq 0 ]; then
+	log debug "Successful vTPM operation '$command'."
+	success
+else
+	fatal "Error while executing vTPM operation '$command'."
+fi
diff --git a/tools/hotplug/Linux/vtpm-common.sh b/tools/hotplug/Linux/vtpm-common.sh
new file mode 100644 (file)
index 0000000..a45868e
--- /dev/null
@@ -0,0 +1,448 @@
+#
+# Copyright (c) 2005 IBM Corporation
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+dir=$(dirname "$0")
+. "$dir/logging.sh"
+. "$dir/locking.sh"
+
+VTPMDB="/var/vtpm/vtpm.db"
+
+#In the vtpm-impl file some commands should be defined:
+#      vtpm_create, vtpm_setup, vtpm_start, etc. (see below)
+if [ -r "$dir/vtpm-impl.alt" ]; then
+       . "$dir/vtpm-impl.alt"
+elif [ -r "$dir/vtpm-impl" ]; then
+       . "$dir/vtpm-impl"
+else
+       function vtpm_create () {
+               true
+       }
+       function vtpm_setup() {
+               true
+       }
+       function vtpm_start() {
+               true
+       }
+       function vtpm_suspend() {
+               true
+       }
+       function vtpm_resume() {
+               true
+       }
+       function vtpm_delete() {
+               true
+       }
+       function vtpm_migrate() {
+               echo "Error: vTPM migration accross machines not implemented."
+       }
+       function vtpm_migrate_local() {
+               echo "Error: local vTPM migration not supported"
+       }
+       function vtpm_migrate_recover() {
+               true
+       }
+fi
+
+
+#Find the instance number for the vtpm given the name of the domain
+# Parameters
+# - vmname : the name of the vm
+# Return value
+#  Returns '0' if instance number could not be found, otherwise
+#  it returns the instance number in the variable 'instance'
+function vtpmdb_find_instance () {
+	local vmname ret instance
+	vmname=$1
+	ret=0
+
+	# Scan the db, skipping '#' comment lines; print the instance column
+	# of the first row whose name column matches exactly.
+	instance=$(cat $VTPMDB |                   \
+	          awk -vvmname=$vmname             \
+	          '{                               \
+	             if ( 1 != index($1,"#")) {    \
+	               if ( $1 == vmname ) {       \
+	                 print $2;                 \
+	                 exit;                     \
+	               }                           \
+	             }                             \
+	           }')
+	if [ "$instance" != "" ]; then
+		ret=$instance
+	fi
+	echo "$ret"
+}
+
+
+# Check whether a particular instance number is still available
+# returns "0" if it is not available, "1" otherwise.
+function vtpmdb_is_free_instancenum () {
+	local instance instances avail i
+	instance=$1
+	avail=1
+	#Allowed instance number range: 1-255
+	if [ $instance -eq 0 -o $instance -gt 255 ]; then
+		avail=0
+	else
+		# Collect every instance number already recorded in the db
+		# (column 2 of non-comment lines).
+		instances=$(cat $VTPMDB |                \
+		           gawk                          \
+		           '{                            \
+		               if (1 != index($1,"#")) { \
+		                 printf("%s ",$2);       \
+		               }                         \
+		            }')
+		for i in $instances; do
+			if [ $i -eq $instance ]; then
+				avail=0
+				break
+			fi
+		done
+	fi
+	echo "$avail"
+}
+
+
+# Get an available instance number given the database
+# Returns an unused instance number
+function vtpmdb_get_free_instancenum () {
+	local ctr instances don found
+	# All instance numbers currently in use (column 2 of non-comment lines).
+	instances=$(cat $VTPMDB |                \
+	           gawk                          \
+	           '{                            \
+	               if (1 != index($1,"#")) { \
+	                 printf("%s ",$2);       \
+	               }                         \
+	            }')
+	# Linear scan from 1 upwards for the first number not in the list.
+	ctr=1
+	don=0
+	while [ $don -eq 0 ]; do
+		found=0
+		for i in $instances; do
+			if [ $i -eq $ctr ]; then
+				found=1;
+				break;
+			fi
+		done
+
+		if [ $found -eq 0 ]; then
+			don=1
+			break
+		fi
+		let ctr=ctr+1
+	done
+	echo "$ctr"
+}
+
+
+# Add a domain name and instance number to the DB file
+function vtpmdb_add_instance () {
+	local res vmname inst
+	vmname=$1
+	inst=$2
+
+	# Create the database with its header on first use.
+	if [ ! -f $VTPMDB ]; then
+		echo "#Database for VM to vTPM association" > $VTPMDB
+		echo "#1st column: domain name" >> $VTPMDB
+		echo "#2nd column: TPM instance number" >> $VTPMDB
+	fi
+	# Only append when no entry exists yet (res 0).  NOTE(review): a
+	# conflicting entry (res 2 -- same name or same instance) is silently
+	# ignored here; presumably intentional, but worth confirming.
+	res=$(vtpmdb_validate_entry $vmname $inst)
+	if [ $res -eq 0 ]; then
+		echo "$vmname $inst" >> $VTPMDB
+	fi
+}
+
+
+#Validate whether an entry is the same as passed to this
+#function
+# Prints: 0 = no matching entry, 1 = exact (name,instance) match,
+#         2 = partial match (name OR instance already taken by another row).
+function vtpmdb_validate_entry () {
+	local res rc vmname inst
+	rc=0
+	vmname=$1
+	inst=$2
+
+	res=$(cat $VTPMDB |            \
+	     gawk -vvmname=$vmname     \
+	          -vinst=$inst         \
+	     '{                        \
+	         if ( 1 == index($1,"#")) {\
+	         } else                \
+	         if ( $1 == vmname &&  \
+	              $2 == inst) {    \
+	            printf("1");       \
+	            exit;              \
+	         } else                \
+	         if ( $1 == vmname ||  \
+	              $2 == inst) {    \
+	            printf("2");       \
+	            exit;              \
+	         }                     \
+	     }')
+
+	if [ "$res" == "1" ]; then
+		rc=1
+	elif [ "$res" == "2" ]; then
+		rc=2
+	fi
+	echo "$rc"
+}
+
+
+#Remove an entry from the vTPM database given its domain name
+#and instance number
+function vtpmdb_remove_entry () {
+       local vmname instance VTPMDB_TMP
+       vmname=$1
+       instance=$2
+       VTPMDB_TMP="$VTPMDB".tmp
+
+       $(cat $VTPMDB |            \
+        gawk -vvmname=$vmname     \
+        '{                        \
+           if ( $1 != vmname ) {  \
+             print $0;            \
+           }                      \
+        '} > $VTPMDB_TMP)
+       if [ -e $VTPMDB_TMP ]; then
+               mv -f $VTPMDB_TMP $VTPMDB
+               vtpm_delete $instance
+       else
+               log err "Error creating temporary file '$VTPMDB_TMP'."
+       fi
+}
+
+
+# Find the reason for the creation of this device:
+# Returns 'resume' or 'create'
+function vtpm_get_create_reason () {
+       local resume
+       resume=$(xenstore_read $XENBUS_PATH/resume)
+       if [ "$resume" == "True" ]; then
+               echo "resume"
+       else
+               echo "create"
+       fi
+}
+
+
+#Create a vTPM instance
+# If no entry in the TPM database is found, the instance is
+# created and an entry added to the database.
+function vtpm_create_instance () {
+	local res instance domname reason uuid
+	uuid=$(xenstore_read "$XENBUS_PATH"/uuid)
+	reason=$(vtpm_get_create_reason)
+
+	claim_lock vtpmdb
+
+	instance="0"
+
+	# Look the domain up by UUID first, falling back to its name.
+	if [ "$uuid" != "" ]; then
+		instance=$(vtpmdb_find_instance $uuid)
+	fi
+	if [ "$instance" == "0" ]; then
+		domname=$(xenstore_read "$XENBUS_PATH"/domain)
+		instance=$(vtpmdb_find_instance $domname)
+	fi
+
+	# Resuming a domain that has no vTPM on record: nothing to do.
+	if [ "$instance" == "0" -a "$reason" != "create" ]; then
+		release_lock vtpmdb
+		return
+	fi
+
+	if [ "$instance" == "0" ]; then
+		#Try to give the preferred instance to the domain
+		instance=$(xenstore_read "$XENBUS_PATH"/pref_instance)
+		if [ "$instance" != "" ]; then
+			res=$(vtpmdb_is_free_instancenum $instance)
+			if [ $res -eq 0 ]; then
+				instance=$(vtpmdb_get_free_instancenum)
+			fi
+		else
+			instance=$(vtpmdb_get_free_instancenum)
+		fi
+
+		vtpm_create $instance
+
+		# Record the new association unless the backend flagged failure.
+		if [ $vtpm_fatal_error -eq 0 ]; then
+			if [ "$uuid" != "" ]; then
+				vtpmdb_add_instance $uuid $instance
+			else
+				vtpmdb_add_instance $domname $instance
+			fi
+		fi
+	else
+		# Known instance: resume or (re)start it as appropriate.
+		if [ "$reason" == "resume" ]; then
+			vtpm_resume $instance
+		else
+			vtpm_start $instance
+		fi
+	fi
+
+	release_lock vtpmdb
+
+	# Publish the chosen instance number for the backend driver.
+	xenstore_write $XENBUS_PATH/instance $instance
+}
+
+
+#Remove an instance when a VM is terminating or suspending.
+#Since it is assumed that the VM will appear again, the
+#entry is kept in the VTPMDB file.
+function vtpm_remove_instance () {
+	local instance reason domname uuid
+	#Stop script execution quietly if path does not exist (anymore)
+	# NOTE(review): this relies on the script running under set -e so
+	# that a failing xenstore-exists aborts here -- confirm the caller
+	# (xen-hotplug-common.sh) enables it.
+	xenstore-exists "$XENBUS_PATH"/domain
+	uuid=$(xenstore_read "$XENBUS_PATH"/uuid)
+
+	claim_lock vtpmdb
+
+	instance="0"
+
+	# Look the domain up by UUID first, falling back to its name.
+	if [ "$uuid" != "" ]; then
+		instance=$(vtpmdb_find_instance $uuid)
+	fi
+
+	if [ "$instance" == "0" ]; then
+		domname=$(xenstore_read "$XENBUS_PATH"/domain)
+		instance=$(vtpmdb_find_instance $domname)
+	fi
+
+	# Suspend (not delete) -- the DB entry survives for the VM's return.
+	if [ "$instance" != "0" ]; then
+		vtpm_suspend $instance
+	fi
+
+	release_lock vtpmdb
+}
+
+
+#Remove an entry in the VTPMDB file given the domain's name
+#1st parameter: The name of the domain
+function vtpm_delete_instance () {
+	local instance
+
+	claim_lock vtpmdb
+
+	# vtpmdb_find_instance prints "0" when no entry exists.
+	instance=$(vtpmdb_find_instance $1)
+	if [ "$instance" != "0" ]; then
+		vtpmdb_remove_entry $1 $instance
+	fi
+
+	release_lock vtpmdb
+}
+
+# Determine whether the given address is local to this machine
+# Return values:
+#  "-1" : the given machine name is invalid
+#  "0"  : this is not an address of this machine
+#  "1"  : this is an address local to this machine
+function vtpm_isLocalAddress() {
+	local addr res
+	# Resolve the name by scraping ping's first output line; assumes the
+	# "PING host (a.b.c.d)" format -- NOTE(review): locale/format
+	# dependent, confirm on target distributions.
+	addr=$(ping $1 -c 1 |  \
+	       gawk '{ print substr($3,2,length($3)-2); exit }')
+	if [ "$addr" == "" ]; then
+		echo "-1"
+		return
+	fi
+	# Compare against every local interface address from ifconfig's
+	# "inet addr:a.b.c.d" lines.
+	res=$(ifconfig | grep "inet addr" |  \
+	     gawk -vaddr=$addr               \
+	     '{                              \
+	        if ( addr == substr($2, 6)) {\
+	          print "1";                 \
+	        }                            \
+	     }'                              \
+	    )
+	if [ "$res" == "" ]; then
+		echo "0"
+		return
+	fi
+	echo "1"
+}
+
+# Perform a migration step. This function differentiates between migration
+# to the local host or to a remote machine.
+# Parameters:
+# 1st: destination host to migrate to
+# 2nd: name of the domain to migrate
+# 3rd: the migration step to perform
+function vtpm_migration_step() {
+	# "0" = destination is not local -> real migration; any other value
+	# (including "-1" for an unresolvable host -- NOTE(review): that case
+	# also lands in the local branch) -> local migration.
+	local res=$(vtpm_isLocalAddress $1)
+	if [ "$res" == "0" ]; then
+		vtpm_migrate $1 $2 $3
+	else
+		vtpm_migrate_local
+	fi
+}
+
+# Recover from migration due to an error. This function differentiates
+# between migration to the local host or to a remote machine.
+# Parameters:
+# 1st: destination host the migration was going to
+# 2nd: name of the domain that was to be migrated
+# 3rd: the last successful migration step that was done
+function vtpm_recover() {
+	local res
+	res=$(vtpm_isLocalAddress $1)
+	# Only remote migrations need recovery; local ones never started.
+	if [ "$res" == "0" ]; then
+		vtpm_migrate_recover $1 $2 $3
+	fi
+}
+
+
+#Determine the domain id given a domain's name.
+#1st parameter: name of the domain
+#return value: domain id  or -1 if domain id could not be determined
+function vtpm_domid_from_name () {
+	local candidate
+	# Walk every running domain and compare its xenstore name node.
+	for candidate in $(xenstore-list /local/domain); do
+		if [ "$(xenstore-read /local/domain/$candidate/name)" == "$1" ]; then
+			echo "$candidate"
+			return
+		fi
+	done
+	echo "-1"
+}
+
+#Determine the virtual TPM's UUID using the domain ID.
+#(The previous header said "instance number", but the function reads the
+# backend's uuid node.)
+#1st parm: domain ID
+function vtpm_uuid_by_domid() {
+	echo $(xenstore-read /local/domain/0/backend/vtpm/$1/0/uuid)
+}
+
+
+# Determine the vTPM's UUID by the name of the VM
+# Prints the UUID, or nothing if the domain cannot be resolved.
+function vtpm_uuid_from_vmname() {
+	local domid=$(vtpm_domid_from_name $1)
+	if [ "$domid" != "-1" ]; then
+		echo $(vtpm_uuid_by_domid $domid)
+		return
+	fi
+	echo ""
+}
+
+#Add a virtual TPM instance number and its associated domain name
+#to the VTPMDB file and activate usage of this virtual TPM instance
+#by writing the instance number into the xenstore
+#1st parm: name of virtual machine
+#2nd parm: instance of associated virtual TPM
+function vtpm_add_and_activate() {
+	local domid=$(vtpm_domid_from_name $1)
+	local vtpm_uuid=$(vtpm_uuid_from_vmname $1)
+	# Both lookups must succeed before touching the DB or xenstore.
+	if [ "$vtpm_uuid" != "" -a "$domid" != "-1" ]; then
+		vtpmdb_add_instance $vtpm_uuid $2
+		xenstore-write backend/vtpm/$domid/0/instance $2
+	fi
+}
diff --git a/tools/hotplug/Linux/vtpm-delete b/tools/hotplug/Linux/vtpm-delete
new file mode 100644 (file)
index 0000000..b75b95b
--- /dev/null
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# This script must be called the following way:
+# vtpm-delete <vtpm uuid>
+# or
+# vtpm-delete --vmname <vm name>
+
+dir=$(dirname "$0")
+. "$dir/vtpm-common.sh"
+
+if [ "$1" == "--vmname" ]; then
+	# Resolve the VM name to its vTPM UUID first; a silent no-op if the
+	# domain (and hence its UUID) cannot be found.
+	vtpm_uuid=$(vtpm_uuid_from_vmname $2)
+	if [ "$vtpm_uuid" != "" ];then
+		vtpm_delete_instance $vtpm_uuid
+	fi
+else
+	vtpm_delete_instance $1
+fi
diff --git a/tools/hotplug/Linux/vtpm-hotplug-common.sh b/tools/hotplug/Linux/vtpm-hotplug-common.sh
new file mode 100644 (file)
index 0000000..9fd35e7
--- /dev/null
@@ -0,0 +1,35 @@
+#
+# Copyright (c) 2005 IBM Corporation
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+
+# Parse the hotplug action into $command and reject anything we do not
+# implement.
+findCommand "$@"
+if [ "$command" != "online" ]  &&
+   [ "$command" != "offline" ] &&
+   [ "$command" != "add" ]     &&
+   [ "$command" != "remove" ]
+then
+	log err "Invalid command: $command"
+	exit 1
+fi
+
+
+# Abort (via the :? expansion) if the device path was not supplied.
+XENBUS_PATH="${XENBUS_PATH:?}"
+
+. "$dir/vtpm-common.sh"
diff --git a/tools/hotplug/Linux/vtpm-impl b/tools/hotplug/Linux/vtpm-impl
new file mode 100644 (file)
index 0000000..4f9a1fd
--- /dev/null
@@ -0,0 +1,208 @@
+#!/bin/bash
+# ===================================================================
+# 
+# Copyright (c) 2005, Intel Corp.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without 
+# modification, are permitted provided that the following conditions 
+# are met:
+#
+#   * Redistributions of source code must retain the above copyright 
+#     notice, this list of conditions and the following disclaimer.
+#   * Redistributions in binary form must reproduce the above 
+#     copyright notice, this list of conditions and the following 
+#     disclaimer in the documentation and/or other materials provided 
+#     with the distribution.
+#   * Neither the name of Intel Corporation nor the names of its 
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+# OF THE POSSIBILITY OF SUCH DAMAGE.
+# ===================================================================
+
+#            |        SRC        |    TAG  |      CMD SIZE     |        ORD       |mtype|strt
+# Pre-framed vtpm_manager requests as printf \xNN escape strings; the
+# 32-bit instance number is appended by hex32_to_bin before sending.
+TPM_CMD_OPEN=\\x00\\x00\\x00\\x00\\x01\\xc1\\x00\\x00\\x00\\x11\\x01\\x00\\x00\\x01\\x01\\x01
+TPM_CMD_RESM=\\x00\\x00\\x00\\x00\\x01\\xc1\\x00\\x00\\x00\\x11\\x01\\x00\\x00\\x01\\x01\\x02
+TPM_CMD_CLOS=\\x00\\x00\\x00\\x00\\x01\\xc1\\x00\\x00\\x00\\x0e\\x01\\x00\\x00\\x02
+TPM_CMD_DELE=\\x00\\x00\\x00\\x00\\x01\\xc1\\x00\\x00\\x00\\x0e\\x01\\x00\\x00\\x03
+
+# Guest type byte appended to OPEN/RESM: paravirtual vs HVM.
+TPM_TYPE_PVM=\\x01
+TPM_TYPE_HVM=\\x02
+
+# Hex-encoded 4-byte success status in the manager's reply.
+TPM_SUCCESS=00000000
+
+# Fifos used to talk to the vtpm_manager console.
+TX_VTPM_MANAGER=/var/vtpm/fifos/from_console.fifo
+RX_VTPM_MANAGER=/var/vtpm/fifos/to_console.fifo
+
+VTPM_MIG=/usr/bin/vtpm_migrator
+
+# -------------------- Helpers for binary streams -----------
+
+function str_to_hex32() {
+ printf "%0.8x" $1
+}
+
+function hex32_to_bin() {
+ local inst=$(str_to_hex32 $1);
+ local n1=`echo $inst | sed 's/\(..\)....../\\\\x\1/'`
+ local n2=`echo $inst | sed 's/..\(..\)..../\\\\x\1/'`
+ local n3=`echo $inst | sed 's/....\(..\)../\\\\x\1/'`
+ local n4=`echo $inst | sed 's/......\(..\)/\\\\x\1/'`
+
+ echo "$n1$n2$n3$n4"
+}
+
+function vtpm_manager_cmd() {
+ local cmd=$1;
+ local inst=$2;
+ local inst_bin=$(hex32_to_bin $inst);
+
+ claim_lock vtpm_mgr
+
+ #send cmd to vtpm_manager
+ printf "$cmd$inst_bin" > $TX_VTPM_MANAGER
+
+ #recv response
+ set +e
+ local resp_hex=`dd skip=10 bs=1 count=4 if=$RX_VTPM_MANAGER 2> /dev/null | xxd -ps`
+ set -e
+
+ release_lock vtpm_mgr
+
+ #return whether the command was successful
+ if [ $resp_hex -ne $TPM_SUCCESS ]; then
+   vtpm_fatal_error=1
+   false
+  else
+   true
+ fi
+}
+
+# Helper to get vm type to pass to vtpm_manager open/resume
+# Prints TPM_TYPE_HVM or TPM_TYPE_PVM; prints nothing if the domain's
+# /vm node cannot be resolved.
+function vtpm_get_type() {
+ local inst=$(xenstore_read $XENBUS_PATH/frontend-id)
+ local vm=$(xenstore_read /local/domain/$inst/vm)
+ if [ "$vm" != "" ]; then
+  local ostype=$(xenstore-read $vm/image/ostype)
+  if [ "$ostype" == "hvm" ]; then
+   echo $TPM_TYPE_HVM;
+  else
+   echo $TPM_TYPE_PVM;
+  fi
+ fi
+}
+
+# ------------------ Command handlers -----------------
+
+# Create new vtpm instance & set it up for use
+function vtpm_create () {
+ # Creation is handled implicitly by the manager on first setup
+ # so just set it up for use
+ # NOTE(review): the $(...) wrapper discards the helper's stdout and would
+ # execute any output as a command; it only works because the helper is
+ # silent.  A plain invocation would be safer -- same for the wrappers
+ # below.
+ $(vtpm_start $1)
+}
+
+# Setup vtpm instance for use.
+function vtpm_start() {
+ local vmtype=$(vtpm_get_type);
+ $(vtpm_manager_cmd $TPM_CMD_OPEN$vmtype $1)
+}
+
+function vtpm_resume() {
+ local vmtype=$(vtpm_get_type);
+ $(vtpm_manager_cmd $TPM_CMD_RESM$vmtype $1)
+}
+
+# Reset the vtpm AKA clear PCRs
+function vtpm_reset() {
+ #not used by current implementation
+ true
+}
+
+# Shutdown the vtpm while the vm is down
+# This could be a suspend or a shutdown;
+# we cannot distinguish, so save the state
+# and decide on startup if we should keep it
+function vtpm_suspend() {
+ $(vtpm_manager_cmd $TPM_CMD_CLOS $1)
+}
+
+
+# Delete the instance via the manager, then remove its saved state file.
+function vtpm_delete() {
+ local inst=$1
+ if $(vtpm_manager_cmd $TPM_CMD_DELE $inst); then
+   rm -f /var/vtpm/vtpm_dm_$1.data
+   true
+ else 
+   vtpm_fatal_error=1
+   false
+ fi
+}
+
+# Perform a migration step. This function differentiates between migration
+# to the local host or to a remote machine.
+# Parameters:
+# 1st: destination host to migrate to
+# 2nd: name of the domain to migrate
+# 3rd: the migration step to perform
+function vtpm_migrate() {
+ local instance res
+
+ instance=$(vtpmdb_find_instance $2)
+ if [ "$instance" == "" ]; then
+  log err "VTPM Migratoin failed. Unable to translation of domain name"
+  echo "Error: VTPM Migration failed while looking up instance number"
+ fi
+
+ case "$3" in
+  0)
+   #Incicate migration supported
+   echo "0" 
+  ;;
+
+  1)
+   # Get Public Key from Destination
+   # Call vtpm_manager's migration part 1
+   claim_lock vtpm_mgr
+   $VTPM_MIG $1 $2 $instance $3
+   release_lock vtpm_mgr
+  ;;
+
+  2)
+   # Call manager's migration step 2 and send result to destination
+   # If successful remove from db
+   claim_lock vtpm_mgr
+   $VTPM_MIG $1 $2 $instance $3
+   release_lock vtpm_mgr
+  ;;
+
+  3)
+   if `ps x | grep "$VTPM_MIG $1"`; then
+    log err "VTPM Migration failed to complete."
+    echo "Error: VTPM Migration failed to complete."
+   fi
+  ;;
+ esac
+}
+
+
+# Recovery after a failed remote migration is not implemented in this
+# backend; report that to the caller.
+function vtpm_migrate_recover() {
+ echo "Error: Recovery not supported yet" 
+}
+
+# Migration to the same host is likewise unsupported.
+function vtpm_migrate_local() {
+ echo "Error: local vTPM migration not supported"
+}
diff --git a/tools/hotplug/Linux/vtpm-migration.sh b/tools/hotplug/Linux/vtpm-migration.sh
new file mode 100644 (file)
index 0000000..7e38ae2
--- /dev/null
@@ -0,0 +1,19 @@
+#
+# Copyright (c) 2005 IBM Corporation
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+dir=$(dirname "$0")
+. "$dir/vtpm-common.sh"
diff --git a/tools/hotplug/Linux/xen-backend.agent b/tools/hotplug/Linux/xen-backend.agent
new file mode 100644 (file)
index 0000000..5cb536a
--- /dev/null
@@ -0,0 +1,39 @@
+#! /bin/bash
+
+PATH=/etc/xen/scripts:$PATH
+
+. /etc/xen/scripts/locking.sh
+
+# Serialise all backend hotplug events; the per-device scripts below read
+# and modify shared xenstore state.
+claim_lock xenbus_hotplug_global
+
+# Dispatch on the device class reported by the kernel hotplug event.
+case "$XENBUS_TYPE" in
+  tap)
+    /etc/xen/scripts/blktap "$ACTION"
+    ;;
+  vbd)
+    /etc/xen/scripts/block "$ACTION"
+    ;;
+  vtpm)
+    /etc/xen/scripts/vtpm "$ACTION"
+    ;;
+  vif)
+    # vifs name their handler script via the $script environment variable.
+    [ -n "$script" ] && $script "$ACTION"
+    ;;
+  vscsi)
+    /etc/xen/scripts/vscsi "$ACTION"
+    ;;
+esac
+
+case "$ACTION" in
+  add)
+    ;;
+  remove)
+    # Tear down leftover xenstore entries for the departed device.
+    /etc/xen/scripts/xen-hotplug-cleanup
+    ;;
+  online)
+    ;;
+  offline)
+    ;;
+esac
+
+release_lock xenbus_hotplug_global
diff --git a/tools/hotplug/Linux/xen-backend.rules b/tools/hotplug/Linux/xen-backend.rules
new file mode 100644 (file)
index 0000000..af0e231
--- /dev/null
@@ -0,0 +1,8 @@
+SUBSYSTEM=="xen-backend", KERNEL=="vbd*", RUN+="/etc/xen/scripts/block $env{ACTION}"
+SUBSYSTEM=="xen-backend", KERNEL=="vtpm*", RUN+="/etc/xen/scripts/vtpm $env{ACTION}"
+SUBSYSTEM=="xen-backend", KERNEL=="vif*", ACTION=="online", RUN+="$env{script} online"
+SUBSYSTEM=="xen-backend", KERNEL=="vif*", ACTION=="offline", RUN+="$env{script} offline"
+SUBSYSTEM=="xen-backend", KERNEL=="vscsi*", RUN+="/etc/xen/scripts/vscsi $env{ACTION}"
+SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/etc/xen/scripts/xen-hotplug-cleanup"
+KERNEL=="evtchn", NAME="xen/%k"
+KERNEL=="blktap[0-9]*", NAME="xen/%k"
diff --git a/tools/hotplug/Linux/xen-hotplug-cleanup b/tools/hotplug/Linux/xen-hotplug-cleanup
new file mode 100644 (file)
index 0000000..706359d
--- /dev/null
@@ -0,0 +1,36 @@
+#! /bin/bash
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+
+# Claim the lock protecting /etc/xen/scripts/block.  This stops a race whereby
+# paths in the store would disappear underneath that script as it attempted to
+# read from the store checking for device sharing.
+# Any other scripts that do similar things will have to have their lock
+# claimed too.
+# This is pretty horrible, but there's not really a nicer way of solving this.
+claim_lock "block"
+
+# split backend/DEVCLASS/VMID/DEVID on slashes
+path_array=( ${XENBUS_PATH//\// } )
+# get /vm/UUID path
+vm=$(xenstore_read_default "/local/domain/${path_array[2]}/vm" "")
+# construct /vm/UUID/device/DEVCLASS/DEVID
+if [ "$vm" != "" ]; then
+  vm_dev="$vm/device/${path_array[1]}/${path_array[3]}"
+else
+  vm_dev=
+fi
+
+# remove device frontend store entries
+xenstore-rm -t \
+  $(xenstore-read "$XENBUS_PATH/frontend" 2>/dev/null) 2>/dev/null || true
+
+# remove device backend store entries
+xenstore-rm -t "$XENBUS_PATH"        2>/dev/null || true
+xenstore-rm -t "error/$XENBUS_PATH"  2>/dev/null || true
+
+# remove device path from /vm/UUID
+[ "$vm_dev" != "" ] && xenstore-rm -t "$vm_dev" 2>/dev/null || true
+
+release_lock "block"
diff --git a/tools/hotplug/Linux/xen-hotplug-common.sh b/tools/hotplug/Linux/xen-hotplug-common.sh
new file mode 100644 (file)
index 0000000..980a627
--- /dev/null
@@ -0,0 +1,93 @@
+#
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+
+dir=$(dirname "$0")
+. "$dir/logging.sh"
+. "$dir/xen-script-common.sh"
+. "$dir/locking.sh"
+
+exec 2>>/var/log/xen/xen-hotplug.log
+
+export PATH="/sbin:/bin:/usr/bin:/usr/sbin:$PATH"
+export LANG="POSIX"
+unset $(set | grep ^LC_ | cut -d= -f1)
+
+fatal() {
+  xenstore_write "$XENBUS_PATH/hotplug-error" "$*" \
+                 "$XENBUS_PATH/hotplug-status" error
+  log err "$@"
+  exit 1
+}
+
+success() {
+  # Tell DevController that backend is "connected"
+  xenstore_write "$XENBUS_PATH/hotplug-status" connected
+}
+
+do_or_die() {
+  "$@" || fatal "$@ failed"
+}
+
+do_without_error() {
+  "$@" 2>/dev/null || log debug "$@ failed"
+}
+
+sigerr() {
+  fatal "$0 failed; error detected."
+}
+
+trap sigerr ERR
+
+
+##
+# xenstore_read <path>+
+#
+# Read each of the given paths, returning each result on a separate line, or
+# exit this script if any of the paths is missing.
+#
+xenstore_read() {
+  local v=$(xenstore-read "$@" || true)
+  [ "$v" != "" ] || fatal "xenstore-read $@ failed."
+  echo "$v"
+}
+
+
+##
+# xenstore_read_default <path> <default>
+#
+# Read the given path, returning the value there or the given default if the
+# path is not present.
+#
+xenstore_read_default() {
+  xenstore-read "$1" 2>/dev/null || echo "$2"
+}
+
+
+##
+# xenstore_write (<path> <value>)+
+#
+# Write each of the key/value pairs to the store, and exit this script if any
+# such writing fails.
+#
+xenstore_write() {
+  log debug "Writing $@ to xenstore."
+  xenstore-write "$@" || fatal "Writing $@ to xenstore failed."
+}
+
+
+log debug "$@" "XENBUS_PATH=$XENBUS_PATH"
diff --git a/tools/hotplug/Linux/xen-network-common.sh b/tools/hotplug/Linux/xen-network-common.sh
new file mode 100644 (file)
index 0000000..7014333
--- /dev/null
@@ -0,0 +1,118 @@
+#
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+
+# Gentoo doesn't have ifup/ifdown, so we define appropriate alternatives.
+
+# Other platforms just use ifup / ifdown directly.
+
+##
+# preiftransfer
+#
+# @param $1 The current name for the physical device, which is also the name
+#           that the virtual device will take once the physical device has
+#           been renamed.
+
+if ! which ifup >/dev/null 2>/dev/null
+then
+  preiftransfer()
+  {
+    true
+  }
+  ifup()
+  {
+    false
+  }
+  ifdown()
+  {
+    false
+  }
+else
+  preiftransfer()
+  {
+    true
+  }
+fi
+
+
+first_file()
+{
+  t="$1"
+  shift
+  for file in $@
+  do
+    if [ "$t" "$file" ]
+    then
+      echo "$file"
+      return
+    fi
+  done
+}
+
+find_dhcpd_conf_file()
+{
+  first_file -f /etc/dhcp3/dhcpd.conf /etc/dhcpd.conf
+}
+
+
+find_dhcpd_init_file()
+{
+  first_file -x /etc/init.d/{dhcp3-server,dhcp,dhcpd}
+}
+
+find_dhcpd_arg_file()
+{
+  first_file -f /etc/sysconfig/dhcpd /etc/defaults/dhcp /etc/default/dhcp3-server
+}
+
+# configure interfaces which act as pure bridge ports:
+setup_bridge_port() {
+    local dev="$1"
+
+    # take interface down ...
+    ip link set ${dev} down
+
+    # ... and configure it
+    ip addr flush ${dev}
+}
+
+# Usage: create_bridge bridge
+create_bridge () {
+    local bridge=$1
+
+    # Don't create the bridge if it already exists.
+    if [ ! -e "/sys/class/net/${bridge}/bridge" ]; then
+       brctl addbr ${bridge}
+       brctl stp ${bridge} off
+       brctl setfd ${bridge} 0
+    fi
+}
+
+# Usage: add_to_bridge bridge dev
+add_to_bridge () {
+    local bridge=$1
+    local dev=$2
+
+    # Don't add $dev to $bridge if it's already on a bridge.
+    if [ -e "/sys/class/net/${bridge}/brif/${dev}" ]; then
+       ip link set ${dev} up || true
+       return
+    fi
+    brctl addif ${bridge} ${dev}
+    ip link set ${dev} up
+}
+
diff --git a/tools/hotplug/Linux/xen-script-common.sh b/tools/hotplug/Linux/xen-script-common.sh
new file mode 100644 (file)
index 0000000..f6841ac
--- /dev/null
@@ -0,0 +1,44 @@
+#
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+
+set -e
+
+
+evalVariables()
+{
+  for arg in "$@"
+  do
+    if expr 'index' "$arg" '=' '>' '1' >/dev/null
+    then
+      eval "$arg"
+    fi
+  done
+}
+
+
+findCommand()
+{
+  for arg in "$@"
+  do
+    if ! expr 'index' "$arg" '=' >/dev/null
+    then
+      command="$arg"
+      return
+    fi
+  done
+}
diff --git a/tools/hotplug/Linux/xend.rules b/tools/hotplug/Linux/xend.rules
new file mode 100644 (file)
index 0000000..d996555
--- /dev/null
@@ -0,0 +1,3 @@
+SUBSYSTEM=="pci", RUN+="socket:/org/xen/xend/udev_event"
+#SUBSYSTEM=="scsi", RUN+="socket:/org/xen/xend/udev_event"
+#SUBSYSTEM=="net", KERNEL!="vif[0-9]*.[0-9]*|tap[0-9]*.[0-9]*", RUN+="socket:/org/xen/xend/udev_event"
diff --git a/tools/hotplug/Makefile b/tools/hotplug/Makefile
new file mode 100644 (file)
index 0000000..979e916
--- /dev/null
@@ -0,0 +1,9 @@
+XEN_ROOT = ../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+SUBDIRS-y := common
+SUBDIRS-$(CONFIG_NetBSD) += NetBSD
+SUBDIRS-$(CONFIG_Linux) += Linux
+
+.PHONY: all clean install
+all clean install: %: subdirs-%
diff --git a/tools/hotplug/NetBSD/Makefile b/tools/hotplug/NetBSD/Makefile
new file mode 100644 (file)
index 0000000..8577a6f
--- /dev/null
@@ -0,0 +1,39 @@
+XEN_ROOT = ../../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+# Xen configuration dir and configs to go there.
+XEN_CONFIG_DIR = /etc/xen
+
+# Xen script dir and scripts to go there.
+XEN_SCRIPT_DIR = $(XEN_CONFIG_DIR)/scripts
+XEN_SCRIPTS =
+XEN_SCRIPTS += block-nbsd
+XEN_SCRIPTS += qemu-ifup-nbsd
+XEN_SCRIPTS += vif-bridge-nbsd
+XEN_SCRIPTS += vif-ip-nbsd
+
+XEN_SCRIPT_DATA =
+
+.PHONY: all
+all:
+
+.PHONY: build
+build:
+
+.PHONY: install
+install: all install-scripts
+
+.PHONY: install-scripts
+install-scripts:
+       $(INSTALL_DIR) $(DESTDIR)$(XEN_SCRIPT_DIR)
+       set -e; for i in $(XEN_SCRIPTS); \
+          do \
+          $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
+       done
+       set -e; for i in $(XEN_SCRIPT_DATA); \
+          do \
+          $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
+       done
+
+.PHONY: clean
+clean:
diff --git a/tools/hotplug/NetBSD/block-nbsd b/tools/hotplug/NetBSD/block-nbsd
new file mode 100644 (file)
index 0000000..915ddb7
--- /dev/null
@@ -0,0 +1,88 @@
+#!/bin/sh -e
+
+# $NetBSD: block-nbsd,v 1.1.1.1 2008/08/07 20:26:57 cegger Exp $
+# Called by xenbackendd
+# Usage: block xsdir_backend_path state
+
+PATH=/bin:/usr/bin:/sbin:/usr/sbin
+export PATH
+
+error() {
+       echo "$@" >&2
+       xenstore_write $xpath/hotplug-status error
+       exit 1
+}
+       
+
+xpath=$1
+xstatus=$2
+xtype=$(xenstore-read "$xpath/type")
+xparams=$(xenstore-read "$xpath/params")
+
+case $xstatus in
+6)
+       # device removed
+       case $xtype in
+       file)
+               vnd=$(xenstore-read "$xpath/vnd" || echo none)
+               if [ $vnd != none ]; then
+                       vnconfig -u $vnd
+               fi
+               ;;
+       phy)
+               ;;
+       *)
+               echo "unknown type $xtype" >&2
+               ;;
+       esac
+       xenstore-rm $xpath
+       exit 0
+       ;;
+2)
+       case $xtype in
+       file)
+               # Store the list of available vnd(4) devices in
+               #``available_disks'', and mark them as ``free''.
+               list=`ls -1 /dev/vnd[0-9]*d | sed "s,/dev/vnd,,;s,d,," | sort -n`
+               for i in $list; do
+                       disk="vnd$i"
+                       available_disks="$available_disks $disk"
+                       eval $disk=free
+               done
+               # Mark the used vnd(4) devices as ``used''.
+               for disk in `sysctl hw.disknames`; do
+                       case $disk in
+                       vnd[0-9]*) eval $disk=used ;;
+                       esac
+               done
+               # Configure the first free vnd(4) device.
+               for disk in $available_disks; do
+                       eval status=\$$disk
+                       if [ "$status" = "free" ] && \
+                           vnconfig /dev/${disk}d $xparams >/dev/null; then
+                               device=/dev/${disk}d
+                               echo vnconfig /dev/${disk}d $xparams
+                               break   
+                       fi
+               done
+               if [ x$device = x ] ; then
+                       error "no available vnd device"
+               fi
+               echo xenstore-write $xpath/vnd $device
+               xenstore-write $xpath/vnd $device
+               ;;
+       phy)
+               device=$xparams
+               ;;
+       esac
+       physical_device=$(stat -f '%r' "$device")
+       echo xenstore-write $xpath/physical-device $physical_device
+       xenstore-write $xpath/physical-device $physical_device
+       echo xenstore-write $xpath/hotplug-status connected
+       xenstore-write $xpath/hotplug-status connected
+       exit 0
+       ;;
+*)
+       exit 0
+       ;;
+esac
diff --git a/tools/hotplug/NetBSD/qemu-ifup-nbsd b/tools/hotplug/NetBSD/qemu-ifup-nbsd
new file mode 100644 (file)
index 0000000..eee7876
--- /dev/null
@@ -0,0 +1,3 @@
+#!/bin/sh
+ifconfig $1 up
+exec /sbin/brconfig $2 add $1
diff --git a/tools/hotplug/NetBSD/vif-bridge-nbsd b/tools/hotplug/NetBSD/vif-bridge-nbsd
new file mode 100644 (file)
index 0000000..bedb387
--- /dev/null
@@ -0,0 +1,35 @@
+#!/bin/sh -e
+
+# $NetBSD: vif-bridge-nbsd,v 1.1.1.1 2008/08/07 20:26:57 cegger Exp $
+# Called by xenbackendd
+# Usage: vif-bridge xsdir_backend_path state
+
+PATH=/bin:/usr/bin:/sbin:/usr/sbin
+export PATH
+
+xpath=$1
+xstatus=$2
+
+case $xstatus in
+6)
+       # device removed
+       xenstore-rm $xpath
+       exit 0
+       ;;
+2)
+       xbridge=$(xenstore-read "$xpath/bridge")
+       xfid=$(xenstore-read "$xpath/frontend-id")
+       xhandle=$(xenstore-read "$xpath/handle")
+       iface=xvif$xfid.$xhandle
+       echo ifconfig $iface up
+       ifconfig $iface up
+       brconfig $xbridge add $iface
+       echo brconfig $xbridge add $iface
+       xenstore-write $xpath/hotplug-status connected
+       echo xenstore-write $xpath/hotplug-status connected
+       exit 0
+       ;;
+*)
+       exit 0
+       ;;
+esac
diff --git a/tools/hotplug/NetBSD/vif-ip-nbsd b/tools/hotplug/NetBSD/vif-ip-nbsd
new file mode 100644 (file)
index 0000000..d8b5bb9
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/sh -e
+
+# $NetBSD: vif-ip-nbsd,v 1.1.1.1 2008/08/07 20:26:57 cegger Exp $
+# Called by xenbackendd
+# Usage: vif-ip xsdir_backend_path state
+
+PATH=/bin:/usr/bin:/sbin:/usr/sbin
+export PATH
+
+xpath=$1
+xstatus=$2
+
+case $xstatus in
+6)
+       # device removed
+       xenstore-rm $xpath
+       exit 0
+       ;;
+2)
+       xip=$(xenstore-read "$xpath/ip")
+       xfid=$(xenstore-read "$xpath/frontend-id")
+       xhandle=$(xenstore-read "$xpath/handle")
+       iface=xvif$xfid.$xhandle
+       echo ifconfig $iface $xip up
+       ifconfig $iface $xip up
+       xenstore-write $xpath/hotplug-status connected
+       echo xenstore-write $xpath/hotplug-status connected
+       exit 0
+       ;;
+*)
+       exit 0
+       ;;
+esac
diff --git a/tools/hotplug/common/Makefile b/tools/hotplug/common/Makefile
new file mode 100644 (file)
index 0000000..b69b999
--- /dev/null
@@ -0,0 +1,37 @@
+XEN_ROOT = ../../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+# OS-independent hotplug scripts go in this directory
+
+# Xen configuration dir and configs to go there.
+XEN_CONFIG_DIR = /etc/xen
+
+# Xen script dir and scripts to go there.
+XEN_SCRIPT_DIR = /etc/xen/scripts
+XEN_SCRIPTS =
+XEN_SCRIPT_DATA =
+
+.PHONY: all
+all:
+
+.PHONY: build
+build:
+
+.PHONY: install
+install: all install-scripts
+
+.PHONY: install-scripts
+install-scripts:
+       [ -d $(DESTDIR)$(XEN_SCRIPT_DIR) ] || \
+               $(INSTALL_DIR) $(DESTDIR)$(XEN_SCRIPT_DIR)
+       set -e; for i in $(XEN_SCRIPTS); \
+          do \
+          $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
+       done
+       set -e; for i in $(XEN_SCRIPT_DATA); \
+          do \
+          $(INSTALL_DATA) $$i $(DESTDIR)$(XEN_SCRIPT_DIR); \
+       done
+
+.PHONY: clean
+clean:
index db0b7605596e0d544f82c4edcb95c5f6f77c3f1d..62c27170ef840d1ca18b5ea135c793c426f26950 100644 (file)
@@ -10,11 +10,12 @@ xen-foreign:
 
 xen/.dir:
        @rm -rf xen
-       mkdir xen
+       mkdir -p xen/libelf
        ln -sf ../$(XEN_ROOT)/xen/include/public/COPYING xen
        ln -sf $(addprefix ../,$(wildcard $(XEN_ROOT)/xen/include/public/*.h)) xen
        ln -sf $(addprefix ../$(XEN_ROOT)/xen/include/public/,arch-ia64 arch-x86 hvm io xsm) xen
        ln -sf ../xen-sys/$(XEN_OS) xen/sys
+       ln -sf $(addprefix ../../$(XEN_ROOT)/xen/include/xen/,libelf.h elfstructs.h) xen/libelf/
        ln -s ../xen-foreign xen/foreign
        touch $@
 
index ead6cd1559583dbe4a54fa913f4f448a5f049e36..5f2908502b5cf09556bb0644a6033030c002034a 100644 (file)
@@ -1,7 +1,7 @@
 
 structs                   |  x86_32  x86_64    ia64
 
-start_info                |    1104    1152    1152
+start_info                |    1112    1168    1168
 trap_info                 |       8      16       -
 pt_fpreg                  |       -       -      16
 cpu_user_regs             |      68     200       -
index da981b717895a25103bb4f1fa23f664db282d6d4..dd67529ae70f6e2a3d79d454ef68f7468bb494be 100644 (file)
@@ -6,7 +6,7 @@ includedir=$(prefix)/include
 libdir=$(prefix)/lib
 
 ARCH := $(shell uname -m | sed -e s/i.86/i386/)
-CFLAGS := -nostdlib -nostartfiles -Wall -I. -g -fomit-frame-pointer -O2 -fPIC
+CFLAGS = -nostdlib -nostartfiles -Wall -I. -g -fomit-frame-pointer -O2 -fPIC
 SO_CFLAGS=-shared $(CFLAGS)
 L_CFLAGS=$(CFLAGS)
 LINK_FLAGS=
index afc08bdaab727f4c4011b9e930b852de213360b9..accbafef15a65915526f2a171f1e9411d070e26a 100644 (file)
@@ -1,8 +1,6 @@
 include $(XEN_ROOT)/tools/Rules.mk
 
-DEPS = .*.d
-
-CFLAGS += -I$(XEN_ROOT)/tools/libfsimage/common/ -Werror -Wp,-MD,.$(@F).d
+CFLAGS += -I$(XEN_ROOT)/tools/libfsimage/common/ -Werror
 LDFLAGS += -L../common/
 
 PIC_OBJS := $(patsubst %.c,%.opic,$(LIB_SRCS-y))
index 641bca53b3a7945784e5111275ca78da1b73fe50..48851acfa6719cacb7e12312ffd15720e9b56e41 100644 (file)
@@ -4,9 +4,6 @@ include $(XEN_ROOT)/tools/Rules.mk
 MAJOR = 1.0
 MINOR = 0
 
-CFLAGS += -Werror -Wp,-MD,.$(@F).d
-DEPS = .*.d
-
 LDFLAGS-$(CONFIG_SunOS) = -Wl,-M -Wl,mapfile-SunOS
 LDFLAGS-$(CONFIG_Linux) = -Wl,mapfile-GNU
 LDFLAGS = $(LDFLAGS-y)
index 1409e3cde6eb770a3884889a46c7c342e10bd60b..0e4e6aeeaff92a098e4c5c200268f89b18957943 100644 (file)
@@ -298,8 +298,7 @@ uberblock_verify(uberblock_phys_t *ub, int offset)
                return (-1);
 
        if (uber->ub_magic == UBERBLOCK_MAGIC &&
-           uber->ub_version >= SPA_VERSION_1 &&
-           uber->ub_version <= SPA_VERSION)
+           uber->ub_version > 0 && uber->ub_version <= SPA_VERSION)
                return (0);
 
        return (-1);
index 01873548539c0593c828e84f7a8db641c69c79d5..3fd679e458b1ef9ccbcdfef681f85c51e3b85e03 100644 (file)
 /*
  * On-disk version number.
  */
-#define        SPA_VERSION_1                   1ULL
-#define        SPA_VERSION_2                   2ULL
-#define        SPA_VERSION_3                   3ULL
-#define        SPA_VERSION_4                   4ULL
-#define        SPA_VERSION_5                   5ULL
-#define        SPA_VERSION_6                   6ULL
-#define        SPA_VERSION_7                   7ULL
-#define        SPA_VERSION_8                   8ULL
-#define        SPA_VERSION_9                   9ULL
-#define        SPA_VERSION_10                  10ULL
-#define        SPA_VERSION                     SPA_VERSION_10
+#define        SPA_VERSION                     14ULL
 
 /*
  * The following are configuration names used in the nvlist describing a pool's
index 4ec156c1d828ec6ba1bb292a788502322ac53361..acd7067e572052c6b57280c6406fbd774c204f35 100644 (file)
@@ -1,7 +1,7 @@
 XEN_ROOT = ../..
 include $(XEN_ROOT)/tools/Rules.mk
 
-MAJOR    = 3.2
+MAJOR    = 3.4
 MINOR    = 0
 
 CTRL_SRCS-y       :=
@@ -29,7 +29,7 @@ CTRL_SRCS-$(CONFIG_NetBSD) += xc_netbsd.c
 CTRL_SRCS-$(CONFIG_MiniOS) += xc_minios.c
 
 GUEST_SRCS-y :=
-GUEST_SRCS-y += xg_private.c
+GUEST_SRCS-y += xg_private.c xc_suspend.c
 GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_domain_save.c
 GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
 
@@ -62,10 +62,7 @@ CFLAGS-$(CONFIG_Linux) += -D_GNU_SOURCE
 # libraries.
 #CFLAGS   += -DVALGRIND -O0 -ggdb3
 
-# Get gcc to generate the dependencies for us.
-CFLAGS   += -Wp,-MD,.$(@F).d
 LDFLAGS  += -L.
-DEPS     = .*.d
 
 CTRL_LIB_OBJS := $(patsubst %.c,%.o,$(CTRL_SRCS-y))
 CTRL_PIC_OBJS := $(patsubst %.c,%.opic,$(CTRL_SRCS-y))
index 89f1143203c9963f48479a2e08bd5de10088d356..291679e7ea25e18f0d6a43531af4a4c57125163a 100644 (file)
@@ -1099,6 +1099,22 @@ error_out:
     return -1;
 }
 
+/* xc_hvm_build_target_mem: 
+ * Create a domain for a pre-ballooned virtualized Linux, using
+ * files/filenames.  If target < memsize, domain is created with
+ * memsize pages marked populate-on-demand, and with a PoD cache size
+ * of target.  If target == memsize, pages are populated normally.
+ */
+int xc_hvm_build_target_mem(int xc_handle,
+                            uint32_t domid,
+                            int memsize,
+                            int target,
+                            const char *image_name)
+{
+    /* XXX:PoD isn't supported yet */
+    return xc_hvm_build(xc_handle, domid, target, image_name);
+}
+
 /*
  * From asm/pgtable.h
  */
index 290c8e726f6fa1dfb4b2fadfe1c01aef6db98e82..69f4e892503e073d0641ee872ca87df094737e2e 100644 (file)
@@ -128,7 +128,8 @@ xc_ia64_recv_vcpu_context(int xc_handle, int io_fd, uint32_t dom,
     fprintf(stderr, "ip=%016lx, b0=%016lx\n", ctxt->regs.ip, ctxt->regs.b[0]);
 
     /* Initialize and set registers.  */
-    ctxt->flags = VGCF_EXTRA_REGS | VGCF_SET_CR_IRR | VGCF_online;
+    ctxt->flags = VGCF_EXTRA_REGS | VGCF_SET_CR_IRR | VGCF_online |
+        VGCF_SET_AR_ITC;
     if (xc_vcpu_setcontext(xc_handle, dom, vcpu, ctxt_any) != 0) {
         ERROR("Couldn't set vcpu context");
         return -1;
index 9ee39810409be150753e2b3d1035886137a972e8..ef1e8dd7a1bde8a4c84b3c5558ccddbc04da4d7b 100644 (file)
@@ -58,9 +58,6 @@
 /* number of pages to write at a time */
 #define DUMP_INCREMENT (4 * 1024)
 
-/* Don't yet support cross-address-size core dump */
-#define guest_width (sizeof (unsigned long))
-
 /* string table */
 struct xc_core_strtab {
     char       *strings;
@@ -240,7 +237,7 @@ xc_core_ehdr_init(Elf64_Ehdr *ehdr)
     ehdr->e_ident[EI_ABIVERSION] = EV_CURRENT;
 
     ehdr->e_type = ET_CORE;
-    ehdr->e_machine = ELF_ARCH_MACHINE;
+    /* e_machine will be filled in later */
     ehdr->e_version = EV_CURRENT;
     ehdr->e_entry = 0;
     ehdr->e_phoff = 0;
@@ -359,7 +356,8 @@ elfnote_dump_core_header(
 }
 
 static int
-elfnote_dump_xen_version(void *args, dumpcore_rtn_t dump_rtn, int xc_handle)
+elfnote_dump_xen_version(void *args, dumpcore_rtn_t dump_rtn, int xc_handle,
+                         unsigned int guest_width)
 {
     int sts;
     struct elfnote elfnote;
@@ -371,6 +369,12 @@ elfnote_dump_xen_version(void *args, dumpcore_rtn_t dump_rtn, int xc_handle)
     elfnote.descsz = sizeof(xen_version);
     elfnote.type = XEN_ELFNOTE_DUMPCORE_XEN_VERSION;
     elfnote_fill_xen_version(xc_handle, &xen_version);
+    if (guest_width < sizeof(unsigned long))
+    {
+        // 32 bit elf file format differs in pagesize's alignment
+        char *p = (char *)&xen_version.pagesize;
+        memmove(p - 4, p, sizeof(xen_version.pagesize));
+    }
     sts = dump_rtn(args, (char*)&elfnote, sizeof(elfnote));
     if ( sts != 0 )
         return sts;
@@ -396,6 +400,24 @@ elfnote_dump_format_version(void *args, dumpcore_rtn_t dump_rtn)
     return dump_rtn(args, (char*)&format_version, sizeof(format_version));
 }
 
+static int
+get_guest_width(int xc_handle,
+                uint32_t domid,
+                unsigned int *guest_width)
+{
+    DECLARE_DOMCTL;
+
+    memset(&domctl, 0, sizeof(domctl));
+    domctl.domain = domid;
+    domctl.cmd = XEN_DOMCTL_get_address_size;
+
+    if ( do_domctl(xc_handle, &domctl) != 0 )
+        return 1;
+        
+    *guest_width = domctl.u.address_size.size / 8;
+    return 0;
+}
+
 int
 xc_domain_dumpcore_via_callback(int xc_handle,
                                 uint32_t domid,
@@ -403,7 +425,8 @@ xc_domain_dumpcore_via_callback(int xc_handle,
                                 dumpcore_rtn_t dump_rtn)
 {
     xc_dominfo_t info;
-    shared_info_t *live_shinfo = NULL;
+    shared_info_any_t *live_shinfo = NULL;
+    unsigned int guest_width; 
 
     int nr_vcpus = 0;
     char *dump_mem, *dump_mem_start = NULL;
@@ -437,6 +460,12 @@ xc_domain_dumpcore_via_callback(int xc_handle,
     uint16_t strtab_idx;
     struct xc_core_section_headers *sheaders = NULL;
     Elf64_Shdr *shdr;
+    if ( get_guest_width(xc_handle, domid, &guest_width) != 0 )
+    {
+        PERROR("Could not get address size for domain");
+        return sts;
+    }
 
     xc_core_arch_context_init(&arch_ctxt);
     if ( (dump_mem_start = malloc(DUMP_INCREMENT*PAGE_SIZE)) == NULL )
@@ -489,7 +518,17 @@ xc_domain_dumpcore_via_callback(int xc_handle,
     if ( sts != 0 )
         goto out;
 
+    /*
+     * Note: this is the *current* number of pages and may change under
+     * a live dump-core.  We'll just take this value, and if more pages
+     * exist, we'll skip them.  If there's less, then we'll just not use
+     * all the array...
+     *
+     * We don't want to use the total potential size of the memory map
+     * since that is usually much higher than info.nr_pages.
+     */
     nr_pages = info.nr_pages;
+
     if ( !auto_translated_physmap )
     {
         /* obtain p2m table */
@@ -500,7 +539,7 @@ xc_domain_dumpcore_via_callback(int xc_handle,
             goto out;
         }
 
-        sts = xc_core_arch_map_p2m(xc_handle, &info, live_shinfo,
+        sts = xc_core_arch_map_p2m(xc_handle, guest_width, &info, live_shinfo,
                                    &p2m, &p2m_size);
         if ( sts != 0 )
             goto out;
@@ -676,6 +715,7 @@ xc_domain_dumpcore_via_callback(int xc_handle,
     /* write out elf header */
     ehdr.e_shnum = sheaders->num;
     ehdr.e_shstrndx = strtab_idx;
+    ehdr.e_machine = ELF_ARCH_MACHINE;
     sts = dump_rtn(args, (char*)&ehdr, sizeof(ehdr));
     if ( sts != 0 )
         goto out;
@@ -697,7 +737,7 @@ xc_domain_dumpcore_via_callback(int xc_handle,
         goto out;
 
     /* elf note section: xen version */
-    sts = elfnote_dump_xen_version(args, dump_rtn, xc_handle);
+    sts = elfnote_dump_xen_version(args, dump_rtn, xc_handle, guest_width);
     if ( sts != 0 )
         goto out;
 
@@ -757,9 +797,21 @@ xc_domain_dumpcore_via_callback(int xc_handle,
 
             if ( !auto_translated_physmap )
             {
-                gmfn = p2m[i];
-                if ( gmfn == INVALID_P2M_ENTRY )
-                    continue;
+                if ( guest_width >= sizeof(unsigned long) )
+                {
+                    if ( guest_width == sizeof(unsigned long) )
+                        gmfn = p2m[i];
+                    else
+                        gmfn = ((uint64_t *)p2m)[i];
+                    if ( gmfn == INVALID_P2M_ENTRY )
+                        continue;
+                }
+                else
+                {
+                    gmfn = ((uint32_t *)p2m)[i];
+                    if ( gmfn == (uint32_t)INVALID_P2M_ENTRY )
+                       continue;
+                }
 
                 p2m_array[j].pfn = i;
                 p2m_array[j].gmfn = gmfn;
@@ -802,7 +854,7 @@ copy_done:
         /* When live dump-mode (-L option) is specified,
          * guest domain may reduce memory. pad with zero pages.
          */
-        IPRINTF("j (%ld) != nr_pages (%ld)", j , nr_pages);
+        IPRINTF("j (%ld) != nr_pages (%ld)", j, nr_pages);
         memset(dump_mem_start, 0, PAGE_SIZE);
         for (; j < nr_pages; j++) {
             sts = dump_rtn(args, dump_mem_start, PAGE_SIZE);
@@ -891,7 +943,7 @@ xc_domain_dumpcore(int xc_handle,
     struct dump_args da;
     int sts;
 
-    if ( (da.fd = open(corename, O_CREAT|O_RDWR, S_IWUSR|S_IRUSR)) < 0 )
+    if ( (da.fd = open(corename, O_CREAT|O_RDWR|O_TRUNC, S_IWUSR|S_IRUSR)) < 0 )
     {
         PERROR("Could not open corefile %s", corename);
         return -errno;
index c5663e94261870996f0ac7be37b6beb2149eb961..d148732f13dd69d5abb8f1d14ebb24f69ea4c495 100644 (file)
@@ -23,7 +23,7 @@
 
 #include "xen/version.h"
 #include "xg_private.h"
-#include "xen/elfstructs.h"
+#include "xen/libelf/elfstructs.h"
 
 /* section names */
 #define XEN_DUMPCORE_SEC_NOTE                   ".note.Xen"
@@ -136,12 +136,12 @@ int xc_core_arch_auto_translated_physmap(const xc_dominfo_t *info);
 struct xc_core_arch_context;
 int xc_core_arch_memory_map_get(int xc_handle,
                                 struct xc_core_arch_context *arch_ctxt,
-                                xc_dominfo_t *info, shared_info_t *live_shinfo,
+                                xc_dominfo_t *info, shared_info_any_t *live_shinfo,
                                 xc_core_memory_map_t **mapp,
                                 unsigned int *nr_entries);
-int xc_core_arch_map_p2m(int xc_handle, xc_dominfo_t *info,
-                         shared_info_t *live_shinfo, xen_pfn_t **live_p2m,
-                         unsigned long *pfnp);
+int xc_core_arch_map_p2m(int xc_handle, unsigned int guest_width,
+                         xc_dominfo_t *info, shared_info_any_t *live_shinfo,
+                         xen_pfn_t **live_p2m, unsigned long *pfnp);
 
 
 #if defined (__i386__) || defined (__x86_64__)
index 23e886ebbfd502bc0c6ff5419573aef359823e4b..b385d65f7dd7e6efabaa2504001d5efa034b2a30 100644 (file)
@@ -68,7 +68,7 @@ xc_core_arch_auto_translated_physmap(const xc_dominfo_t *info)
 /* see setup_guest() @ xc_linux_build.c */
 static int
 memory_map_get_old_domu(int xc_handle, xc_dominfo_t *info,
-                        shared_info_t *live_shinfo,
+                        shared_info_any_t *live_shinfo,
                         xc_core_memory_map_t **mapp, unsigned int *nr_entries)
 {
     xc_core_memory_map_t *map = NULL;
@@ -96,7 +96,7 @@ out:
 /* see setup_guest() @ xc_ia64_hvm_build.c */
 static int
 memory_map_get_old_hvm(int xc_handle, xc_dominfo_t *info, 
-                       shared_info_t *live_shinfo,
+                       shared_info_any_t *live_shinfo,
                        xc_core_memory_map_t **mapp, unsigned int *nr_entries)
 {
     const xc_core_memory_map_t gfw_map[] = {
@@ -155,7 +155,7 @@ out:
 
 static int
 memory_map_get_old(int xc_handle, xc_dominfo_t *info, 
-                   shared_info_t *live_shinfo,
+                   shared_info_any_t *live_shinfo,
                    xc_core_memory_map_t **mapp, unsigned int *nr_entries)
 {
     if ( info->hvm )
@@ -170,7 +170,8 @@ memory_map_get_old(int xc_handle, xc_dominfo_t *info,
 int
 xc_core_arch_memory_map_get(int xc_handle,
                             struct xc_core_arch_context *arch_ctxt,
-                            xc_dominfo_t *info, shared_info_t *live_shinfo,
+                            xc_dominfo_t *info,
+                            shared_info_any_t *live_shinfo,
                             xc_core_memory_map_t **mapp,
                             unsigned int *nr_entries)
 {
@@ -190,8 +191,8 @@ xc_core_arch_memory_map_get(int xc_handle,
     }
 
     /* copy before use in case someone updating them */
-    if (xc_ia64_copy_memmap(xc_handle, info->domid, live_shinfo, &memmap_info,
-                            NULL)) {
+    if (xc_ia64_copy_memmap(xc_handle, info->domid, &live_shinfo->s,
+                            &memmap_info, NULL)) {
         goto old;
     }
 
@@ -235,8 +236,8 @@ old:
 }
 
 int
-xc_core_arch_map_p2m(int xc_handle, xc_dominfo_t *info,
-                     shared_info_t *live_shinfo, xen_pfn_t **live_p2m,
+xc_core_arch_map_p2m(int xc_handle, unsigned int guest_width, xc_dominfo_t *info,
+                     shared_info_any_t *live_shinfo, xen_pfn_t **live_p2m,
                      unsigned long *pfnp)
 {
     /*
index d9eaa49b975816b7f4b2048c8e7ed876ec2450c8..765c745cb6d8c80f2d4b7796e158fa4d6bc6c1f8 100644 (file)
 
 #include "xg_private.h"
 #include "xc_core.h"
+#include "xc_e820.h"
+
+#define GET_FIELD(_p, _f) ((guest_width==8) ? ((_p)->x64._f) : ((_p)->x32._f))
+
+#ifndef MAX
+#define MAX(_a, _b) ((_a) >= (_b) ? (_a) : (_b))
+#endif
+
+int
+xc_core_arch_gpfn_may_present(struct xc_core_arch_context *arch_ctxt,
+                              unsigned long pfn)
+{
+    if ((pfn >= 0xa0 && pfn < 0xc0) /* VGA hole */
+        || (pfn >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
+            && pfn < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */
+        return 0;
+    return 1;
+}
 
-/* Don't yet support cross-address-size core dump */
-#define guest_width (sizeof (unsigned long))
 
 static int nr_gpfns(int xc_handle, domid_t domid)
 {
@@ -37,7 +53,7 @@ xc_core_arch_auto_translated_physmap(const xc_dominfo_t *info)
 
 int
 xc_core_arch_memory_map_get(int xc_handle, struct xc_core_arch_context *unused,
-                            xc_dominfo_t *info, shared_info_t *live_shinfo,
+                            xc_dominfo_t *info, shared_info_any_t *live_shinfo,
                             xc_core_memory_map_t **mapp,
                             unsigned int *nr_entries)
 {
@@ -60,17 +76,22 @@ xc_core_arch_memory_map_get(int xc_handle, struct xc_core_arch_context *unused,
 }
 
 int
-xc_core_arch_map_p2m(int xc_handle, xc_dominfo_t *info,
-                     shared_info_t *live_shinfo, xen_pfn_t **live_p2m,
+xc_core_arch_map_p2m(int xc_handle, unsigned int guest_width, xc_dominfo_t *info,
+                     shared_info_any_t *live_shinfo, xen_pfn_t **live_p2m,
                      unsigned long *pfnp)
 {
     /* Double and single indirect references to the live P2M table */
     xen_pfn_t *live_p2m_frame_list_list = NULL;
     xen_pfn_t *live_p2m_frame_list = NULL;
+    /* Copies of the above. */
+    xen_pfn_t *p2m_frame_list_list = NULL;
+    xen_pfn_t *p2m_frame_list = NULL;
+
     uint32_t dom = info->domid;
     unsigned long p2m_size = nr_gpfns(xc_handle, info->domid);
     int ret = -1;
     int err;
+    int i;
 
     if ( p2m_size < info->nr_pages  )
     {
@@ -80,7 +101,7 @@ xc_core_arch_map_p2m(int xc_handle, xc_dominfo_t *info,
 
     live_p2m_frame_list_list =
         xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
-                             live_shinfo->arch.pfn_to_mfn_frame_list_list);
+                             GET_FIELD(live_shinfo, arch.pfn_to_mfn_frame_list_list));
 
     if ( !live_p2m_frame_list_list )
     {
@@ -88,9 +109,28 @@ xc_core_arch_map_p2m(int xc_handle, xc_dominfo_t *info,
         goto out;
     }
 
+    /* Get a local copy of the live_P2M_frame_list_list */
+    if ( !(p2m_frame_list_list = malloc(PAGE_SIZE)) )
+    {
+        ERROR("Couldn't allocate p2m_frame_list_list array");
+        goto out;
+    }
+    memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE);
+
+    /* Canonicalize guest's unsigned long vs ours */
+    if ( guest_width > sizeof(unsigned long) )
+        for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ )
+            if ( i < PAGE_SIZE/guest_width )
+                p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i];
+            else
+                p2m_frame_list_list[i] = 0;
+    else if ( guest_width < sizeof(unsigned long) )
+        for ( i = PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- )
+            p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i];
+
     live_p2m_frame_list =
         xc_map_foreign_pages(xc_handle, dom, PROT_READ,
-                             live_p2m_frame_list_list,
+                             p2m_frame_list_list,
                              P2M_FLL_ENTRIES);
 
     if ( !live_p2m_frame_list )
@@ -99,8 +139,25 @@ xc_core_arch_map_p2m(int xc_handle, xc_dominfo_t *info,
         goto out;
     }
 
+    /* Get a local copy of the live_P2M_frame_list */
+    if ( !(p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) )
+    {
+        ERROR("Couldn't allocate p2m_frame_list array");
+        goto out;
+    }
+    memset(p2m_frame_list, 0, P2M_TOOLS_FL_SIZE);
+    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_GUEST_FL_SIZE);
+
+    /* Canonicalize guest's unsigned long vs ours */
+    if ( guest_width > sizeof(unsigned long) )
+        for ( i = 0; i < P2M_FL_ENTRIES; i++ )
+            p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i];
+    else if ( guest_width < sizeof(unsigned long) )
+        for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- )
+            p2m_frame_list[i] = ((uint32_t *)p2m_frame_list)[i];
+
     *live_p2m = xc_map_foreign_pages(xc_handle, dom, PROT_READ,
-                                    live_p2m_frame_list,
+                                    p2m_frame_list,
                                     P2M_FL_ENTRIES);
 
     if ( !*live_p2m )
@@ -122,6 +179,12 @@ out:
     if ( live_p2m_frame_list )
         munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
 
+    if ( p2m_frame_list_list )
+        free(p2m_frame_list_list);
+
+    if ( p2m_frame_list )
+        free(p2m_frame_list);
+
     errno = err;
     return ret;
 }
index 6e3490bb275cfc12c8e50dd031f713e2a5da304d..739d90ef290ded7b35fd587230091cf49a7cb812 100644 (file)
 #ifndef XC_CORE_X86_H
 #define XC_CORE_X86_H
 
-#if defined(__i386__) || defined(__x86_64__)
 #define ELF_ARCH_DATA           ELFDATA2LSB
-#if defined (__i386__)
-# define ELF_ARCH_MACHINE       EM_386
-#else
-# define ELF_ARCH_MACHINE       EM_X86_64
-#endif
-#endif /* __i386__ or __x86_64__ */
-
+#define ELF_ARCH_MACHINE       (guest_width == 8 ? EM_X86_64 : EM_386)
 
 struct xc_core_arch_context {
     /* nothing */
@@ -40,8 +33,10 @@ struct xc_core_arch_context {
 #define xc_core_arch_context_get(arch_ctxt, ctxt, xc_handle, domid) \
                                                                 (0)
 #define xc_core_arch_context_dump(arch_ctxt, args, dump_rtn)    (0)
-#define xc_core_arch_gpfn_may_present(arch_ctxt, i)             (1)
 
+int
+xc_core_arch_gpfn_may_present(struct xc_core_arch_context *arch_ctxt,
+                              unsigned long pfn);
 static inline int
 xc_core_arch_context_get_shdr(struct xc_core_arch_context *arch_ctxt, 
                               struct xc_core_section_headers *sheaders,
index 6cd442cfe697c43a3dd9abec5d51a0eb12384cc9..b5ab901c6846fcba508492d4c87cee497a868074 100644 (file)
@@ -83,6 +83,8 @@
 #define X86_FEATURE_SSE4_1     (4*32+19) /* Streaming SIMD Extensions 4.1 */
 #define X86_FEATURE_SSE4_2     (4*32+20) /* Streaming SIMD Extensions 4.2 */
 #define X86_FEATURE_POPCNT     (4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_XSAVE      (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running under some hypervisor */
 
 /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
 #define X86_FEATURE_XSTORE     (5*32+ 2) /* on-CPU RNG present (xstore insn) */
index 6a8e7594c80e6b8463909b2a6c4cc3419ef140e1..f9ef0de10b3574e06d1258f22378b61a1c22d887 100644 (file)
@@ -194,6 +194,8 @@ static void xc_cpuid_hvm_policy(
                     bitmaskof(X86_FEATURE_SSE4_2) |
                     bitmaskof(X86_FEATURE_POPCNT));
 
+        regs[2] |= bitmaskof(X86_FEATURE_HYPERVISOR);
+
         regs[3] &= (bitmaskof(X86_FEATURE_FPU) |
                     bitmaskof(X86_FEATURE_VME) |
                     bitmaskof(X86_FEATURE_DE) |
@@ -309,6 +311,8 @@ static void xc_cpuid_pv_policy(
         clear_bit(X86_FEATURE_XTPR, regs[2]);
         clear_bit(X86_FEATURE_PDCM, regs[2]);
         clear_bit(X86_FEATURE_DCA, regs[2]);
+        clear_bit(X86_FEATURE_XSAVE, regs[2]);
+        set_bit(X86_FEATURE_HYPERVISOR, regs[2]);
         break;
     case 0x80000001:
         if ( !guest_64bit )
index 770e00bba283ae561b71c2f9b39c7e2095c2da11..ca03ba5db44ef094fe4ded5f398afab95f56970e 100644 (file)
@@ -1,4 +1,4 @@
-#include <xen/libelf.h>
+#include <xen/libelf/libelf.h>
 
 #define INVALID_P2M_ENTRY   ((xen_pfn_t)-1)
 
index 2b6f6b68682001229888866e0679cda756b18fb5..b1e90d8d6e9e458e7837aee6be9cf2663d68c600 100644 (file)
@@ -244,6 +244,7 @@ int xc_dom_do_gunzip(void *src, size_t srclen, void *dst, size_t dstlen)
         return -1;
     }
     rc = inflate(&zStream, Z_FINISH);
+    inflateEnd(&zStream);
     if ( rc != Z_STREAM_END )
     {
         xc_dom_panic(XC_INTERNAL_ERROR,
index f96ec3f7ab1682973770b1379ad65757ee90fc35..6ae9487656d9d9732f756ac0a6dd683ddee86b0c 100644 (file)
@@ -418,7 +418,8 @@ static int start_info_x86_32(struct xc_dom_image *dom)
     xc_dom_printf("%s: called\n", __FUNCTION__);
 
     memset(start_info, 0, sizeof(*start_info));
-    snprintf(start_info->magic, sizeof(start_info->magic), dom->guest_type);
+    strncpy(start_info->magic, dom->guest_type, sizeof(start_info->magic));
+    start_info->magic[sizeof(start_info->magic) - 1] = '\0';
     start_info->nr_pages = dom->total_pages;
     start_info->shared_info = shinfo << PAGE_SHIFT_X86;
     start_info->pt_base = dom->pgtables_seg.vstart;
@@ -457,7 +458,8 @@ static int start_info_x86_64(struct xc_dom_image *dom)
     xc_dom_printf("%s: called\n", __FUNCTION__);
 
     memset(start_info, 0, sizeof(*start_info));
-    snprintf(start_info->magic, sizeof(start_info->magic), dom->guest_type);
+    strncpy(start_info->magic, dom->guest_type, sizeof(start_info->magic));
+    start_info->magic[sizeof(start_info->magic) - 1] = '\0';
     start_info->nr_pages = dom->total_pages;
     start_info->shared_info = shinfo << PAGE_SHIFT_X86;
     start_info->pt_base = dom->pgtables_seg.vstart;
@@ -692,7 +694,7 @@ static int x86_shadow(int xc, domid_t domid)
 int arch_setup_meminit(struct xc_dom_image *dom)
 {
     int rc;
-    xen_pfn_t pfn;
+    xen_pfn_t pfn, allocsz, i;
 
     rc = x86_compat(dom->guest_xc, dom->guest_domid, dom->guest_type);
     if ( rc )
@@ -711,9 +713,15 @@ int arch_setup_meminit(struct xc_dom_image *dom)
         dom->p2m_host[pfn] = pfn;
 
     /* allocate guest memory */
-    rc = xc_domain_memory_populate_physmap(dom->guest_xc, dom->guest_domid,
-                                           dom->total_pages, 0, 0,
-                                           dom->p2m_host);
+    for ( i = rc = allocsz = 0; (i < dom->total_pages) && !rc; i += allocsz )
+    {
+        allocsz = dom->total_pages - i;
+        if ( allocsz > 1024*1024 )
+            allocsz = 1024*1024;
+        rc = xc_domain_memory_populate_physmap(
+            dom->guest_xc, dom->guest_domid, allocsz, 0, 0, &dom->p2m_host[i]);
+    }
+
     return rc;
 }
 
index e1bf86ee6a443d3fe4cbdc0f66fdc4b72ec5374f..6ed4f52ec35fd12c37c3057cbeeb50381b2fe758 100644 (file)
@@ -271,6 +271,38 @@ int xc_domain_hvm_getcontext(int xc_handle,
     return (ret < 0 ? -1 : domctl.u.hvmcontext.size);
 }
 
+/* Get just one element of the HVM guest context.
+ * size must be >= HVM_SAVE_LENGTH(type) */
+int xc_domain_hvm_getcontext_partial(int xc_handle,
+                                     uint32_t domid,
+                                     uint16_t typecode,
+                                     uint16_t instance,
+                                     void *ctxt_buf,
+                                     uint32_t size)
+{
+    int ret;
+    DECLARE_DOMCTL;
+
+    if ( !ctxt_buf ) 
+        return -EINVAL;
+
+    domctl.cmd = XEN_DOMCTL_gethvmcontext_partial;
+    domctl.domain = (domid_t) domid;
+    domctl.u.hvmcontext_partial.type = typecode;
+    domctl.u.hvmcontext_partial.instance = instance;
+    set_xen_guest_handle(domctl.u.hvmcontext_partial.buffer, ctxt_buf);
+
+    if ( (ret = lock_pages(ctxt_buf, size)) != 0 )
+        return ret;
+    
+    ret = do_domctl(xc_handle, &domctl);
+
+    if ( ctxt_buf ) 
+        unlock_pages(ctxt_buf, size);
+
+    return ret ? -1 : 0;
+}
+
 /* set info to hvm guest for restore */
 int xc_domain_hvm_setcontext(int xc_handle,
                              uint32_t domid,
@@ -537,32 +569,75 @@ int xc_domain_memory_populate_physmap(int xc_handle,
     return err;
 }
 
-int xc_domain_memory_translate_gpfn_list(int xc_handle,
-                                         uint32_t domid,
-                                         unsigned long nr_gpfns,
-                                         xen_pfn_t *gpfn_list,
-                                         xen_pfn_t *mfn_list)
+static int xc_domain_memory_pod_target(int xc_handle,
+                                       int op,
+                                       uint32_t domid,
+                                       uint64_t target_pages,
+                                       uint64_t *tot_pages,
+                                       uint64_t *pod_cache_pages,
+                                       uint64_t *pod_entries)
 {
     int err;
-    struct xen_translate_gpfn_list translate_gpfn_list = {
-        .domid    = domid,
-        .nr_gpfns = nr_gpfns,
+
+    struct xen_pod_target pod_target = {
+        .domid = domid,
+        .target_pages = target_pages
     };
-    set_xen_guest_handle(translate_gpfn_list.gpfn_list, gpfn_list);
-    set_xen_guest_handle(translate_gpfn_list.mfn_list, mfn_list);
 
-    err = xc_memory_op(xc_handle, XENMEM_translate_gpfn_list, &translate_gpfn_list);
+    err = xc_memory_op(xc_handle, op, &pod_target);
 
-    if ( err != 0 )
+    if ( err < 0 )
     {
-        DPRINTF("Failed translation for dom %d (%ld PFNs)\n",
-                domid, nr_gpfns);
+        DPRINTF("Failed %s_memory_target dom %d\n",
+                (op==XENMEM_set_pod_target)?"set":"get",
+                domid);
         errno = -err;
         err = -1;
     }
+    else
+        err = 0;
+
+    if ( tot_pages )
+        *tot_pages = pod_target.tot_pages;
+    if ( pod_cache_pages )
+        *pod_cache_pages = pod_target.pod_cache_pages;
+    if ( pod_entries )
+        *pod_entries = pod_target.pod_entries;
 
     return err;
 }
+                                       
+
+int xc_domain_memory_set_pod_target(int xc_handle,
+                                    uint32_t domid,
+                                    uint64_t target_pages,
+                                    uint64_t *tot_pages,
+                                    uint64_t *pod_cache_pages,
+                                    uint64_t *pod_entries)
+{
+    return xc_domain_memory_pod_target(xc_handle,
+                                       XENMEM_set_pod_target,
+                                       domid,
+                                       target_pages,
+                                       tot_pages,
+                                       pod_cache_pages,
+                                       pod_entries);
+}
+
+int xc_domain_memory_get_pod_target(int xc_handle,
+                                    uint32_t domid,
+                                    uint64_t *tot_pages,
+                                    uint64_t *pod_cache_pages,
+                                    uint64_t *pod_entries)
+{
+    return xc_domain_memory_pod_target(xc_handle,
+                                       XENMEM_get_pod_target,
+                                       domid,
+                                       -1,
+                                       tot_pages,
+                                       pod_cache_pages,
+                                       pod_entries);
+}
 
 int xc_domain_max_vcpus(int xc_handle, uint32_t domid, unsigned int max)
 {
@@ -845,7 +920,8 @@ int xc_domain_update_msi_irq(
     uint32_t domid,
     uint32_t gvec,
     uint32_t pirq,
-    uint32_t gflags)
+    uint32_t gflags,
+    uint64_t gtable)
 {
     int rc;
     xen_domctl_bind_pt_irq_t *bind;
@@ -855,6 +931,33 @@ int xc_domain_update_msi_irq(
     domctl.cmd = XEN_DOMCTL_bind_pt_irq;
     domctl.domain = (domid_t)domid;
 
+    bind = &(domctl.u.bind_pt_irq);
+    bind->hvm_domid = domid;
+    bind->irq_type = PT_IRQ_TYPE_MSI;
+    bind->machine_irq = pirq;
+    bind->u.msi.gvec = gvec;
+    bind->u.msi.gflags = gflags;
+    bind->u.msi.gtable = gtable;
+
+    rc = do_domctl(xc_handle, &domctl);
+    return rc;
+}
+
+int xc_domain_unbind_msi_irq(
+    int xc_handle,
+    uint32_t domid,
+    uint32_t gvec,
+    uint32_t pirq,
+    uint32_t gflags)
+{
+    int rc;
+    xen_domctl_bind_pt_irq_t *bind;
+
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_unbind_pt_irq;
+    domctl.domain = (domid_t)domid;
+
     bind = &(domctl.u.bind_pt_irq);
     bind->hvm_domid = domid;
     bind->irq_type = PT_IRQ_TYPE_MSI;
@@ -888,7 +991,8 @@ int xc_domain_bind_pt_irq(
     bind->hvm_domid = domid;
     bind->irq_type = irq_type;
     bind->machine_irq = machine_irq;
-    if ( irq_type == PT_IRQ_TYPE_PCI )
+    if ( irq_type == PT_IRQ_TYPE_PCI ||
+         irq_type == PT_IRQ_TYPE_MSI_TRANSLATE )
     {
         bind->u.pci.bus = bus;
         bind->u.pci.device = device;    
@@ -1061,6 +1165,20 @@ int xc_domain_suppress_spurious_page_faults(int xc, uint32_t domid)
 
 }
 
+int xc_domain_debug_control(int xc, uint32_t domid, uint32_t sop, uint32_t vcpu)
+{
+    DECLARE_DOMCTL;
+
+    memset(&domctl, 0, sizeof(domctl));
+    domctl.domain = (domid_t)domid;
+    domctl.cmd = XEN_DOMCTL_debug_op;
+    domctl.u.debug_op.op     = sop;
+    domctl.u.debug_op.vcpu   = vcpu;
+
+    return do_domctl(xc, &domctl);
+}
+
+
 /*
  * Local variables:
  * mode: C
index 19167284951172408d7d1affbf8c7921776a712c..4f92ebb5491e46cf4e19cb6901e81bf5db93a1d1 100644 (file)
@@ -490,6 +490,22 @@ int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
             continue;
         }
 
+        if ( j == -4 )
+        {
+            uint64_t vm86_tss;
+
+            /* Skip padding 4 bytes then read the vm86 TSS location. */
+            if ( read_exact(io_fd, &vm86_tss, sizeof(uint32_t)) ||
+                 read_exact(io_fd, &vm86_tss, sizeof(uint64_t)) )
+            {
+                ERROR("error read the address of the vm86 TSS");
+                goto out;
+            }
+
+            xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, vm86_tss);
+            continue;
+        }
+
         if ( j == 0 )
             break;  /* our work here is done */
 
index a91041498841312a5e7ccc2d0c9b8af6b5fd14b5..04d4b3085c3a49fbb2c381392654fa3479857f8f 100644 (file)
@@ -744,8 +744,6 @@ static xen_pfn_t *map_and_save_p2m_table(int xc_handle,
     return success ? p2m : NULL;
 }
 
-
-
 int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                    uint32_t max_factor, uint32_t flags, int (*suspend)(void),
                    int hvm, void *(*init_qemu_maps)(int, unsigned), 
@@ -1388,20 +1386,32 @@ int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
     if ( hvm )
     {
         struct {
-            int minusthree;
+            int id;
             uint32_t pad;
-            uint64_t ident_pt;
-        } chunk = { -3, 0 };
+            uint64_t data;
+        } chunk = { 0, };
 
+        chunk.id = -3;
         xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
-                         (unsigned long *)&chunk.ident_pt);
+                         (unsigned long *)&chunk.data);
 
-        if ( (chunk.ident_pt != 0) &&
+        if ( (chunk.data != 0) &&
              write_exact(io_fd, &chunk, sizeof(chunk)) )
         {
             PERROR("Error when writing the ident_pt for EPT guest");
             goto out;
         }
+
+        chunk.id = -4;
+        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
+                         (unsigned long *)&chunk.data);
+
+        if ( (chunk.data != 0) &&
+             write_exact(io_fd, &chunk, sizeof(chunk)) )
+        {
+            PERROR("Error when writing the vm86 TSS for guest");
+            goto out;
+        }
     }
 
     /* Zero terminate */
index 089df2b070d8af5a61fd0f7443a3871622ee50d5..5f7119bd049aa899ccf35b221dc4c3c2ce335fea 100644 (file)
@@ -1 +1 @@
-#include <xen/elfstructs.h>
+#include <xen/libelf/elfstructs.h>
index 752c4e76dcd2f3f4bdb209d73ce4a9af60dd49a4..e20f5ac862866ef8e86b5d87fac11b65bfa4d6f4 100644 (file)
 #include <xen/foreign/x86_64.h>
 #include <xen/hvm/hvm_info_table.h>
 #include <xen/hvm/params.h>
-#include "xc_e820.h"
+#include <xen/hvm/e820.h>
 
-#include <xen/libelf.h>
+#include <xen/libelf/libelf.h>
 
 #define SUPERPAGE_PFN_SHIFT  9
 #define SUPERPAGE_NR_PFNS    (1UL << SUPERPAGE_PFN_SHIFT)
 
-#define SCRATCH_PFN 0xFFFFF
-
-#define SPECIALPAGE_GUARD    0
-#define SPECIALPAGE_BUFIOREQ 1
-#define SPECIALPAGE_XENSTORE 2
-#define SPECIALPAGE_IOREQ    3
-#define SPECIALPAGE_IDENT_PT 4
+#define SPECIALPAGE_BUFIOREQ 0
+#define SPECIALPAGE_XENSTORE 1
+#define SPECIALPAGE_IOREQ    2
+#define SPECIALPAGE_IDENT_PT 3
+#define SPECIALPAGE_SHINFO   4
 #define NR_SPECIAL_PAGES     5
+#define special_pfn(x) (0xff000u - NR_SPECIAL_PAGES + (x))
 
-static void build_e820map(void *e820_page, unsigned long long mem_size)
+static void build_hvm_info(void *hvm_info_page, uint64_t mem_size)
 {
-    struct e820entry *e820entry =
-        (struct e820entry *)(((unsigned char *)e820_page) + HVM_E820_OFFSET);
-    unsigned long long extra_mem_size = 0;
-    unsigned char nr_map = 0;
+    struct hvm_info_table *hvm_info = (struct hvm_info_table *)
+        (((unsigned char *)hvm_info_page) + HVM_INFO_OFFSET);
+    uint64_t lowmem_end = mem_size, highmem_end = 0;
+    uint8_t sum;
+    int i;
 
-    /*
-     * Physical address space from HVM_BELOW_4G_RAM_END to 4G is reserved
-     * for PCI devices MMIO. So if HVM has more than HVM_BELOW_4G_RAM_END
-     * RAM, memory beyond HVM_BELOW_4G_RAM_END will go to 4G above.
-     */
-    if ( mem_size > HVM_BELOW_4G_RAM_END )
+    if ( lowmem_end > HVM_BELOW_4G_RAM_END )
     {
-        extra_mem_size = mem_size - HVM_BELOW_4G_RAM_END;
-        mem_size = HVM_BELOW_4G_RAM_END;
+        highmem_end = lowmem_end + (1ull<<32) - HVM_BELOW_4G_RAM_END;
+        lowmem_end = HVM_BELOW_4G_RAM_END;
     }
 
-    /* 0x0-0x9FC00: Ordinary RAM. */
-    e820entry[nr_map].addr = 0x0;
-    e820entry[nr_map].size = 0x9FC00;
-    e820entry[nr_map].type = E820_RAM;
-    nr_map++;
+    memset(hvm_info_page, 0, PAGE_SIZE);
 
-    /* 0x9FC00-0xA0000: Extended BIOS Data Area (EBDA). */
-    e820entry[nr_map].addr = 0x9FC00;
-    e820entry[nr_map].size = 0x400;
-    e820entry[nr_map].type = E820_RESERVED;
-    nr_map++;
+    /* Fill in the header. */
+    strncpy(hvm_info->signature, "HVM INFO", 8);
+    hvm_info->length = sizeof(struct hvm_info_table);
 
-    /*
-     * Following regions are standard regions of the PC memory map.
-     * They are not covered by e820 regions. OSes will not use as RAM.
-     * 0xA0000-0xC0000: VGA memory-mapped I/O. Not covered by E820.
-     * 0xC0000-0xE0000: 16-bit devices, expansion ROMs (inc. vgabios).
-     * TODO: hvmloader should free pages which turn out to be unused.
-     */
+    /* Sensible defaults: these can be overridden by the caller. */
+    hvm_info->acpi_enabled = 1;
+    hvm_info->apic_mode = 1;
+    hvm_info->nr_vcpus = 1;
 
-    /*
-     * 0xE0000-0x0F0000: PC-specific area. We place ACPI tables here.
-     *                   We *cannot* mark as E820_ACPI, for two reasons:
-     *                    1. ACPI spec. says that E820_ACPI regions below
-     *                       16MB must clip INT15h 0x88 and 0xe801 queries.
-     *                       Our rombios doesn't do this.
-     *                    2. The OS is allowed to reclaim ACPI memory after
-     *                       parsing the tables. But our FACS is in this
-     *                       region and it must not be reclaimed (it contains
-     *                       the ACPI global lock!).
-     * 0xF0000-0x100000: System BIOS.
-     * TODO: hvmloader should free pages which turn out to be unused.
-     */
-    e820entry[nr_map].addr = 0xE0000;
-    e820entry[nr_map].size = 0x20000;
-    e820entry[nr_map].type = E820_RESERVED;
-    nr_map++;
-
-    /* Low RAM goes here. Reserve space for special pages. */
-    e820entry[nr_map].addr = 0x100000;
-    e820entry[nr_map].size = (mem_size - 0x100000 -
-                              PAGE_SIZE * NR_SPECIAL_PAGES);
-    e820entry[nr_map].type = E820_RAM;
-    nr_map++;
-
-    /* Explicitly reserve space for special pages (excluding guard page). */
-    e820entry[nr_map].addr = mem_size - PAGE_SIZE * (NR_SPECIAL_PAGES - 1);
-    e820entry[nr_map].size = PAGE_SIZE * (NR_SPECIAL_PAGES - 1);
-    e820entry[nr_map].type = E820_RESERVED;
-    nr_map++;
-
-    if ( extra_mem_size )
-    {
-        e820entry[nr_map].addr = (1ULL << 32);
-        e820entry[nr_map].size = extra_mem_size;
-        e820entry[nr_map].type = E820_RAM;
-        nr_map++;
-    }
+    /* Memory parameters. */
+    hvm_info->low_mem_pgend = lowmem_end >> PAGE_SHIFT;
+    hvm_info->high_mem_pgend = highmem_end >> PAGE_SHIFT;
+    hvm_info->reserved_mem_pgstart = special_pfn(0);
 
-    *(((unsigned char *)e820_page) + HVM_E820_NR_OFFSET) = nr_map;
+    /* Finish with the checksum. */
+    for ( i = 0, sum = 0; i < hvm_info->length; i++ )
+        sum += ((uint8_t *)hvm_info)[i];
+    hvm_info->checksum = -sum;
 }
 
 static int loadelfimage(
@@ -146,25 +101,32 @@ static int loadelfimage(
 }
 
 static int setup_guest(int xc_handle,
-                       uint32_t dom, int memsize,
+                       uint32_t dom, int memsize, int target,
                        char *image, unsigned long image_size)
 {
     xen_pfn_t *page_array = NULL;
     unsigned long i, nr_pages = (unsigned long)memsize << (20 - PAGE_SHIFT);
-    unsigned long special_page_nr, entry_eip, cur_pages;
+    unsigned long target_pages = (unsigned long)target << (20 - PAGE_SHIFT);
+    unsigned long pod_pages = 0;
+    unsigned long entry_eip, cur_pages;
     struct xen_add_to_physmap xatp;
     struct shared_info *shared_info;
-    void *e820_page;
+    void *hvm_info_page;
     uint32_t *ident_pt;
     struct elf_binary elf;
     uint64_t v_start, v_end;
     int rc;
     xen_capabilities_info_t caps;
+    int pod_mode = 0;
+    
 
     /* An HVM guest must be initialised with at least 2MB memory. */
-    if ( memsize < 2 )
+    if ( memsize < 2 || target < 2 )
         goto error_out;
 
+    if ( memsize > target )
+        pod_mode = 1;
+
     if ( elf_init(&elf, image, image_size) != 0 )
         goto error_out;
     elf_parse_binary(&elf);
@@ -235,6 +197,10 @@ static int setup_guest(int xc_handle,
                 .extent_order = SUPERPAGE_PFN_SHIFT,
                 .domid        = dom
             };
+
+            if ( pod_mode )
+                sp_req.mem_flags = XENMEMF_populate_on_demand;
+
             set_xen_guest_handle(sp_req.extent_start, sp_extents);
             for ( i = 0; i < sp_req.nr_extents; i++ )
                 sp_extents[i] = page_array[cur_pages+(i<<SUPERPAGE_PFN_SHIFT)];
@@ -242,6 +208,11 @@ static int setup_guest(int xc_handle,
             if ( done > 0 )
             {
                 done <<= SUPERPAGE_PFN_SHIFT;
+                if ( pod_mode && target_pages > cur_pages )
+                {
+                    int d = target_pages - cur_pages;
+                    pod_pages += ( done < d ) ? done : d;
+                }
                 cur_pages += done;
                 count -= done;
             }
@@ -253,9 +224,17 @@ static int setup_guest(int xc_handle,
             rc = xc_domain_memory_populate_physmap(
                 xc_handle, dom, count, 0, 0, &page_array[cur_pages]);
             cur_pages += count;
+            if ( pod_mode )
+                pod_pages -= count;
         }
     }
 
+    if ( pod_mode )
+        rc = xc_domain_memory_set_pod_target(xc_handle,
+                                             dom,
+                                             pod_pages,
+                                             NULL, NULL, NULL);
+
     if ( rc != 0 )
     {
         PERROR("Could not allocate memory for HVM guest.\n");
@@ -265,23 +244,22 @@ static int setup_guest(int xc_handle,
     if ( loadelfimage(&elf, xc_handle, dom, page_array) != 0 )
         goto error_out;
 
-    if ( (e820_page = xc_map_foreign_range(
+    if ( (hvm_info_page = xc_map_foreign_range(
               xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
-              HVM_E820_PAGE >> PAGE_SHIFT)) == NULL )
+              HVM_INFO_PFN)) == NULL )
         goto error_out;
-    memset(e820_page, 0, PAGE_SIZE);
-    build_e820map(e820_page, v_end);
-    munmap(e820_page, PAGE_SIZE);
+    build_hvm_info(hvm_info_page, v_end);
+    munmap(hvm_info_page, PAGE_SIZE);
 
     /* Map and initialise shared_info page. */
     xatp.domid = dom;
     xatp.space = XENMAPSPACE_shared_info;
     xatp.idx   = 0;
-    xatp.gpfn  = SCRATCH_PFN;
+    xatp.gpfn  = special_pfn(SPECIALPAGE_SHINFO);
     if ( (xc_memory_op(xc_handle, XENMEM_add_to_physmap, &xatp) != 0) ||
          ((shared_info = xc_map_foreign_range(
              xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
-             SCRATCH_PFN)) == NULL) )
+             special_pfn(SPECIALPAGE_SHINFO))) == NULL) )
         goto error_out;
     memset(shared_info, 0, PAGE_SIZE);
     /* NB. evtchn_upcall_mask is unused: leave as zero. */
@@ -289,31 +267,28 @@ static int setup_guest(int xc_handle,
            sizeof(shared_info->evtchn_mask));
     munmap(shared_info, PAGE_SIZE);
 
-    special_page_nr = (((v_end > HVM_BELOW_4G_RAM_END)
-                        ? (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT)
-                        : (v_end >> PAGE_SHIFT))
-                       - NR_SPECIAL_PAGES);
-
-    /* Paranoia: clean special pages. */
+    /* Allocate and clear special pages. */
     for ( i = 0; i < NR_SPECIAL_PAGES; i++ )
-        if ( xc_clear_domain_page(xc_handle, dom, special_page_nr + i) )
-            goto error_out;
-
-    /* Free the guard page that separates low RAM from special pages. */
-    rc = xc_domain_memory_decrease_reservation(
-        xc_handle, dom, 1, 0, &page_array[special_page_nr]);
-    if ( rc != 0 )
     {
-        PERROR("Could not deallocate guard page for HVM guest.\n");
-        goto error_out;
+        xen_pfn_t pfn = special_pfn(i);
+        if ( i == SPECIALPAGE_SHINFO )
+            continue;
+        rc = xc_domain_memory_populate_physmap(xc_handle, dom, 1, 0, 0, &pfn);
+        if ( rc != 0 )
+        {
+            PERROR("Could not allocate %d'th special page.\n", i);
+            goto error_out;
+        }
+        if ( xc_clear_domain_page(xc_handle, dom, special_pfn(i)) )
+            goto error_out;
     }
 
     xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
-                     special_page_nr + SPECIALPAGE_XENSTORE);
+                     special_pfn(SPECIALPAGE_XENSTORE));
     xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
-                     special_page_nr + SPECIALPAGE_BUFIOREQ);
+                     special_pfn(SPECIALPAGE_BUFIOREQ));
     xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
-                     special_page_nr + SPECIALPAGE_IOREQ);
+                     special_pfn(SPECIALPAGE_IOREQ));
 
     /*
      * Identity-map page table is required for running with CR0.PG=0 when
@@ -321,14 +296,14 @@ static int setup_guest(int xc_handle,
      */
     if ( (ident_pt = xc_map_foreign_range(
               xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
-              special_page_nr + SPECIALPAGE_IDENT_PT)) == NULL )
+              special_pfn(SPECIALPAGE_IDENT_PT))) == NULL )
         goto error_out;
     for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
         ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
                        _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
     munmap(ident_pt, PAGE_SIZE);
     xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
-                     (special_page_nr + SPECIALPAGE_IDENT_PT) << PAGE_SHIFT);
+                     special_pfn(SPECIALPAGE_IDENT_PT) << PAGE_SHIFT);
 
     /* Insert JMP <rel32> instruction at address 0x0 to reach entry point. */
     entry_eip = elf_uval(&elf, elf.ehdr, e_entry);
@@ -354,6 +329,7 @@ static int setup_guest(int xc_handle,
 static int xc_hvm_build_internal(int xc_handle,
                                  uint32_t domid,
                                  int memsize,
+                                 int target,
                                  char *image,
                                  unsigned long image_size)
 {
@@ -363,7 +339,7 @@ static int xc_hvm_build_internal(int xc_handle,
         return -1;
     }
 
-    return setup_guest(xc_handle, domid, memsize, image, image_size);
+    return setup_guest(xc_handle, domid, memsize, target, image, image_size);
 }
 
 static inline int is_loadable_phdr(Elf32_Phdr *phdr)
@@ -388,7 +364,34 @@ int xc_hvm_build(int xc_handle,
          ((image = xc_read_image(image_name, &image_size)) == NULL) )
         return -1;
 
-    sts = xc_hvm_build_internal(xc_handle, domid, memsize, image, image_size);
+    sts = xc_hvm_build_internal(xc_handle, domid, memsize, memsize, image, image_size);
+
+    free(image);
+
+    return sts;
+}
+
+/* xc_hvm_build_target_mem: 
+ * Create a domain for a pre-ballooned virtualized Linux, using
+ * files/filenames.  If target < memsize, domain is created with
+ * memsize pages marked populate-on-demand, and with a PoD cache size
+ * of target.  If target == memsize, pages are populated normally.
+ */
+int xc_hvm_build_target_mem(int xc_handle,
+                           uint32_t domid,
+                           int memsize,
+                           int target,
+                           const char *image_name)
+{
+    char *image;
+    int  sts;
+    unsigned long image_size;
+
+    if ( (image_name == NULL) ||
+         ((image = xc_read_image(image_name, &image_size)) == NULL) )
+        return -1;
+
+    sts = xc_hvm_build_internal(xc_handle, domid, memsize, target, image, image_size);
 
     free(image);
 
@@ -423,7 +426,7 @@ int xc_hvm_build_mem(int xc_handle,
         return -1;
     }
 
-    sts = xc_hvm_build_internal(xc_handle, domid, memsize,
+    sts = xc_hvm_build_internal(xc_handle, domid, memsize, memsize,
                                 img, img_len);
 
     /* xc_inflate_buffer may return the original buffer pointer (for
index ac8035bf41a867cdac0a246a2cad5b945b1e65ad..00ee0f8ea4299a897ad77ce1ec42b1cecc3bb07d 100644 (file)
  * Function to translate virtual to physical addresses.
  */
 #include "xc_private.h"
+#include <xen/hvm/save.h>
 
-#if defined(__i386__)
+#define CR0_PG  0x80000000
+#define CR4_PAE 0x20
+#define PTE_PSE 0x80
+#define EFER_LMA 0x400
 
-#define L1_PAGETABLE_SHIFT_PAE 12
-#define L2_PAGETABLE_SHIFT_PAE 21
-#define L3_PAGETABLE_SHIFT_PAE 30
-
-#define L1_PAGETABLE_SHIFT             12
-#define L2_PAGETABLE_SHIFT             22
-
-#define L0_PAGETABLE_MASK_PAE  0x00000ffffffff000ULL
-#define L1_PAGETABLE_MASK_PAE  0x1ffULL
-#define L2_PAGETABLE_MASK_PAE  0x1ffULL
-#define L3_PAGETABLE_MASK_PAE  0x3ULL
-
-#define L0_PAGETABLE_MASK              0xfffff000ULL
-#define L1_PAGETABLE_MASK              0x3ffULL
-#define L2_PAGETABLE_MASK              0x3ffULL
-
-#elif defined(__x86_64__)
-
-#define L1_PAGETABLE_SHIFT_PAE 12
-#define L2_PAGETABLE_SHIFT_PAE 21
-#define L3_PAGETABLE_SHIFT_PAE 30
-#define L4_PAGETABLE_SHIFT_PAE 39
-
-#define L1_PAGETABLE_SHIFT             L1_PAGETABLE_SHIFT_PAE
-#define L2_PAGETABLE_SHIFT             L2_PAGETABLE_SHIFT_PAE
-
-#define L0_PAGETABLE_MASK_PAE  0x000ffffffffff000ULL
-#define L1_PAGETABLE_MASK_PAE  0x1ffULL
-#define L2_PAGETABLE_MASK_PAE  0x1ffULL
-#define L3_PAGETABLE_MASK_PAE  0x1ffULL
-#define L4_PAGETABLE_MASK_PAE  0x1ffULL
-
-#define L0_PAGETABLE_MASK              L0_PAGETABLE_MASK_PAE
-#define L1_PAGETABLE_MASK              L1_PAGETABLE_MASK_PAE
-#define L2_PAGETABLE_MASK              L2_PAGETABLE_MASK_PAE
-
-#endif
 
 unsigned long xc_translate_foreign_address(int xc_handle, uint32_t dom,
-                                           int vcpu, unsigned long long virt )
+                                           int vcpu, unsigned long long virt)
 {
-    vcpu_guest_context_any_t ctx;
-    unsigned long long cr3;
-    void *pd, *pt, *pdppage = NULL, *pdp, *pml = NULL;
-    unsigned long long pde, pte, pdpe, pmle;
-    unsigned long mfn = 0;
-#if defined (__i386__)
-    static int pt_levels = 0;
-
-    if (pt_levels == 0) {
-        xen_capabilities_info_t xen_caps = "";
-
-        if (xc_version(xc_handle, XENVER_capabilities, &xen_caps) != 0)
-            goto out;
-        if (strstr(xen_caps, "xen-3.0-x86_64"))
+    xc_dominfo_t dominfo;
+    uint64_t paddr, mask, pte = 0;
+    int size, level, pt_levels = 2;
+    void *map;
+
+    if (xc_domain_getinfo(xc_handle, dom, 1, &dominfo) != 1 
+        || dominfo.domid != dom)
+        return 0;
+
+    /* What kind of paging are we dealing with? */
+    if (dominfo.hvm) {
+        struct hvm_hw_cpu ctx;
+        if (xc_domain_hvm_getcontext_partial(xc_handle, dom,
+                                             HVM_SAVE_CODE(CPU), vcpu,
+                                             &ctx, sizeof ctx) != 0)
+            return 0;
+        if (!(ctx.cr0 & CR0_PG))
+            return virt;
+        pt_levels = (ctx.msr_efer&EFER_LMA) ? 4 : (ctx.cr4&CR4_PAE) ? 3 : 2;
+        paddr = ctx.cr3 & ((pt_levels == 3) ? ~0x1full : ~0xfffull);
+    } else {
+        DECLARE_DOMCTL;
+        vcpu_guest_context_any_t ctx;
+        if (xc_vcpu_getcontext(xc_handle, dom, vcpu, &ctx) != 0)
+            return 0;
+        domctl.domain = dom;
+        domctl.cmd = XEN_DOMCTL_get_address_size;
+        if ( do_domctl(xc_handle, &domctl) != 0 )
+            return 0;
+        if (domctl.u.address_size.size == 64) {
             pt_levels = 4;
-        else if (strstr(xen_caps, "xen-3.0-x86_32p"))
+            paddr = ctx.x64.ctrlreg[3] & ~0xfffull;
+        } else {
             pt_levels = 3;
-        else if (strstr(xen_caps, "xen-3.0-x86_32"))
-            pt_levels = 2;
-        else
-            goto out;
-    }
-#elif defined (__x86_64__)
-#define pt_levels 4
-#endif
-
-    if (xc_vcpu_getcontext(xc_handle, dom, vcpu, &ctx) != 0) {
-        DPRINTF("failed to retreive vcpu context\n");
-        goto out;
-    }
-    cr3 = ((unsigned long long)xen_cr3_to_pfn(ctx.c.ctrlreg[3])) << PAGE_SHIFT;
-
-    /* Page Map Level 4 */
-
-#if defined(__i386__)
-    pmle = cr3;
-#elif defined(__x86_64__)
-    pml = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, cr3 >> PAGE_SHIFT);
-    if (pml == NULL) {
-        DPRINTF("failed to map PML4\n");
-        goto out;
-    }
-    pmle = *(unsigned long long *)(pml + 8 * ((virt >> L4_PAGETABLE_SHIFT_PAE) & L4_PAGETABLE_MASK_PAE));
-    if((pmle & 1) == 0) {
-        DPRINTF("page entry not present in PML4\n");
-        goto out_unmap_pml;
-    }
-#endif
-
-    /* Page Directory Pointer Table */
-
-    if (pt_levels >= 3) {
-        pdppage = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, pmle >> PAGE_SHIFT);
-        if (pdppage == NULL) {
-            DPRINTF("failed to map PDP\n");
-            goto out_unmap_pml;
-        }
-        if (pt_levels >= 4)
-            pdp = pdppage;
-        else
-            /* PDP is only 32 bit aligned with 3 level pts */
-            pdp = pdppage + (pmle & ~(XC_PAGE_MASK | 0x1f));
-
-        pdpe = *(unsigned long long *)(pdp + 8 * ((virt >> L3_PAGETABLE_SHIFT_PAE) & L3_PAGETABLE_MASK_PAE));
-
-        if((pdpe & 1) == 0) {
-            DPRINTF("page entry not present in PDP\n");
-            goto out_unmap_pdp;
+            paddr = (((uint64_t) xen_cr3_to_pfn(ctx.x32.ctrlreg[3])) 
+                     << PAGE_SHIFT);
         }
-    } else {
-        pdpe = pmle;
     }
 
-    /* Page Directory */
-
-    pd = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, pdpe >> PAGE_SHIFT);
-    if (pd == NULL) {
-        DPRINTF("failed to map PD\n");
-        goto out_unmap_pdp;
-    }
-
-    if (pt_levels >= 3)
-        pde = *(unsigned long long *)(pd + 8 * ((virt >> L2_PAGETABLE_SHIFT_PAE) & L2_PAGETABLE_MASK_PAE));
-    else
-        pde = *(unsigned long *)(pd + 4 * ((virt >> L2_PAGETABLE_SHIFT) & L2_PAGETABLE_MASK));
-
-    if ((pde & 1) == 0) {
-        DPRINTF("page entry not present in PD\n");
-        goto out_unmap_pd;
+    if (pt_levels == 4) {
+        virt &= 0x0000ffffffffffffull;
+        mask =  0x0000ff8000000000ull;
+    } else if (pt_levels == 3) {
+        virt &= 0x00000000ffffffffull;
+        mask =  0x0000007fc0000000ull;
+    } else {
+        virt &= 0x00000000ffffffffull;
+        mask =  0x00000000ffc00000ull;
     }
-
-    /* Page Table */
-
-    if (pde & 0x00000080) { /* 4M page (or 2M in PAE mode) */
-        DPRINTF("Cannot currently cope with 2/4M pages\n");
-        exit(-1);
-    } else { /* 4k page */
-        pt = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
-                                  pde >> PAGE_SHIFT);
-
-        if (pt == NULL) {
-            DPRINTF("failed to map PT\n");
-            goto out_unmap_pd;
+    size = (pt_levels == 2 ? 4 : 8);
+
+    /* Walk the pagetables */
+    for (level = pt_levels; level > 0; level--) {
+        paddr += ((virt & mask) >> (xc_ffs64(mask) - 1)) * size;
+        map = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, 
+                                   paddr >>PAGE_SHIFT);
+        if (!map) 
+            return 0;
+        memcpy(&pte, map + (paddr & (PAGE_SIZE - 1)), size);
+        munmap(map, PAGE_SIZE);
+        if (!(pte & 1)) 
+            return 0;
+        paddr = pte & 0x000ffffffffff000ull;
+        if (level == 2 && (pte & PTE_PSE)) {
+            mask = ((mask ^ ~-mask) >> 1); /* All bits below first set bit */
+            return ((paddr & ~mask) | (virt & mask)) >> PAGE_SHIFT;
         }
-
-        if (pt_levels >= 3)
-            pte = *(unsigned long long *)(pt + 8 * ((virt >> L1_PAGETABLE_SHIFT_PAE) & L1_PAGETABLE_MASK_PAE));
-        else
-            pte = *(unsigned long *)(pt + 4 * ((virt >> L1_PAGETABLE_SHIFT) & L1_PAGETABLE_MASK));
-
-        if ((pte & 1) == 0) {
-            DPRINTF("page entry not present in PT\n");
-            goto out_unmap_pt;
-        }
-
-        if (pt_levels >= 3)
-            mfn = (pte & L0_PAGETABLE_MASK_PAE) >> PAGE_SHIFT;
-        else
-            mfn = (pte & L0_PAGETABLE_MASK) >> PAGE_SHIFT;
+        mask >>= (pt_levels == 2 ? 10 : 9);
     }
-
- out_unmap_pt:
-    munmap(pt, PAGE_SIZE);
- out_unmap_pd:
-    munmap(pd, PAGE_SIZE);
- out_unmap_pdp:
-    munmap(pdppage, PAGE_SIZE);
- out_unmap_pml:
-    munmap(pml, PAGE_SIZE);
- out:
-    return mfn;
+    return paddr >> PAGE_SHIFT;
 }
 
 /*
index 69f6b6e155c8b117a58460b6335d812c2c146a3e..13342a31213f32ead12f19196e4eb5e1187ffc78 100644 (file)
  *
  */
 
+#include <errno.h>
+#include <stdbool.h>
 #include "xc_private.h"
 
+/*
+ * Get PM statistic info
+ */
 int xc_pm_get_max_px(int xc_handle, int cpuid, int *max_px)
 {
     DECLARE_SYSCTL;
@@ -168,3 +173,192 @@ int xc_pm_reset_cxstat(int xc_handle, int cpuid)
 
     return xc_sysctl(xc_handle, &sysctl);
 }
+
+
+/*
+ * 1. Get PM parameter
+ * 2. Provide user PM control
+ */
+int xc_get_cpufreq_para(int xc_handle, int cpuid,
+                        struct xc_get_cpufreq_para *user_para)
+{
+    DECLARE_SYSCTL;
+    int ret = 0;
+    struct xen_get_cpufreq_para *sys_para = &sysctl.u.pm_op.get_para;
+    bool has_num = user_para->cpu_num &&
+                     user_para->freq_num &&
+                     user_para->gov_num;
+
+    if ( (xc_handle < 0) || !user_para )
+        return -EINVAL;
+
+    if ( has_num )
+    {
+        if ( (!user_para->affected_cpus)                    ||
+             (!user_para->scaling_available_frequencies)    ||
+             (!user_para->scaling_available_governors) )
+            return -EINVAL;
+
+        if ( (ret = lock_pages(user_para->affected_cpus,
+                               user_para->cpu_num * sizeof(uint32_t))) )
+            goto unlock_1;
+        if ( (ret = lock_pages(user_para->scaling_available_frequencies,
+                               user_para->freq_num * sizeof(uint32_t))) )
+            goto unlock_2;
+        if ( (ret = lock_pages(user_para->scaling_available_governors,
+                 user_para->gov_num * CPUFREQ_NAME_LEN * sizeof(char))) )
+            goto unlock_3;
+
+        set_xen_guest_handle(sys_para->affected_cpus,
+                             user_para->affected_cpus);
+        set_xen_guest_handle(sys_para->scaling_available_frequencies,
+                             user_para->scaling_available_frequencies);
+        set_xen_guest_handle(sys_para->scaling_available_governors,
+                             user_para->scaling_available_governors);
+    }
+
+    sysctl.cmd = XEN_SYSCTL_pm_op;
+    sysctl.u.pm_op.cmd = GET_CPUFREQ_PARA;
+    sysctl.u.pm_op.cpuid = cpuid;
+    sys_para->cpu_num  = user_para->cpu_num;
+    sys_para->freq_num = user_para->freq_num;
+    sys_para->gov_num  = user_para->gov_num;
+
+    ret = xc_sysctl(xc_handle, &sysctl);
+    if ( ret )
+    {
+        if ( errno == EAGAIN )
+        {
+            user_para->cpu_num  = sys_para->cpu_num;
+            user_para->freq_num = sys_para->freq_num;
+            user_para->gov_num  = sys_para->gov_num;
+            ret = -errno;
+        }
+
+        if ( has_num )
+            goto unlock_4;
+        goto unlock_1;
+    }
+    else
+    {
+        user_para->cpuinfo_cur_freq = sys_para->cpuinfo_cur_freq;
+        user_para->cpuinfo_max_freq = sys_para->cpuinfo_max_freq;
+        user_para->cpuinfo_min_freq = sys_para->cpuinfo_min_freq;
+        user_para->scaling_cur_freq = sys_para->scaling_cur_freq;
+        user_para->scaling_max_freq = sys_para->scaling_max_freq;
+        user_para->scaling_min_freq = sys_para->scaling_min_freq;
+
+        memcpy(user_para->scaling_driver, 
+                sys_para->scaling_driver, CPUFREQ_NAME_LEN);
+        memcpy(user_para->scaling_governor,
+                sys_para->scaling_governor, CPUFREQ_NAME_LEN);
+
+        /* copy to user_para no matter what cpufreq governor */
+        XC_BUILD_BUG_ON(sizeof(((struct xc_get_cpufreq_para *)0)->u) !=
+                        sizeof(((struct xen_get_cpufreq_para *)0)->u));
+
+        memcpy(&user_para->u, &sys_para->u, sizeof(sys_para->u));
+    }
+
+unlock_4:
+    unlock_pages(user_para->scaling_available_governors,
+                 user_para->gov_num * CPUFREQ_NAME_LEN * sizeof(char));
+unlock_3:
+    unlock_pages(user_para->scaling_available_frequencies,
+                 user_para->freq_num * sizeof(uint32_t));
+unlock_2:
+    unlock_pages(user_para->affected_cpus,
+                 user_para->cpu_num * sizeof(uint32_t));
+unlock_1:
+    return ret;
+}
+
+int xc_set_cpufreq_gov(int xc_handle, int cpuid, char *govname)
+{
+    DECLARE_SYSCTL;
+    char *scaling_governor = sysctl.u.pm_op.set_gov.scaling_governor;
+
+    if ( (xc_handle < 0) || (!govname) )
+        return -EINVAL;
+
+    sysctl.cmd = XEN_SYSCTL_pm_op;
+    sysctl.u.pm_op.cmd = SET_CPUFREQ_GOV;
+    sysctl.u.pm_op.cpuid = cpuid;
+    strncpy(scaling_governor, govname, CPUFREQ_NAME_LEN);
+    scaling_governor[CPUFREQ_NAME_LEN - 1] = '\0';
+
+    return xc_sysctl(xc_handle, &sysctl);
+}
+
+int xc_set_cpufreq_para(int xc_handle, int cpuid, 
+                        int ctrl_type, int ctrl_value)
+{
+    DECLARE_SYSCTL;
+
+    if ( xc_handle < 0 )
+        return -EINVAL;
+
+    sysctl.cmd = XEN_SYSCTL_pm_op;
+    sysctl.u.pm_op.cmd = SET_CPUFREQ_PARA;
+    sysctl.u.pm_op.cpuid = cpuid;
+    sysctl.u.pm_op.set_para.ctrl_type = ctrl_type;
+    sysctl.u.pm_op.set_para.ctrl_value = ctrl_value;
+
+    return xc_sysctl(xc_handle, &sysctl);
+}
+
+int xc_get_cpufreq_avgfreq(int xc_handle, int cpuid, int *avg_freq)
+{
+    int ret = 0;
+    DECLARE_SYSCTL;
+
+    if ( (xc_handle < 0) || (!avg_freq) )
+        return -EINVAL;
+
+    sysctl.cmd = XEN_SYSCTL_pm_op;
+    sysctl.u.pm_op.cmd = GET_CPUFREQ_AVGFREQ;
+    sysctl.u.pm_op.cpuid = cpuid;
+    ret = xc_sysctl(xc_handle, &sysctl);
+
+    *avg_freq = sysctl.u.pm_op.get_avgfreq;
+
+    return ret;
+}
+
+int xc_get_cputopo(int xc_handle, struct xc_get_cputopo *info)
+{
+    int rc;
+    DECLARE_SYSCTL;
+
+    sysctl.cmd = XEN_SYSCTL_pm_op;
+    sysctl.u.pm_op.cmd = XEN_SYSCTL_pm_op_get_cputopo;
+    sysctl.u.pm_op.cpuid = 0;
+    set_xen_guest_handle( sysctl.u.pm_op.get_topo.cpu_to_core,
+                         info->cpu_to_core );
+    set_xen_guest_handle( sysctl.u.pm_op.get_topo.cpu_to_socket,
+                         info->cpu_to_socket );
+    sysctl.u.pm_op.get_topo.max_cpus = info->max_cpus;
+
+    rc = do_sysctl(xc_handle, &sysctl);
+    info->nr_cpus = sysctl.u.pm_op.get_topo.nr_cpus;
+
+    return rc;
+}
+
+/* value:   0 - disable sched_smt_power_savings 
+            1 - enable sched_smt_power_savings
+ */
+int xc_set_sched_opt_smt(int xc_handle, uint32_t value)
+{
+   int rc;
+   DECLARE_SYSCTL;
+
+   sysctl.cmd = XEN_SYSCTL_pm_op;
+   sysctl.u.pm_op.cmd = XEN_SYSCTL_pm_op_set_sched_opt_smt;
+   sysctl.u.pm_op.cpuid = 0;
+   sysctl.u.pm_op.set_sched_opt_smt = value;
+   rc = do_sysctl(xc_handle, &sysctl);
+
+   return rc;
+}
+
index b37978afe8120bd9e51107c2f4c3aa82d6e1beee..99589beaf85ac7f71446dfc6f390bdd1f3599b04 100644 (file)
@@ -307,17 +307,18 @@ int xc_memory_op(int xc_handle,
             goto out1;
         }
         break;
-    case XENMEM_remove_from_physmap:
-        if ( lock_pages(arg, sizeof(struct xen_remove_from_physmap)) )
+    case XENMEM_current_reservation:
+    case XENMEM_maximum_reservation:
+    case XENMEM_maximum_gpfn:
+        if ( lock_pages(arg, sizeof(domid_t)) )
         {
             PERROR("Could not lock");
             goto out1;
         }
         break;
-    case XENMEM_current_reservation:
-    case XENMEM_maximum_reservation:
-    case XENMEM_maximum_gpfn:
-        if ( lock_pages(arg, sizeof(domid_t)) )
+    case XENMEM_set_pod_target:
+    case XENMEM_get_pod_target:
+        if ( lock_pages(arg, sizeof(struct xen_pod_target)) )
         {
             PERROR("Could not lock");
             goto out1;
@@ -347,14 +348,15 @@ int xc_memory_op(int xc_handle,
     case XENMEM_add_to_physmap:
         unlock_pages(arg, sizeof(struct xen_add_to_physmap));
         break;
-    case XENMEM_remove_from_physmap:
-        unlock_pages(arg, sizeof(struct xen_remove_from_physmap));
-        break;
     case XENMEM_current_reservation:
     case XENMEM_maximum_reservation:
     case XENMEM_maximum_gpfn:
         unlock_pages(arg, sizeof(domid_t));
         break;
+    case XENMEM_set_pod_target:
+    case XENMEM_get_pod_target:
+        unlock_pages(arg, sizeof(struct xen_pod_target));
+        break;
     }
 
  out1:
@@ -627,6 +629,33 @@ int write_exact(int fd, const void *data, size_t size)
     return 0;
 }
 
+int xc_ffs8(uint8_t x)
+{
+    int i;
+    for ( i = 0; i < 8; i++ )
+        if ( x & (1u << i) )
+            return i+1;
+    return 0;
+}
+
+int xc_ffs16(uint16_t x)
+{
+    uint8_t h = x>>8, l = x;
+    return l ? xc_ffs8(l) : h ? xc_ffs8(h) + 8 : 0;
+}
+
+int xc_ffs32(uint32_t x)
+{
+    uint16_t h = x>>16, l = x;
+    return l ? xc_ffs16(l) : h ? xc_ffs16(h) + 16 : 0;
+}
+
+int xc_ffs64(uint64_t x)
+{
+    uint32_t h = x>>32, l = x;
+    return l ? xc_ffs32(l) : h ? xc_ffs32(h) + 32 : 0;
+}
+
 /*
  * Local variables:
  * mode: C
index 6e49b749a0b29a0fbc45c49ed3662673f60e1f90..d95bd0675d04ea46741cb6caf9998b29b56caf42 100644 (file)
@@ -43,6 +43,9 @@
 #define INFO     1
 #define PROGRESS 0
 
+/* Force a compilation error if condition is true */
+#define XC_BUILD_BUG_ON(p) ((void)sizeof(struct { int:-!!(p); }))
+
 /*
 ** Define max dirty page cache to permit during save/restore -- need to balance 
 ** keeping cache usage down with CPU impact of invalidating too often.
@@ -215,4 +218,9 @@ int xc_flush_mmu_updates(int xc_handle, struct xc_mmu *mmu);
 int read_exact(int fd, void *data, size_t size);
 int write_exact(int fd, const void *data, size_t size);
 
+int xc_ffs8(uint8_t x);
+int xc_ffs16(uint16_t x);
+int xc_ffs32(uint32_t x);
+int xc_ffs64(uint64_t x);
+
 #endif /* __XC_PRIVATE_H__ */
index fa6c3a0ad1b203c0e13002203dc7346095cd3f89..39f02198a9b1205186abe7ce974131a5b964d862 100644 (file)
@@ -44,8 +44,7 @@ static uint64_t                         online_cpumap;
 static uint64_t                         regs_valid;
 static vcpu_guest_context_any_t      ctxt[MAX_VIRT_CPUS];
 
-extern int ffsll(long long int);
-#define FOREACH_CPU(cpumap, i)  for ( cpumap = online_cpumap; (i = ffsll(cpumap)); cpumap &= ~(1 << (index - 1)) )
+#define FOREACH_CPU(cpumap, i)  for ( cpumap = online_cpumap; (i = xc_ffs64(cpumap)); cpumap &= ~(1 << (index - 1)) )
 
 static int
 fetch_regs(int xc_handle, int cpu, int *online)
@@ -136,7 +135,7 @@ online_vcpus_changed(uint64_t cpumap)
     uint64_t changed_cpumap = cpumap ^ online_cpumap;
     int index;
 
-    while ( (index = ffsll(changed_cpumap)) ) {
+    while ( (index = xc_ffs64(changed_cpumap)) ) {
         if ( cpumap & (1 << (index - 1)) )
         {
             if (handlers.td_create) handlers.td_create(index - 1);
@@ -150,243 +149,38 @@ online_vcpus_changed(uint64_t cpumap)
 
 }
 
-/* --------------------- */
-/* XXX application state */
-static long      nr_pages = 0;
-static uint64_t *page_array = NULL;
-
-static uint64_t to_ma(int cpu, uint64_t maddr)
-{
-    return maddr;
-}
 
 static void *
-map_domain_va_32(
-    int xc_handle,
-    int cpu,
-    void *guest_va,
-    int perm)
-{
-    unsigned long l2e, l1e, l1p, p, va = (unsigned long)guest_va;
-    uint32_t *l2, *l1;
-    static void *v[MAX_VIRT_CPUS];
-
-    l2 = xc_map_foreign_range(
-         xc_handle, current_domid, PAGE_SIZE, PROT_READ,
-         xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
-    if ( l2 == NULL )
-        return NULL;
-
-    l2e = l2[l2_table_offset_i386(va)];
-    munmap(l2, PAGE_SIZE);
-    if ( !(l2e & _PAGE_PRESENT) )
-        return NULL;
-    l1p = to_ma(cpu, l2e);
-    l1 = xc_map_foreign_range(xc_handle, current_domid, PAGE_SIZE, PROT_READ, l1p >> PAGE_SHIFT);
-    if ( l1 == NULL )
-        return NULL;
-
-    l1e = l1[l1_table_offset_i386(va)];
-    munmap(l1, PAGE_SIZE);
-    if ( !(l1e & _PAGE_PRESENT) )
-        return NULL;
-    p = to_ma(cpu, l1e);
-    if ( v[cpu] != NULL )
-        munmap(v[cpu], PAGE_SIZE);
-    v[cpu] = xc_map_foreign_range(xc_handle, current_domid, PAGE_SIZE, perm, p >> PAGE_SHIFT);
-    if ( v[cpu] == NULL )
-        return NULL;
-
-    return (void *)((unsigned long)v[cpu] | (va & (PAGE_SIZE - 1)));
-}
-
-
-static void *
-map_domain_va_pae(
+map_domain_va(
     int xc_handle,
     int cpu,
     void *guest_va,
     int perm)
 {
-    uint64_t l3e, l2e, l1e, l2p, l1p, p;
     unsigned long va = (unsigned long)guest_va;
-    uint64_t *l3, *l2, *l1;
-    static void *v[MAX_VIRT_CPUS];
+    unsigned long mfn;
+    void *map;
 
-    l3 = xc_map_foreign_range(
-        xc_handle, current_domid, PAGE_SIZE, PROT_READ,
-        xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
-    if ( l3 == NULL )
+    /* cross page boundary */
+    if ( (va & ~PAGE_MASK) + sizeof(long) > PAGE_SIZE )
         return NULL;
 
-    l3e = l3[l3_table_offset_pae(va)];
-    munmap(l3, PAGE_SIZE);
-    if ( !(l3e & _PAGE_PRESENT) )
-        return NULL;
-    l2p = to_ma(cpu, l3e);
-    l2 = xc_map_foreign_range(xc_handle, current_domid, PAGE_SIZE, PROT_READ, l2p >> PAGE_SHIFT);
-    if ( l2 == NULL )
-        return NULL;
-
-    l2e = l2[l2_table_offset_pae(va)];
-    munmap(l2, PAGE_SIZE);
-    if ( !(l2e & _PAGE_PRESENT) )
-        return NULL;
-    l1p = to_ma(cpu, l2e);
-    l1 = xc_map_foreign_range(xc_handle, current_domid, PAGE_SIZE, PROT_READ, l1p >> PAGE_SHIFT);
-    if ( l1 == NULL )
+    mfn = xc_translate_foreign_address(xc_handle, current_domid, cpu, va);
+    if ( mfn == 0 )
         return NULL;
 
-    l1e = l1[l1_table_offset_pae(va)];
-    munmap(l1, PAGE_SIZE);
-    if ( !(l1e & _PAGE_PRESENT) )
-        return NULL;
-    p = to_ma(cpu, l1e);
-    if ( v[cpu] != NULL )
-        munmap(v[cpu], PAGE_SIZE);
-    v[cpu] = xc_map_foreign_range(xc_handle, current_domid, PAGE_SIZE, perm, p >> PAGE_SHIFT);
-    if ( v[cpu] == NULL )
+    map = xc_map_foreign_range(xc_handle, current_domid, PAGE_SIZE, 
+                               perm, mfn);
+    if (map == NULL)
         return NULL;
 
-    return (void *)((unsigned long)v[cpu] | (va & (PAGE_SIZE - 1)));
+    return map + (va & ~PAGE_MASK);
 }
 
-#ifdef __x86_64__
-static void *
-map_domain_va_64(
-    int xc_handle,
-    int cpu,
-    void *guest_va,
-    int perm)
-{
-    unsigned long l4e, l3e, l2e, l1e, l3p, l2p, l1p, p, va = (unsigned long)guest_va;
-    uint64_t *l4, *l3, *l2, *l1;
-    static void *v[MAX_VIRT_CPUS];
-
-    if ((ctxt[cpu].c.ctrlreg[4] & 0x20) == 0 ) /* legacy ia32 mode */
-        return map_domain_va_32(xc_handle, cpu, guest_va, perm);
-
-    l4 = xc_map_foreign_range(
-        xc_handle, current_domid, PAGE_SIZE, PROT_READ,
-        xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
-    if ( l4 == NULL )
-        return NULL;
-
-    l4e = l4[l4_table_offset(va)];
-    munmap(l4, PAGE_SIZE);
-    if ( !(l4e & _PAGE_PRESENT) )
-        return NULL;
-    l3p = to_ma(cpu, l4e);
-    l3 = xc_map_foreign_range(xc_handle, current_domid, PAGE_SIZE, PROT_READ, l3p >> PAGE_SHIFT);
-    if ( l3 == NULL )
-        return NULL;
-
-    l3e = l3[l3_table_offset(va)];
-    munmap(l3, PAGE_SIZE);
-    if ( !(l3e & _PAGE_PRESENT) )
-        return NULL;
-    l2p = to_ma(cpu, l3e);
-    l2 = xc_map_foreign_range(xc_handle, current_domid, PAGE_SIZE, PROT_READ, l2p >> PAGE_SHIFT);
-    if ( l2 == NULL )
-        return NULL;
-
-    l2e = l2[l2_table_offset(va)];
-    munmap(l2, PAGE_SIZE);
-    if ( !(l2e & _PAGE_PRESENT) )
-        return NULL;
-    l1p = to_ma(cpu, l2e);
-    if (l2e & 0x80)  { /* 2M pages */
-        p = to_ma(cpu, l1p + (l1_table_offset(va) << PAGE_SHIFT));
-    } else { /* 4K pages */
-        l1 = xc_map_foreign_range(xc_handle, current_domid, PAGE_SIZE, PROT_READ, l1p >> PAGE_SHIFT);
-        if ( l1 == NULL )
-            return NULL;
-
-        l1e = l1[l1_table_offset(va)];
-        munmap(l1, PAGE_SIZE);
-        if ( !(l1e & _PAGE_PRESENT) )
-            return NULL;
-        p = to_ma(cpu, l1e);
-    }
-    if ( v[cpu] != NULL )
-        munmap(v[cpu], PAGE_SIZE);
-    v[cpu] = xc_map_foreign_range(xc_handle, current_domid, PAGE_SIZE, perm, p >> PAGE_SHIFT);
-    if ( v[cpu] == NULL )
-        return NULL;
-
-    return (void *)((unsigned long)v[cpu] | (va & (PAGE_SIZE - 1)));
-}
-#endif
-
-static void *
-map_domain_va(
-    int xc_handle,
-    int cpu,
-    void *guest_va,
-    int perm)
+static void
+unmap_domain_va(void *guest_va)
 {
-    unsigned long va = (unsigned long) guest_va;
-    long npgs = xc_get_tot_pages(xc_handle, current_domid);
-    static enum { MODE_UNKNOWN, MODE_64, MODE_32, MODE_PAE } mode;
-
-    if ( mode == MODE_UNKNOWN )
-    {
-        xen_capabilities_info_t caps;
-        (void)xc_version(xc_handle, XENVER_capabilities, caps);
-        if ( strstr(caps, "-x86_64") )
-            mode = MODE_64;
-        else if ( strstr(caps, "-x86_32p") )
-            mode = MODE_PAE;
-        else if ( strstr(caps, "-x86_32") )
-            mode = MODE_32;
-    }
-
-    if ( nr_pages != npgs )
-    {
-        if ( nr_pages > 0 )
-            free(page_array);
-        nr_pages = npgs;
-        if ( (page_array = malloc(nr_pages * sizeof(*page_array))) == NULL )
-        {
-            IPRINTF("Could not allocate memory\n");
-            return NULL;
-        }
-        if ( xc_get_pfn_list(xc_handle, current_domid,
-                             page_array, nr_pages) != nr_pages )
-        {
-            IPRINTF("Could not get the page frame list\n");
-            return NULL;
-        }
-    }
-
-    if (fetch_regs(xc_handle, cpu, NULL))
-        return NULL;
-
-    if (!paging_enabled(&ctxt[cpu])) {
-        static void * v;
-        uint64_t page;
-
-        if ( v != NULL )
-            munmap(v, PAGE_SIZE);
-
-        page = to_ma(cpu, va);
-
-        v = xc_map_foreign_range( xc_handle, current_domid, PAGE_SIZE,
-                perm, page >> PAGE_SHIFT);
-
-        if ( v == NULL )
-            return NULL;
-
-        return (void *)(((unsigned long)v) | (va & BSD_PAGE_MASK));
-    }
-#ifdef __x86_64__
-    if ( mode == MODE_64 )
-        return map_domain_va_64(xc_handle, cpu, guest_va, perm);
-#endif
-    if ( mode == MODE_PAE )
-        return map_domain_va_pae(xc_handle, cpu, guest_va, perm);
-    /* else ( mode == MODE_32 ) */
-    return map_domain_va_32(xc_handle, cpu, guest_va, perm);
+    munmap((void *)((unsigned long)guest_va & PAGE_MASK), PAGE_SIZE);
 }
 
 int control_c_pressed_flag = 0;
@@ -474,6 +268,8 @@ xc_ptrace(
         if ( guest_va == NULL )
             goto out_error;
         retval = *guest_va;
+        if (!current_isfile)
+            unmap_domain_va(guest_va);
         break;
 
     case PTRACE_POKETEXT:
@@ -487,7 +283,9 @@ xc_ptrace(
                 xc_handle, cpu, addr, PROT_READ|PROT_WRITE);
         if ( guest_va == NULL )
             goto out_error;
-        *guest_va = (unsigned long)data;
+        *guest_va = edata;
+        if (!current_isfile)
+            unmap_domain_va(guest_va);
         break;
 
     case PTRACE_GETREGS:
@@ -524,10 +322,20 @@ xc_ptrace(
         /*  XXX we can still have problems if the user switches threads
          *  during single-stepping - but that just seems retarded
          */
-        ctxt[cpu].c.user_regs.eflags |= PSL_T;
-        if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu,
-                                &ctxt[cpu])))
-            goto out_error_domctl;
+        /* Try to enable Monitor Trap Flag for HVM, and fall back to TF
+         * if no MTF support
+         */
+        if ( !current_is_hvm ||
+             xc_domain_debug_control(xc_handle,
+                                     current_domid,
+                                     XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON,
+                                     cpu) )
+        {
+            ctxt[cpu].c.user_regs.eflags |= PSL_T;
+            if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu,
+                                    &ctxt[cpu])))
+                goto out_error_domctl;
+        }
         /* FALLTHROUGH */
 
     case PTRACE_CONT:
@@ -538,15 +346,22 @@ xc_ptrace(
         {
             FOREACH_CPU(cpumap, index) {
                 cpu = index - 1;
-                if (fetch_regs(xc_handle, cpu, NULL))
-                    goto out_error;
-                /* Clear trace flag */
-                if ( ctxt[cpu].c.user_regs.eflags & PSL_T )
+                if ( !current_is_hvm ||
+                      xc_domain_debug_control(xc_handle,
+                                              current_domid,
+                                              XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF,
+                                              cpu) )
                 {
-                    ctxt[cpu].c.user_regs.eflags &= ~PSL_T;
-                    if ((retval = xc_vcpu_setcontext(xc_handle, current_domid,
-                                                cpu, &ctxt[cpu])))
-                        goto out_error_domctl;
+                    if (fetch_regs(xc_handle, cpu, NULL))
+                        goto out_error;
+                    /* Clear trace flag */
+                    if ( ctxt[cpu].c.user_regs.eflags & PSL_T )
+                    {
+                        ctxt[cpu].c.user_regs.eflags &= ~PSL_T;
+                        if ((retval = xc_vcpu_setcontext(xc_handle, current_domid,
+                                        cpu, &ctxt[cpu])))
+                            goto out_error_domctl;
+                    }
                 }
             }
         }
index d4bd6bcb1566e9109e10d5af71aa5efc37ba6a78..a159591c4ddf0cea4a8a5f840547e1065a9dcf00 100644 (file)
@@ -154,7 +154,7 @@ xc_waitdomain_core_compat(
             IPRINTF("Could not allocate m2p array\n");
             return -1;
         }
-        bzero(m2p_array_compat, sizeof(unsigned long)* 1 << 20);
+        memset(m2p_array_compat, 0, sizeof(unsigned long)* 1 << 20);
 
         for (i = 0; i < nr_pages_compat; i++)
             m2p_array_compat[p2m_array_compat[i]] = i;
@@ -540,7 +540,9 @@ xc_waitdomain_core_elf(
                              XEN_ELFNOTE_DUMPCORE_XEN_VERSION,
                              (void**)&xen_version) < 0)
         goto out;
-    if (xen_version->xen_version.pagesize != PAGE_SIZE)
+    /* shifted case covers 32 bit FV guest core file created on 64 bit Dom0 */
+    if (xen_version->xen_version.pagesize != PAGE_SIZE &&
+        (xen_version->xen_version.pagesize >> 32) != PAGE_SIZE)
         goto out;
 
     /* .note.Xen: format_version */
index f88a92890648d67996270c63a419f673e0041982..d32a53755096d8ed34358a70397382b3e16ad2b3 100644 (file)
@@ -134,6 +134,8 @@ void *xc_map_foreign_ranges(int xc_handle, uint32_t dom,
     if (rc)
         goto ioctl_failed;
 
+    return addr;
+
 ioctl_failed:
     rc = munmap(addr, size);
     if (rc == -1)
diff --git a/tools/libxc/xc_suspend.c b/tools/libxc/xc_suspend.c
new file mode 100644 (file)
index 0000000..c0ab757
--- /dev/null
@@ -0,0 +1,117 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file "COPYING" in the main directory of
+ * this archive for more details.
+ */
+
+#include "xc_private.h"
+#include "xenguest.h"
+
+#define SUSPEND_LOCK_FILE "/var/lib/xen/suspend_evtchn_lock.d"
+static int lock_suspend_event(void)
+{
+    int fd, rc;
+    mode_t mask;
+    char buf[128];
+
+    mask = umask(022);
+    fd = open(SUSPEND_LOCK_FILE, O_CREAT | O_EXCL | O_RDWR, 0666);
+    if (fd < 0)
+    {
+        ERROR("Can't create lock file for suspend event channel\n");
+        return -EINVAL;
+    }
+    umask(mask);
+    snprintf(buf, sizeof(buf), "%10ld", (long)getpid());
+
+    rc = write_exact(fd, buf, strlen(buf));
+    close(fd);
+
+    return rc;
+}
+
+static int unlock_suspend_event(void)
+{
+    int fd, pid, n;
+    char buf[128];
+
+    fd = open(SUSPEND_LOCK_FILE, O_RDWR);
+
+    if (fd < 0)
+        return -EINVAL;
+
+    n = read(fd, buf, 127);
+
+    close(fd);
+
+    if (n > 0)
+    {
+        sscanf(buf, "%d", &pid);
+        /* We are the owner, so we can simply delete the file */
+        if (pid == getpid())
+        {
+            unlink(SUSPEND_LOCK_FILE);
+            return 0;
+        }
+    }
+
+    return -EPERM;
+}
+
+int xc_await_suspend(int xce, int suspend_evtchn)
+{
+    int rc;
+
+    do {
+        rc = xc_evtchn_pending(xce);
+        if (rc < 0) {
+            ERROR("error polling suspend notification channel: %d", rc);
+            return -1;
+        }
+    } while (rc != suspend_evtchn);
+
+    /* harmless for one-off suspend */
+    if (xc_evtchn_unmask(xce, suspend_evtchn) < 0)
+        ERROR("failed to unmask suspend notification channel: %d", rc);
+
+    return 0;
+}
+
+int xc_suspend_evtchn_release(int xce, int suspend_evtchn)
+{
+    if (suspend_evtchn >= 0)
+        xc_evtchn_unbind(xce, suspend_evtchn);
+
+    return unlock_suspend_event();
+}
+
+int xc_suspend_evtchn_init(int xc, int xce, int domid, int port)
+{
+    int rc, suspend_evtchn = -1;
+
+    if (lock_suspend_event())
+        return -EINVAL;
+
+    suspend_evtchn = xc_evtchn_bind_interdomain(xce, domid, port);
+    if (suspend_evtchn < 0) {
+        ERROR("failed to bind suspend event channel: %d", suspend_evtchn);
+        goto cleanup;
+    }
+
+    rc = xc_domain_subscribe_for_suspend(xc, domid, port);
+    if (rc < 0) {
+        ERROR("failed to subscribe to domain: %d", rc);
+        goto cleanup;
+    }
+
+    /* event channel is pending immediately after binding */
+    xc_await_suspend(xce, suspend_evtchn);
+
+    return suspend_evtchn;
+
+cleanup:
+    if (suspend_evtchn > 0)
+        xc_suspend_evtchn_release(xce, suspend_evtchn);
+
+    return -1;
+}
index 100749a9ab07afa1e70f793765814bfcf2b8c267..9ce228620a870f33c4c6661f6ad3a9c08f02e15e 100644 (file)
@@ -158,7 +158,7 @@ typedef struct xc_dominfo {
                   paused:1, blocked:1, running:1,
                   hvm:1, debugged:1;
     unsigned int  shutdown_reason; /* only meaningful if shutdown==1 */
-    unsigned long nr_pages;
+    unsigned long nr_pages; /* current number, not maximum */
     unsigned long shared_info_frame;
     uint64_t      cpu_time;
     unsigned long max_memkb;
@@ -375,6 +375,25 @@ int xc_domain_hvm_getcontext(int xc_handle,
                              uint8_t *ctxt_buf,
                              uint32_t size);
 
+
+/**
+ * This function returns one element of the context of a hvm domain
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm domid the domain to get information from
+ * @parm typecode which type of element required
+ * @parm instance which instance of the type
+ * @parm ctxt_buf a pointer to a structure to store the execution context of
+ *            the hvm domain
+ * @parm size the size of ctxt_buf (must be >= HVM_SAVE_LENGTH(typecode))
+ * @return 0 on success, -1 on failure
+ */
+int xc_domain_hvm_getcontext_partial(int xc_handle,
+                                     uint32_t domid,
+                                     uint16_t typecode,
+                                     uint16_t instance,
+                                     void *ctxt_buf,
+                                     uint32_t size);
+
 /**
  * This function will set the context for hvm domain
  *
@@ -628,11 +647,18 @@ int xc_domain_memory_populate_physmap(int xc_handle,
                                       unsigned int mem_flags,
                                       xen_pfn_t *extent_start);
 
-int xc_domain_memory_translate_gpfn_list(int xc_handle,
-                                         uint32_t domid,
-                                         unsigned long nr_gpfns,
-                                         xen_pfn_t *gpfn_list,
-                                         xen_pfn_t *mfn_list);
+int xc_domain_memory_set_pod_target(int xc_handle,
+                                    uint32_t domid,
+                                    uint64_t target_pages,
+                                    uint64_t *tot_pages,
+                                    uint64_t *pod_cache_pages,
+                                    uint64_t *pod_entries);
+
+int xc_domain_memory_get_pod_target(int xc_handle,
+                                    uint32_t domid,
+                                    uint64_t *tot_pages,
+                                    uint64_t *pod_cache_pages,
+                                    uint64_t *pod_entries);
 
 int xc_domain_ioport_permission(int xc_handle,
                                 uint32_t domid,
@@ -703,8 +729,8 @@ void *xc_map_foreign_batch(int xc_handle, uint32_t dom, int prot,
 
 /**
  * Translates a virtual address in the context of a given domain and
- * vcpu returning the machine page frame number of the associated
- * page.
+ * vcpu returning the GFN containing the address (that is, an MFN for 
+ * PV guests, a PFN for HVM guests).  Returns 0 for failure.
  *
  * @parm xc_handle a handle on an open hypervisor interface
  * @parm dom the domain to perform the translation in
@@ -1066,7 +1092,14 @@ int xc_domain_update_msi_irq(
     uint32_t domid,
     uint32_t gvec,
     uint32_t pirq,
-    uint32_t gflags);
+    uint32_t gflags,
+    uint64_t gtable);
+
+int xc_domain_unbind_msi_irq(int xc_handle,
+                             uint32_t domid,
+                             uint32_t gvec,
+                             uint32_t pirq,
+                             uint32_t gflags);
 
 int xc_domain_bind_pt_irq(int xc_handle,
                           uint32_t domid,
@@ -1111,6 +1144,12 @@ int xc_domain_set_target(int xc_handle,
                          uint32_t domid,
                          uint32_t target);
 
+/* Control the domain for debug */
+int xc_domain_debug_control(int xc_handle,
+                            uint32_t domid,
+                            uint32_t sop,
+                            uint32_t vcpu);
+
 #if defined(__i386__) || defined(__x86_64__)
 int xc_cpuid_check(int xc,
                    const unsigned int *input,
@@ -1161,4 +1200,66 @@ int xc_pm_reset_cxstat(int xc_handle, int cpuid);
 
 int xc_cpu_online(int xc_handle, int cpu);
 int xc_cpu_offline(int xc_handle, int cpu);
+
+/* 
+ * cpufreq parameter names in this structure match
+ * the sysfs file names used by native Linux
+ */
+typedef xen_userspace_t xc_userspace_t;
+typedef xen_ondemand_t xc_ondemand_t;
+
+struct xc_get_cpufreq_para {
+    /* IN/OUT variable */
+    uint32_t cpu_num;
+    uint32_t freq_num;
+    uint32_t gov_num;
+
+    /* for all governors */
+    /* OUT variable */
+    uint32_t *affected_cpus;
+    uint32_t *scaling_available_frequencies;
+    char     *scaling_available_governors;
+    char scaling_driver[CPUFREQ_NAME_LEN];
+
+    uint32_t cpuinfo_cur_freq;
+    uint32_t cpuinfo_max_freq;
+    uint32_t cpuinfo_min_freq;
+    uint32_t scaling_cur_freq;
+
+    char scaling_governor[CPUFREQ_NAME_LEN];
+    uint32_t scaling_max_freq;
+    uint32_t scaling_min_freq;
+
+    /* for specific governor */
+    union {
+        xc_userspace_t userspace;
+        xc_ondemand_t ondemand;
+    } u;
+};
+
+int xc_get_cpufreq_para(int xc_handle, int cpuid,
+                        struct xc_get_cpufreq_para *user_para);
+int xc_set_cpufreq_gov(int xc_handle, int cpuid, char *govname);
+int xc_set_cpufreq_para(int xc_handle, int cpuid,
+                        int ctrl_type, int ctrl_value);
+int xc_get_cpufreq_avgfreq(int xc_handle, int cpuid, int *avg_freq);
+
+struct xc_get_cputopo {
+     /* IN: maximum addressable entry in
+      * the caller-provided cpu_to_core/socket.
+      */
+    uint32_t max_cpus;
+    uint32_t *cpu_to_core;
+    uint32_t *cpu_to_socket;
+
+    /* OUT: number of cpus returned
+     * If OUT is greater than IN then the cpu_to_core/socket is truncated!
+     */
+    uint32_t nr_cpus;
+};
+
+int xc_get_cputopo(int xc_handle, struct xc_get_cputopo *info);
+
+int xc_set_sched_opt_smt(int xc_handle, uint32_t value);
+
 #endif /* XENCTRL_H */
index ba60326a47847c583eabb25d0cc68bec651010a4..d64fc4554bbc97c1370f86956031bb7125126283 100644 (file)
@@ -130,10 +130,22 @@ int xc_hvm_build(int xc_handle,
                  int memsize,
                  const char *image_name);
 
+int xc_hvm_build_target_mem(int xc_handle,
+                            uint32_t domid,
+                            int memsize,
+                            int target,
+                            const char *image_name);
+
 int xc_hvm_build_mem(int xc_handle,
                      uint32_t domid,
                      int memsize,
                      const char *image_buffer,
                      unsigned long image_size);
 
+int xc_suspend_evtchn_release(int xce, int suspend_evtchn);
+
+int xc_suspend_evtchn_init(int xc, int xce, int domid, int port);
+
+int xc_await_suspend(int xce, int suspend_evtchn);
+
 #endif /* XENGUEST_H */
index d7620933cdda3f428e1b23b380b111a28211af4e..4dc3a5c66f7fdb8b5404b58d28c5b8170ea9460e 100644 (file)
@@ -7,7 +7,6 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <zlib.h>
-#include <strings.h>
 #include <malloc.h>
 
 #include "xg_private.h"
@@ -109,7 +108,7 @@ char *xc_inflate_buffer(const char *in_buf, unsigned long in_size,
                 (256 * ((unsigned char)in_buf[in_size-2] +
                         (256 * (unsigned char)in_buf[in_size-1])))));
 
-    bzero(&zStream, sizeof(zStream));
+    memset(&zStream, 0, sizeof(zStream));
     out_buf = malloc(out_len + 16);        /* Leave a little extra space */
     if ( out_buf == NULL )
     {
@@ -131,6 +130,7 @@ char *xc_inflate_buffer(const char *in_buf, unsigned long in_size,
 
     /* Inflate in one pass/call */
     sts = inflate(&zStream, Z_FINISH);
+    inflateEnd(&zStream);
     if ( sts != Z_STREAM_END )
     {
         ERROR("inflate failed, sts %d\n", sts);
index 10dd156e94612a28ba2433da6c26eb0e573deae9..f9da7127410f3983eb4128e4ca86b79541e3fee4 100644 (file)
@@ -90,6 +90,8 @@ add_param(xmlNode *, const char *, const char *);
 static xmlNode *
 add_param_struct(xmlNode *);
 static xmlNode *
+add_param_array(xmlNode *);
+static xmlNode *
 add_struct_array(xmlNode *, const char *);
 static xmlNode *
 add_nested_struct(xmlNode *, const char *);
@@ -1292,7 +1294,7 @@ make_body_add_type(enum abstract_typename typename, abstract_value *v,
         const struct abstract_type *member_type = v->type->child;
         arbitrary_set *set_val = v->u.struct_val;
         abstract_value v;
-        xmlNode *data_node = add_param_struct(params_node);
+        xmlNode *data_node = add_param_array(params_node);
 
         for (size_t i = 0; i < set_val->size; i++)
         {
@@ -1611,6 +1613,16 @@ add_param_struct(xmlNode *params_node)
 }
 
 
+static xmlNode *
+add_param_array(xmlNode *params_node)
+{
+    xmlNode *param_node = add_container(params_node, "param");
+    xmlNode *value_node = add_container(param_node,  "value");
+
+    return xmlNewChild(value_node, NULL, BAD_CAST "array", NULL);
+}
+
+
 static void
 add_struct_member(xmlNode *struct_node, const char *name, const char *type,
                   const char *value)
index 40e7bbfca378525471c10292ad673689e495fd89..12c599cd75f3333e5a5c39060afa9747d842750b 100644 (file)
@@ -47,7 +47,7 @@ install: build
 
 .PHONY: clean
 clean:
-       $(RM) *.o $(TARGETS) *~
+       $(RM) *.o $(TARGETS) *~ $(DEPS)
        set -e; for d in $(SUBDIRS); do $(MAKE) -C $$d clean; done
 
 %.o: %.c $(HDRS) Makefile
@@ -55,3 +55,5 @@ clean:
 
 xenperf xenpm: %: %.o Makefile
        $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) $(LDFLAGS_libxenctrl)
+
+-include $(DEPS)
index c918945f812a781a06d623fe30d1bde93bc21939..c50cf185accd86e758ece6dcfbf0d10166d0fb25 100644 (file)
@@ -50,17 +50,25 @@ static int check_for_xen(void)
 {
     uint32_t eax, ebx, ecx, edx;
     char signature[13];
+    uint32_t base;
 
-    cpuid(0x40000000, &eax, &ebx, &ecx, &edx);
-    *(uint32_t *)(signature + 0) = ebx;
-    *(uint32_t *)(signature + 4) = ecx;
-    *(uint32_t *)(signature + 8) = edx;
-    signature[12] = '\0';
+    for ( base = 0x40000000; base < 0x40001000; base += 0x100 )
+    {
+        cpuid(base, &eax, &ebx, &ecx, &edx);
 
-    if ( strcmp("XenVMMXenVMM", signature) || (eax < 0x40000002) )
-        return 0;
+        *(uint32_t *)(signature + 0) = ebx;
+        *(uint32_t *)(signature + 4) = ecx;
+        *(uint32_t *)(signature + 8) = edx;
+        signature[12] = '\0';
+
+        if ( !strcmp("XenVMMXenVMM", signature) && (eax >= (base + 2)) )
+            goto found;
+    }
+
+    return 0;
 
-    cpuid(0x40000001, &eax, &ebx, &ecx, &edx);
+ found:
+    cpuid(base + 1, &eax, &ebx, &ecx, &edx);
     printf("Running in %s context on Xen v%d.%d.\n",
            pv_context ? "PV" : "HVM", (uint16_t)(eax >> 16), (uint16_t)eax);
     return 1;
index 618aa27a84a11962f573968f57330627c67057ae..39eef6517f4bc3483f244a23375a825e434353a4 100644 (file)
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
 #include <getopt.h>
 #include <errno.h>
+#include <signal.h>
 
 #include <xenctrl.h>
 #include <inttypes.h>
+#include <sys/time.h>
 
-int main(int argc, char **argv)
+#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
+
+static int xc_fd;
+static int max_cpu_nr;
+
+/* help message */
+void show_help(void)
 {
-    int xc_fd;
-    int i, j, ret = 0;
-    int cinfo = 0, pinfo = 0;
-    int ch;
-    xc_physinfo_t physinfo = { 0 };
+    fprintf(stderr,
+            "xen power management control tool\n\n"
+            "usage: xenpm <command> [args]\n\n"
+            "xenpm command list:\n\n"
+            " get-cpuidle-states    [cpuid]       list cpu idle info of CPU <cpuid> or all\n"
+            " get-cpufreq-states    [cpuid]       list cpu freq info of CPU <cpuid> or all\n"
+            " get-cpufreq-para      [cpuid]       list cpu freq parameter of CPU <cpuid> or all\n"
+            " set-scaling-maxfreq   [cpuid] <HZ>  set max cpu frequency <HZ> on CPU <cpuid>\n"
+            "                                     or all CPUs\n"
+            " set-scaling-minfreq   [cpuid] <HZ>  set min cpu frequency <HZ> on CPU <cpuid>\n"
+            "                                     or all CPUs\n"
+            " set-scaling-speed     [cpuid] <num> set scaling speed on CPU <cpuid> or all\n"
+            "                                     it is used in userspace governor.\n"
+            " set-scaling-governor  [cpuid] <gov> set scaling governor on CPU <cpuid> or all\n"
+            "                                     as userspace/performance/powersave/ondemand\n"
+            " set-sampling-rate     [cpuid] <num> set sampling rate on CPU <cpuid> or all\n"
+            "                                     it is used in ondemand governor.\n"
+            " set-up-threshold      [cpuid] <num> set up threshold on CPU <cpuid> or all\n"
+            "                                     it is used in ondemand governor.\n"
+            " get-cpu-topology                    get thread/core/socket topology info\n"
+            " set-sched-smt           enable|disable enable/disable scheduler smt power saving\n"
+            " start [seconds]                     start collect Cx/Px statistics,\n"
+            "                                     output after CTRL-C or SIGINT or several seconds.\n"
+            );
+}
+/* wrapper function */
+void help_func(int argc, char *argv[])
+{
+    show_help();
+}
+
+static void print_cxstat(int cpuid, struct xc_cx_stat *cxstat)
+{
+    int i;
 
-    while ( (ch = getopt(argc, argv, "cp")) != -1 )
+    printf("cpu id               : %d\n", cpuid);
+    printf("total C-states       : %d\n", cxstat->nr);
+    printf("idle time(ms)        : %"PRIu64"\n",
+           cxstat->idle_time/1000000UL);
+    for ( i = 0; i < cxstat->nr; i++ )
     {
-        switch ( ch )
-        {
-        case 'c':
-            cinfo = 1;
-            break;
-        case 'p':
-            pinfo = 1;
-            break;
-        default:
-            fprintf(stderr, "%s [-p] [-c]\n", argv[0]);
-            return -1;
-        }
+        printf("C%d                   : transition [%020"PRIu64"]\n",
+               i, cxstat->triggers[i]);
+        printf("                       residency  [%020"PRIu64" ms]\n",
+               cxstat->residencies[i]/1000000UL);
     }
+    printf("\n");
+}
+
+/* show cpu idle information on CPU cpuid */
+static int get_cxstat_by_cpuid(int xc_fd, int cpuid, struct xc_cx_stat *cxstat)
+{
+    int ret = 0;
+    int max_cx_num = 0;
+
+    ret = xc_pm_get_max_cx(xc_fd, cpuid, &max_cx_num);
+    if ( ret )
+        return errno;
 
-    if ( !cinfo && !pinfo )
+    if ( !cxstat )
+        return -EINVAL;
+
+    cxstat->triggers = malloc(max_cx_num * sizeof(uint64_t));
+    if ( !cxstat->triggers )
+        return -ENOMEM;
+    cxstat->residencies = malloc(max_cx_num * sizeof(uint64_t));
+    if ( !cxstat->residencies )
     {
-        cinfo = 1;
-        pinfo = 1;
+        free(cxstat->triggers);
+        return -ENOMEM;
     }
 
-    xc_fd = xc_interface_open();
-    if ( xc_fd < 0 )
+    ret = xc_pm_get_cxstat(xc_fd, cpuid, cxstat);
+    if( ret )
     {
-        fprintf(stderr, "failed to get the handler\n");
-        return xc_fd;
+        int temp = errno;
+        free(cxstat->triggers);
+        free(cxstat->residencies);
+        cxstat->triggers = NULL;
+        cxstat->residencies = NULL;
+        return temp;
     }
 
-    ret = xc_physinfo(xc_fd, &physinfo);
+    return 0;
+}
+
+static int show_cxstat_by_cpuid(int xc_fd, int cpuid)
+{
+    int ret = 0;
+    struct xc_cx_stat cxstatinfo;
+
+    ret = get_cxstat_by_cpuid(xc_fd, cpuid, &cxstatinfo);
     if ( ret )
+        return ret;
+
+    print_cxstat(cpuid, &cxstatinfo);
+
+    free(cxstatinfo.triggers);
+    free(cxstatinfo.residencies);
+    return 0;
+}
+
+void cxstat_func(int argc, char *argv[])
+{
+    int cpuid = -1;
+
+    if ( argc > 0 && sscanf(argv[0], "%d", &cpuid) != 1 )
+        cpuid = -1;
+
+    if ( cpuid >= max_cpu_nr )
+        cpuid = -1;
+
+    if ( cpuid < 0 )
     {
-        fprintf(stderr, "failed to get the processor information\n");
-        xc_interface_close(xc_fd);
+        /* show cxstates on all cpus */
+        int i;
+        for ( i = 0; i < max_cpu_nr; i++ )
+            if ( show_cxstat_by_cpuid(xc_fd, i) == -ENODEV )
+                break;
+    }
+    else
+        show_cxstat_by_cpuid(xc_fd, cpuid);
+}
+
+static void print_pxstat(int cpuid, struct xc_px_stat *pxstat)
+{
+    int i;
+
+    printf("cpu id               : %d\n", cpuid);
+    printf("total P-states       : %d\n", pxstat->total);
+    printf("usable P-states      : %d\n", pxstat->usable);
+    printf("current frequency    : %"PRIu64" MHz\n",
+           pxstat->pt[pxstat->cur].freq);
+    for ( i = 0; i < pxstat->total; i++ )
+    {
+        if ( pxstat->cur == i )
+            printf("*P%d", i);
+        else
+            printf("P%d ", i);
+        printf("                  : freq       [%04"PRIu64" MHz]\n",
+               pxstat->pt[i].freq);
+        printf("                       transition [%020"PRIu64"]\n",
+               pxstat->pt[i].count);
+        printf("                       residency  [%020"PRIu64" ms]\n",
+               pxstat->pt[i].residency/1000000UL);
+    }
+    printf("\n");
+}
+
+/* show cpu frequency information on CPU cpuid */
+static int get_pxstat_by_cpuid(int xc_fd, int cpuid, struct xc_px_stat *pxstat)
+{
+    int ret = 0;
+    int max_px_num = 0;
+
+    ret = xc_pm_get_max_px(xc_fd, cpuid, &max_px_num);
+    if ( ret )
+        return errno;
+
+    if ( !pxstat)
+        return -EINVAL;
+
+    pxstat->trans_pt = malloc(max_px_num * max_px_num *
+                              sizeof(uint64_t));
+    if ( !pxstat->trans_pt )
+        return -ENOMEM;
+    pxstat->pt = malloc(max_px_num * sizeof(struct xc_px_val));
+    if ( !pxstat->pt )
+    {
+        free(pxstat->trans_pt);
+        return -ENOMEM;
+    }
+
+    ret = xc_pm_get_pxstat(xc_fd, cpuid, pxstat);
+    if( ret )
+    {
+        int temp = errno;
+        free(pxstat->trans_pt);
+        free(pxstat->pt);
+        pxstat->trans_pt = NULL;
+        pxstat->pt = NULL;
+        return temp;
+    }
+
+    return 0;
+}
+
+/* show cpu actual average freq information on CPU cpuid */
+static int get_avgfreq_by_cpuid(int xc_fd, int cpuid, int *avgfreq)
+{
+    int ret = 0;
+
+    ret = xc_get_cpufreq_avgfreq(xc_fd, cpuid, avgfreq);
+    if ( ret )
+    {
+        return errno;
+    }
+
+    return 0;
+}
+
+static int show_pxstat_by_cpuid(int xc_fd, int cpuid)
+{
+    int ret = 0;
+    struct xc_px_stat pxstatinfo;
+
+    ret = get_pxstat_by_cpuid(xc_fd, cpuid, &pxstatinfo);
+    if ( ret )
         return ret;
+
+    print_pxstat(cpuid, &pxstatinfo);
+
+    free(pxstatinfo.trans_pt);
+    free(pxstatinfo.pt);
+    return 0;
+}
+
+void pxstat_func(int argc, char *argv[])
+{
+    int cpuid = -1;
+
+    if ( argc > 0 && sscanf(argv[0], "%d", &cpuid) != 1 )
+        cpuid = -1;
+
+    if ( cpuid >= max_cpu_nr )
+        cpuid = -1;
+
+    if ( cpuid < 0 )
+    {
+        /* show pxstates on all cpus */
+        int i;
+        for ( i = 0; i < max_cpu_nr; i++ )
+            if ( show_pxstat_by_cpuid(xc_fd, i) == -ENODEV )
+                break;
     }
+    else
+        show_pxstat_by_cpuid(xc_fd, cpuid);
+}
+
+static uint64_t usec_start, usec_end;
+static struct xc_cx_stat *cxstat, *cxstat_start, *cxstat_end;
+static struct xc_px_stat *pxstat, *pxstat_start, *pxstat_end;
+static int *avgfreq;
+static uint64_t *sum, *sum_cx, *sum_px;
+
+static void signal_int_handler(int signo)
+{
+    int i, j;
+    struct timeval tv;
+    int cx_cap = 0, px_cap = 0;
 
-    /* print out the C state information */
-    if ( cinfo )
+    if ( gettimeofday(&tv, NULL) == -1 )
     {
-        int max_cx_num = 0;
-        struct xc_cx_stat cxstatinfo, *cxstat = &cxstatinfo;
+        fprintf(stderr, "failed to get timeofday\n");
+        return ;
+    }
+    usec_end = tv.tv_sec * 1000000UL + tv.tv_usec;
 
-        for ( i = 0; i < physinfo.nr_cpus; i++ )
-        {
-            ret = xc_pm_get_max_cx(xc_fd, i, &max_cx_num);
-            if ( ret )
-            {
-                if ( errno == ENODEV )
-                {
-                    fprintf(stderr, "Xen cpuidle is not enabled!\n");
-                    break;
-                }
-                else
-                {
-                    fprintf(stderr, "[CPU%d] failed to get max C-state\n", i);
-                    continue;
-                }
-            }
+    if ( get_cxstat_by_cpuid(xc_fd, 0, NULL) != -ENODEV )
+    {
+        cx_cap = 1;
+        for ( i = 0; i < max_cpu_nr; i++ )
+            if ( !get_cxstat_by_cpuid(xc_fd, i, &cxstat_end[i]) )
+                for ( j = 0; j < cxstat_end[i].nr; j++ )
+                    sum_cx[i] += cxstat_end[i].residencies[j] -
+                                 cxstat_start[i].residencies[j];
+    }
+
+    if ( get_pxstat_by_cpuid(xc_fd, 0, NULL) != -ENODEV )
+    {
+        px_cap = 1;
+        for ( i = 0; i < max_cpu_nr; i++ )
+            if ( !get_pxstat_by_cpuid(xc_fd, i , &pxstat_end[i]) )
+                for ( j = 0; j < pxstat_end[i].total; j++ )
+                    sum_px[i] += pxstat_end[i].pt[j].residency -
+                                 pxstat_start[i].pt[j].residency;
+    }
 
-            cxstat->triggers = malloc(max_cx_num * sizeof(uint64_t));
-            if ( !cxstat->triggers )
+    for ( i = 0; i < max_cpu_nr; i++ )
+        get_avgfreq_by_cpuid(xc_fd, i, &avgfreq[i]);
+
+    printf("Elapsed time (ms): %"PRIu64"\n", (usec_end - usec_start) / 1000UL);
+    for ( i = 0; i < max_cpu_nr; i++ )
+    {
+        uint64_t res, triggers;
+        double avg_res;
+
+        printf("\nCPU%d:\tResidency(ms)\t\tAvg Res(ms)\n",i);
+        if ( cx_cap && sum_cx[i] > 0 )
+        {
+            for ( j = 0; j < cxstat_end[i].nr; j++ )
             {
-                fprintf(stderr, "failed to malloc for C-states triggers\n");
-                break;
+                res = cxstat_end[i].residencies[j] -
+                    cxstat_start[i].residencies[j];
+                triggers = cxstat_end[i].triggers[j] -
+                    cxstat_start[i].triggers[j];
+                avg_res = (triggers==0) ? 0: (double)res/triggers/1000000.0;
+                printf("  C%d\t%"PRIu64"\t(%5.2f%%)\t%.2f\n", j, res/1000000UL,
+                        100 * res / (double)sum_cx[i], avg_res );
             }
-            cxstat->residencies = malloc(max_cx_num * sizeof(uint64_t));
-            if ( !cxstat->residencies )
+            printf("\n");
+        }
+        if ( px_cap && sum_px[i]>0 )
+        {
+            for ( j = 0; j < pxstat_end[i].total; j++ )
             {
-                fprintf(stderr, "failed to malloc for C-states residencies\n");
-                free(cxstat->triggers);
-                break;
+                res = pxstat_end[i].pt[j].residency -
+                    pxstat_start[i].pt[j].residency;
+                printf("  P%d\t%"PRIu64"\t(%5.2f%%)\n", j,
+                        res / 1000000UL, 100UL * res / (double)sum_px[i]);
             }
+        }
+        printf("  Avg freq\t%d\tKHz\n", avgfreq[i]);
+    }
 
-            ret = xc_pm_get_cxstat(xc_fd, i, cxstat);
-            if( ret )
-            {
-                fprintf(stderr, "[CPU%d] failed to get C-states statistics "
-                        "information\n", i);
-                free(cxstat->triggers);
-                free(cxstat->residencies);
-                continue;
-            }
+    /* some clean up and then exits */
+    for ( i = 0; i < 2 * max_cpu_nr; i++ )
+    {
+        free(cxstat[i].triggers);
+        free(cxstat[i].residencies);
+        free(pxstat[i].trans_pt);
+        free(pxstat[i].pt);
+    }
+    free(cxstat);
+    free(pxstat);
+    free(sum);
+    free(avgfreq);
+    xc_interface_close(xc_fd);
+    exit(0);
+}
 
-            printf("cpu id               : %d\n", i);
-            printf("total C-states       : %d\n", cxstat->nr);
-            printf("idle time(ms)        : %"PRIu64"\n",
-                   cxstat->idle_time/1000000UL);
-            for ( j = 0; j < cxstat->nr; j++ )
-            {
-                printf("C%d                   : transition [%020"PRIu64"]\n",
-                       j, cxstat->triggers[j]);
-                printf("                       residency  [%020"PRIu64" ms]\n",
-                       cxstat->residencies[j]*1000000UL/3579/1000000UL);
-            }
+void start_gather_func(int argc, char *argv[])
+{
+    int i;
+    struct timeval tv;
+    int timeout = 0;
 
-            free(cxstat->triggers);
-            free(cxstat->residencies);
+    if ( argc == 1 )
+    {
+        sscanf(argv[0], "%d", &timeout);
+        if ( timeout <= 0 )
+            fprintf(stderr, "failed to set timeout seconds, falling back...\n");
+        else
+            printf("Timeout set to %d seconds\n", timeout);
+    }
 
-            printf("\n");
+    if ( gettimeofday(&tv, NULL) == -1 )
+    {
+        fprintf(stderr, "failed to get timeofday\n");
+        return ;
+    }
+    usec_start = tv.tv_sec * 1000000UL + tv.tv_usec;
+
+    sum = malloc(sizeof(uint64_t) * 2 * max_cpu_nr);
+    if ( sum == NULL )
+        return ;
+    cxstat = malloc(sizeof(struct xc_cx_stat) * 2 * max_cpu_nr);
+    if ( cxstat == NULL )
+    {
+        free(sum);
+        return ;
+    }
+    pxstat = malloc(sizeof(struct xc_px_stat) * 2 * max_cpu_nr);
+    if ( pxstat == NULL )
+    {
+        free(sum);
+        free(cxstat);
+        return ;
+    }
+    avgfreq = malloc(sizeof(int) * max_cpu_nr);
+    if ( avgfreq == NULL )
+    {
+        free(sum);
+        free(cxstat);
+        free(pxstat);
+        return ;
+    }
+    memset(sum, 0, sizeof(uint64_t) * 2 * max_cpu_nr);
+    memset(cxstat, 0, sizeof(struct xc_cx_stat) * 2 * max_cpu_nr);
+    memset(pxstat, 0, sizeof(struct xc_px_stat) * 2 * max_cpu_nr);
+    memset(avgfreq, 0, sizeof(int) * max_cpu_nr);
+    sum_cx = sum;
+    sum_px = sum + max_cpu_nr;
+    cxstat_start = cxstat;
+    cxstat_end = cxstat + max_cpu_nr;
+    pxstat_start = pxstat;
+    pxstat_end = pxstat + max_cpu_nr;
+
+    if ( get_cxstat_by_cpuid(xc_fd, 0, NULL) == -ENODEV &&
+         get_pxstat_by_cpuid(xc_fd, 0, NULL) == -ENODEV )
+    {
+        fprintf(stderr, "Xen cpu idle and frequency is disabled!\n");
+        return ;
+    }
+
+    for ( i = 0; i < max_cpu_nr; i++ )
+    {
+        get_cxstat_by_cpuid(xc_fd, i, &cxstat_start[i]);
+        get_pxstat_by_cpuid(xc_fd, i, &pxstat_start[i]);
+        get_avgfreq_by_cpuid(xc_fd, i, &avgfreq[i]);
+    }
+
+    if (signal(SIGINT, signal_int_handler) == SIG_ERR)
+    {
+        fprintf(stderr, "failed to set signal int handler\n");
+        free(sum);
+        free(pxstat);
+        free(cxstat);
+        free(avgfreq);
+        return ;
+    }
+
+    if ( timeout > 0 )
+    {
+        if ( signal(SIGALRM, signal_int_handler) == SIG_ERR )
+        {
+            fprintf(stderr, "failed to set signal alarm handler\n");
+            free(sum);
+            free(pxstat);
+            free(cxstat);
+            free(avgfreq);
+            return ;
         }
+        alarm(timeout);
     }
 
-    /* print out P state information */
-    if ( pinfo )
+    printf("Start sampling, waiting for CTRL-C or SIGINT or SIGALARM signal ...\n");
+
+    pause();
+}
+
+/* print out parameters about cpu frequency */
+static void print_cpufreq_para(int cpuid, struct xc_get_cpufreq_para *p_cpufreq)
+{
+    int i;
+
+    printf("cpu id               : %d\n", cpuid);
+
+    printf("affected_cpus        :");
+    for ( i = 0; i < p_cpufreq->cpu_num; i++ )
+        if ( i == cpuid )
+            printf(" *%d", p_cpufreq->affected_cpus[i]);
+        else
+            printf(" %d", p_cpufreq->affected_cpus[i]);
+    printf("\n");
+
+    printf("cpuinfo frequency    : max [%u] min [%u] cur [%u]\n",
+           p_cpufreq->cpuinfo_max_freq,
+           p_cpufreq->cpuinfo_min_freq,
+           p_cpufreq->cpuinfo_cur_freq);
+
+    printf("scaling_driver       : %s\n", p_cpufreq->scaling_driver);
+
+    printf("scaling_avail_gov    : %s\n",
+           p_cpufreq->scaling_available_governors);
+
+    printf("current_governor     : %s\n", p_cpufreq->scaling_governor);
+    if ( !strncmp(p_cpufreq->scaling_governor,
+                  "userspace", CPUFREQ_NAME_LEN) )
+    {
+        printf("  userspace specific :\n");
+        printf("    scaling_setspeed : %u\n",
+               p_cpufreq->u.userspace.scaling_setspeed);
+    }
+    else if ( !strncmp(p_cpufreq->scaling_governor,
+                       "ondemand", CPUFREQ_NAME_LEN) )
+    {
+        printf("  ondemand specific  :\n");
+        printf("    sampling_rate    : max [%u] min [%u] cur [%u]\n",
+               p_cpufreq->u.ondemand.sampling_rate_max,
+               p_cpufreq->u.ondemand.sampling_rate_min,
+               p_cpufreq->u.ondemand.sampling_rate);
+        printf("    up_threshold     : %u\n",
+               p_cpufreq->u.ondemand.up_threshold);
+    }
+
+    printf("scaling_avail_freq   :");
+    for ( i = 0; i < p_cpufreq->freq_num; i++ )
+        if ( p_cpufreq->scaling_available_frequencies[i] ==
+             p_cpufreq->scaling_cur_freq )
+            printf(" *%d", p_cpufreq->scaling_available_frequencies[i]);
+        else
+            printf(" %d", p_cpufreq->scaling_available_frequencies[i]);
+    printf("\n");
+
+    printf("scaling frequency    : max [%u] min [%u] cur [%u]\n",
+           p_cpufreq->scaling_max_freq,
+           p_cpufreq->scaling_min_freq,
+           p_cpufreq->scaling_cur_freq);
+    printf("\n");
+}
+
+/* show cpu frequency parameters information on CPU cpuid */
+static int show_cpufreq_para_by_cpuid(int xc_fd, int cpuid)
+{
+    int ret = 0;
+    struct xc_get_cpufreq_para cpufreq_para, *p_cpufreq = &cpufreq_para;
+
+    p_cpufreq->cpu_num = 0;
+    p_cpufreq->freq_num = 0;
+    p_cpufreq->gov_num = 0;
+    p_cpufreq->affected_cpus = NULL;
+    p_cpufreq->scaling_available_frequencies = NULL;
+    p_cpufreq->scaling_available_governors = NULL;
+
+    do
     {
-        int max_px_num = 0;
-        struct xc_px_stat pxstatinfo, *pxstat = &pxstatinfo;
+        free(p_cpufreq->affected_cpus);
+        free(p_cpufreq->scaling_available_frequencies);
+        free(p_cpufreq->scaling_available_governors);
+
+        p_cpufreq->affected_cpus = NULL;
+        p_cpufreq->scaling_available_frequencies = NULL;
+        p_cpufreq->scaling_available_governors = NULL;
 
-        for ( i = 0; i < physinfo.nr_cpus; i++ )
+        if (!(p_cpufreq->affected_cpus =
+              malloc(p_cpufreq->cpu_num * sizeof(uint32_t))))
         {
-            ret = xc_pm_get_max_px(xc_fd, i, &max_px_num);
-            if ( ret )
-            {
-                if ( errno == ENODEV )
-                {
-                    printf("Xen cpufreq is not enabled!\n");
-                    break;
-                }
-                else
-                {
-                    fprintf(stderr, "[CPU%d] failed to get max P-state\n", i);
-                    continue;
-                }
-            }
+            fprintf(stderr,
+                    "[CPU%d] failed to malloc for affected_cpus\n",
+                    cpuid);
+            ret = -ENOMEM;
+            goto out;
+        }
+        if (!(p_cpufreq->scaling_available_frequencies =
+              malloc(p_cpufreq->freq_num * sizeof(uint32_t))))
+        {
+            fprintf(stderr,
+                    "[CPU%d] failed to malloc for scaling_available_frequencies\n",
+                    cpuid);
+            ret = -ENOMEM;
+            goto out;
+        }
+        if (!(p_cpufreq->scaling_available_governors =
+              malloc(p_cpufreq->gov_num * CPUFREQ_NAME_LEN * sizeof(char))))
+        {
+            fprintf(stderr,
+                    "[CPU%d] failed to malloc for scaling_available_governors\n",
+                    cpuid);
+            ret = -ENOMEM;
+            goto out;
+        }
 
-            pxstat->trans_pt = malloc(max_px_num * max_px_num *
-                                      sizeof(uint64_t));
-            if ( !pxstat->trans_pt )
-            {
-                fprintf(stderr, "failed to malloc for P-states "
-                        "transition table\n");
-                break;
-            }
-            pxstat->pt = malloc(max_px_num * sizeof(struct xc_px_val));
-            if ( !pxstat->pt )
-            {
-                fprintf(stderr, "failed to malloc for P-states table\n");
-                free(pxstat->pt);
+        ret = xc_get_cpufreq_para(xc_fd, cpuid, p_cpufreq);
+    } while ( ret && errno == EAGAIN );
+
+    if ( ret == 0 )
+        print_cpufreq_para(cpuid, p_cpufreq);
+    else if ( errno == ENODEV )
+    {
+        ret = -ENODEV;
+        fprintf(stderr, "Xen cpufreq is not enabled!\n");
+    }
+    else
+        fprintf(stderr,
+                "[CPU%d] failed to get cpufreq parameter\n",
+                cpuid);
+
+out:
+    free(p_cpufreq->scaling_available_governors);
+    free(p_cpufreq->scaling_available_frequencies);
+    free(p_cpufreq->affected_cpus);
+
+    return ret;
+}
+
+void cpufreq_para_func(int argc, char *argv[])
+{
+    int cpuid = -1;
+
+    if ( argc > 0 && sscanf(argv[0], "%d", &cpuid) != 1 )
+        cpuid = -1;
+
+    if ( cpuid >= max_cpu_nr )
+        cpuid = -1;
+
+    if ( cpuid < 0 )
+    {
+        /* show cpu freqency information on all cpus */
+        int i;
+        for ( i = 0; i < max_cpu_nr; i++ )
+            if ( show_cpufreq_para_by_cpuid(xc_fd, i) == -ENODEV )
                 break;
-            }
+    }
+    else
+        show_cpufreq_para_by_cpuid(xc_fd, cpuid);
+}
 
-            ret = xc_pm_get_pxstat(xc_fd, i, pxstat);
-            if( ret )
-            {
-                fprintf(stderr, "[CPU%d] failed to get P-states "
-                        "statistics information\n", i);
-                free(pxstat->trans_pt);
-                free(pxstat->pt);
-                continue;
-            }
+void scaling_max_freq_func(int argc, char *argv[])
+{
+    int cpuid = -1, freq = -1;
+
+    if ( (argc >= 2 && (sscanf(argv[1], "%d", &freq) != 1 ||
+                        sscanf(argv[0], "%d", &cpuid) != 1)) ||
+         (argc == 1 && sscanf(argv[0], "%d", &freq) != 1 ) ||
+         argc == 0 )
+    {
+        fprintf(stderr, "failed to set scaling max freq\n");
+        return ;
+    }
+
+    if ( cpuid < 0 )
+    {
+        int i;
+        for ( i = 0; i < max_cpu_nr; i++ )
+            if ( xc_set_cpufreq_para(xc_fd, i, SCALING_MAX_FREQ, freq) )
+                fprintf(stderr, "[CPU%d] failed to set scaling max freq\n", i);
+    }
+    else
+    {
+        if ( xc_set_cpufreq_para(xc_fd, cpuid, SCALING_MAX_FREQ, freq) )
+            fprintf(stderr, "failed to set scaling max freq\n");
+    }
+}
+
+void scaling_min_freq_func(int argc, char *argv[])
+{
+    int cpuid = -1, freq = -1;
+
+    if ( (argc >= 2 && (sscanf(argv[1], "%d", &freq) != 1 ||
+                        sscanf(argv[0], "%d", &cpuid) != 1) ) ||
+         (argc == 1 && sscanf(argv[0], "%d", &freq) != 1 ) ||
+         argc == 0 )
+    {
+        fprintf(stderr, "failed to set scaling min freq\n");
+        return ;
+    }
+
+    if ( cpuid < 0 )
+    {
+        int i;
+        for ( i = 0; i < max_cpu_nr; i++ )
+            if ( xc_set_cpufreq_para(xc_fd, i, SCALING_MIN_FREQ, freq) )
+                fprintf(stderr, "[CPU%d] failed to set scaling min freq\n", i);
+    }
+    else
+    {
+        if ( xc_set_cpufreq_para(xc_fd, cpuid, SCALING_MIN_FREQ, freq) )
+            fprintf(stderr, "failed to set scaling min freq\n");
+    }
+}
+
+void scaling_speed_func(int argc, char *argv[])
+{
+    int cpuid = -1, speed = -1;
+
+    if ( (argc >= 2 && (sscanf(argv[1], "%d", &speed) != 1 ||
+                        sscanf(argv[0], "%d", &cpuid) != 1) ) ||
+         (argc == 1 && sscanf(argv[0], "%d", &speed) != 1 ) ||
+         argc == 0 )
+    {
+        fprintf(stderr, "failed to set scaling speed\n");
+        return ;
+    }
+
+    if ( cpuid < 0 )
+    {
+        int i;
+        for ( i = 0; i < max_cpu_nr; i++ )
+            if ( xc_set_cpufreq_para(xc_fd, i, SCALING_SETSPEED, speed) )
+                fprintf(stderr, "[CPU%d] failed to set scaling speed\n", i);
+    }
+    else
+    {
+        if ( xc_set_cpufreq_para(xc_fd, cpuid, SCALING_SETSPEED, speed) )
+            fprintf(stderr, "failed to set scaling speed\n");
+    }
+}
+
+void scaling_sampling_rate_func(int argc, char *argv[])
+{
+    int cpuid = -1, rate = -1;
+
+    if ( (argc >= 2 && (sscanf(argv[1], "%d", &rate) != 1 ||
+                        sscanf(argv[0], "%d", &cpuid) != 1) ) ||
+         (argc == 1 && sscanf(argv[0], "%d", &rate) != 1 ) ||
+         argc == 0 )
+    {
+        fprintf(stderr, "failed to set scaling sampling rate\n");
+        return ;
+    }
+
+    if ( cpuid < 0 )
+    {
+        int i;
+        for ( i = 0; i < max_cpu_nr; i++ )
+            if ( xc_set_cpufreq_para(xc_fd, i, SAMPLING_RATE, rate) )
+                fprintf(stderr,
+                        "[CPU%d] failed to set scaling sampling rate\n", i);
+    }
+    else
+    {
+        if ( xc_set_cpufreq_para(xc_fd, cpuid, SAMPLING_RATE, rate) )
+            fprintf(stderr, "failed to set scaling sampling rate\n");
+    }
+}
 
-            printf("cpu id               : %d\n", i);
-            printf("total P-states       : %d\n", pxstat->total);
-            printf("usable P-states      : %d\n", pxstat->usable);
-            printf("current frequency    : %"PRIu64" MHz\n",
-                   pxstat->pt[pxstat->cur].freq);
-            for ( j = 0; j < pxstat->total; j++ )
+void scaling_up_threshold_func(int argc, char *argv[])
+{
+    int cpuid = -1, threshold = -1;
+
+    if ( (argc >= 2 && (sscanf(argv[1], "%d", &threshold) != 1 ||
+                        sscanf(argv[0], "%d", &cpuid) != 1) ) ||
+         (argc == 1 && sscanf(argv[0], "%d", &threshold) != 1 ) ||
+         argc == 0 )
+    {
+        fprintf(stderr, "failed to set up scaling threshold\n");
+        return ;
+    }
+
+    if ( cpuid < 0 )
+    {
+        int i;
+        for ( i = 0; i < max_cpu_nr; i++ )
+            if ( xc_set_cpufreq_para(xc_fd, i, UP_THRESHOLD, threshold) )
+                fprintf(stderr,
+                        "[CPU%d] failed to set up scaling threshold\n", i);
+    }
+    else
+    {
+        if ( xc_set_cpufreq_para(xc_fd, cpuid, UP_THRESHOLD, threshold) )
+            fprintf(stderr, "failed to set up scaling threshold\n");
+    }
+}
+
+void scaling_governor_func(int argc, char *argv[])
+{
+    int cpuid = -1;
+    char *name = NULL;
+
+    if ( argc >= 2 )
+    {
+        name = strdup(argv[1]);
+        if ( name == NULL )
+            goto out;
+        if ( sscanf(argv[0], "%d", &cpuid) != 1 )
+        {
+            free(name);
+            goto out;
+        }
+    }
+    else if ( argc > 0 )
+    {
+        name = strdup(argv[0]);
+        if ( name == NULL )
+            goto out;
+    }
+    else
+        goto out;
+
+    if ( cpuid < 0 )
+    {
+        int i;
+        for ( i = 0; i < max_cpu_nr; i++ )
+            if ( xc_set_cpufreq_gov(xc_fd, i, name) )
+                fprintf(stderr, "[CPU%d] failed to set governor name\n", i);
+    }
+    else
+    {
+        if ( xc_set_cpufreq_gov(xc_fd, cpuid, name) )
+            fprintf(stderr, "failed to set governor name\n");
+    }
+
+    free(name);
+    return ;
+out:
+    fprintf(stderr, "failed to set governor name\n");
+}
+
+#define MAX_NR_CPU 512
+
+void cpu_topology_func(int argc, char *argv[])
+{
+    uint32_t cpu_to_core[MAX_NR_CPU];
+    uint32_t cpu_to_socket[MAX_NR_CPU];
+    struct xc_get_cputopo info;
+    int i, ret;
+
+    info.cpu_to_core = cpu_to_core;
+    info.cpu_to_socket = cpu_to_socket;
+    info.max_cpus = MAX_NR_CPU;
+    ret = xc_get_cputopo(xc_fd, &info);
+    if (!ret)
+    {
+        printf("CPU\tcore\tsocket\n");
+        for (i=0; i<info.nr_cpus; i++)
+        {
+            if ( info.cpu_to_core[i] != INVALID_TOPOLOGY_ID &&
+                    info.cpu_to_socket[i] != INVALID_TOPOLOGY_ID )
             {
-                if ( pxstat->cur == j )
-                    printf("*P%d", j);
-                else
-                    printf("P%d ", j);
-                printf("                  : freq       [%04"PRIu64" MHz]\n",
-                       pxstat->pt[j].freq);
-                printf("                       transition [%020"PRIu64"]\n",
-                       pxstat->pt[j].count);
-                printf("                       residency  [%020"PRIu64" ms]\n",
-                       pxstat->pt[j].residency/1000000UL);
+            printf("CPU%d\t %d\t %d\n", i, info.cpu_to_core[i],
+                    info.cpu_to_socket[i]);
             }
+        }
+    }
+    else
+    {
+        printf("Can not get Xen CPU topology!\n");
+    }
 
-            free(pxstat->trans_pt);
-            free(pxstat->pt);
+    return ;
+}
 
-            printf("\n");
-        }
+void set_sched_smt_func(int argc, char *argv[])
+{
+    int value, rc;
+
+    if (argc != 1){
+        show_help();
+        exit(-1);
+    }
+
+    if ( !strncmp(argv[0], "disable", sizeof("disable")) )
+    {
+        value = 0;
+    }
+    else if ( !strncmp(argv[0], "enable", sizeof("enable")) )
+    {
+        value = 1;
+    }
+    else
+    {
+        show_help();
+        exit(-1);
+    }
+
+    rc = xc_set_sched_opt_smt(xc_fd, value);
+    printf("%s sched_smt_power_savings %s\n", argv[0],
+                    rc? "failed":"successeed" );
+
+    return;
+}
+
+struct {
+    const char *name;
+    void (*function)(int argc, char *argv[]);
+} main_options[] = {
+    { "help", help_func },
+    { "get-cpuidle-states", cxstat_func },
+    { "get-cpufreq-states", pxstat_func },
+    { "start", start_gather_func },
+    { "get-cpufreq-para", cpufreq_para_func },
+    { "set-scaling-maxfreq", scaling_max_freq_func },
+    { "set-scaling-minfreq", scaling_min_freq_func },
+    { "set-scaling-governor", scaling_governor_func },
+    { "set-scaling-speed", scaling_speed_func },
+    { "set-sampling-rate", scaling_sampling_rate_func },
+    { "set-up-threshold", scaling_up_threshold_func },
+    { "get-cpu-topology", cpu_topology_func},
+    { "set-sched-smt", set_sched_smt_func},
+};
+
+int main(int argc, char *argv[])
+{
+    int i, ret = 0;
+    xc_physinfo_t physinfo = { 0 };
+    int nr_matches = 0;
+    int matches_main_options[ARRAY_SIZE(main_options)];
+
+    if ( argc < 2 )
+    {
+        show_help();
+        return 0;
+    }
+
+    xc_fd = xc_interface_open();
+    if ( xc_fd < 0 )
+    {
+        fprintf(stderr, "failed to get the handler\n");
+        return 0;
     }
 
+    ret = xc_physinfo(xc_fd, &physinfo);
+    if ( ret )
+    {
+        fprintf(stderr, "failed to get the processor information\n");
+        xc_interface_close(xc_fd);
+        return 0;
+    }
+    max_cpu_nr = physinfo.nr_cpus;
+
+    /* calculate how many options match with user's input */
+    for ( i = 0; i < ARRAY_SIZE(main_options); i++ )
+        if ( !strncmp(main_options[i].name, argv[1], strlen(argv[1])) )
+            matches_main_options[nr_matches++] = i;
+
+    if ( nr_matches > 1 )
+    {
+        fprintf(stderr, "Ambigious options: ");
+        for ( i = 0; i < nr_matches; i++ )
+            fprintf(stderr, " %s", main_options[matches_main_options[i]].name);
+        fprintf(stderr, "\n");
+    }
+    else if ( nr_matches == 1 )
+        /* dispatch to the corresponding function handler */
+        main_options[matches_main_options[0]].function(argc - 2, argv + 2);
+    else
+        show_help();
+
     xc_interface_close(xc_fd);
-    return ret;
+    return 0;
 }
 
index 3629d897423035494112217490e151f69a2766da..03c8ed9530c12571b8eb5acb762e790819beaa3c 100644 (file)
@@ -22,4 +22,6 @@ endif
 
 .PHONY: clean
 clean:
-       rm -rf build tmp *.pyc *.pyo *.o *.a *~ a.out
+       rm -rf build tmp *.pyc *.pyo *.o *.a *~ a.out $(DEPS)
+
+-include $(DEPS)
index ae15af187ef0efd1ef0e30ce1ac24242b5ab43c1..ba70832e62e01716d7049a91882bd5e9fe9571fa 100644 (file)
@@ -441,7 +441,11 @@ class Grub:
                 # Timed out waiting for a keypress
                 if mytime != -1:
                     mytime += 1
-                    if mytime >= int(timeout):
+                    # curses.timeout() does not work properly on Solaris
+                    # So we may come here even after a key has been pressed.
+                    # Check both timeout and mytime to avoid exiting
+                    # when we shouldn't.
+                    if timeout != -1 and mytime >= int(timeout):
                         self.isdone = True
                         break
             else:
@@ -501,7 +505,7 @@ def get_entry_idx(cf, entry):
 
     return None
 
-def run_grub(file, entry, fs):
+def run_grub(file, entry, fs, arg):
     global g
     global sel
 
@@ -526,7 +530,11 @@ def run_grub(file, entry, fs):
         print "No kernel image selected!"
         sys.exit(1)
 
-    img = g.cf.images[sel]
+    try:
+        img = g.cf.images[sel]
+    except:
+        log.debug("PyGrub: Default selection is not valid, using first boot configuration...")
+        img = g.cf.images[0]
 
     grubcfg = { "kernel": None, "ramdisk": None, "args": None }
 
@@ -534,7 +542,7 @@ def run_grub(file, entry, fs):
     if img.initrd:
         grubcfg["ramdisk"] = img.initrd[1]
     if img.args:
-        grubcfg["args"] = img.args
+        grubcfg["args"] = img.args + " " + arg
 
     return grubcfg
 
@@ -579,6 +587,15 @@ def sniff_solaris(fs, cfg):
 
     return cfg
  
+def sniff_netware(fs, cfg):
+    if not fs.file_exists("/nwserver/xnloader.sys"):
+        return cfg
+
+    if not cfg["kernel"]:
+        cfg["kernel"] = "/nwserver/xnloader.sys"
+
+    return cfg
+
 if __name__ == "__main__":
     sel = None
     
@@ -605,7 +622,7 @@ if __name__ == "__main__":
     isconfig = False
 
     # what was passed in
-    incfg = { "kernel": None, "ramdisk": None, "args": None }
+    incfg = { "kernel": None, "ramdisk": None, "args": "" }
     # what grub or sniffing chose
     chosencfg = { "kernel": None, "ramdisk": None, "args": None }
     # what to boot
@@ -641,7 +658,7 @@ if __name__ == "__main__":
 
     # debug
     if isconfig:
-        chosencfg = run_grub(file, entry)
+        chosencfg = run_grub(file, entry, fs, incfg["args"])
         print "  kernel: %s" % chosencfg["kernel"]
         if img.initrd:
             print "  initrd: %s" % chosencfg["ramdisk"]
@@ -659,7 +676,10 @@ if __name__ == "__main__":
     chosencfg = sniff_solaris(fs, incfg)
 
     if not chosencfg["kernel"]:
-        chosencfg = run_grub(file, entry, fs)
+        chosencfg = sniff_netware(fs, incfg)
+
+    if not chosencfg["kernel"]:
+        chosencfg = run_grub(file, entry, fs, incfg["args"])
 
     data = fs.open_file(chosencfg["kernel"]).read()
     (tfd, bootcfg["kernel"]) = tempfile.mkstemp(prefix="boot_kernel.",
index c2e5c9c3189732259551b9ccb900a5cbaa629be3..4ae0a2d777a23b7b8149c5edcbc2c1308aaac9fd 100644 (file)
@@ -85,3 +85,6 @@ test:
 .PHONY: clean
 clean:
        rm -rf build *.pyc *.pyo *.o *.a *~ $(CATALOGS) xen/util/auxbin.pyc
+       rm -f $(DEPS)
+
+-include $(DEPS)
index 0afc168b665745f2d91acaa28146f2841a548014..6edd054bb332c153276265ed92a4d301e4bb8336 100644 (file)
@@ -38,6 +38,13 @@ scf = Extension("scf",
                libraries          = libraries,
                sources            = [ "xen/lowlevel/scf/scf.c" ])
              
+process = Extension("process",
+               extra_compile_args = extra_compile_args,
+               include_dirs       = include_dirs + [ "xen/lowlevel/process" ],
+               library_dirs       = library_dirs,
+               libraries          = libraries + [ "contract" ],
+               sources            = [ "xen/lowlevel/process/process.c" ])
+
 acm = Extension("acm",
                extra_compile_args = extra_compile_args,
                include_dirs       = include_dirs + [ "xen/lowlevel/acm" ],
@@ -63,6 +70,7 @@ ptsname = Extension("ptsname",
 modules = [ xc, xs, ptsname, acm, flask ]
 if os.uname()[0] == 'SunOS':
     modules.append(scf)
+    modules.append(process)
 
 setup(name            = 'xen',
       version         = '3.0',
index b8387611b703d08bdb5adc6713c89a240c1be7bb..bf96cf2af626b73fdae2e187fa7686f1c178394b 100644 (file)
@@ -55,6 +55,7 @@ static PyObject *pyflask_context_to_sid(PyObject *self, PyObject *args,
     xc_handle = xc_interface_open();
     if (xc_handle < 0) {
         errno = xc_handle;
+        free(buf);
         return PyErr_SetFromErrno(xc_error_obj);
     }
     
diff --git a/tools/python/xen/lowlevel/process/process.c b/tools/python/xen/lowlevel/process/process.c
new file mode 100644 (file)
index 0000000..a1e71bc
--- /dev/null
@@ -0,0 +1,164 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <Python.h>
+
+#include <libcontract.h>
+#include <sys/contract/process.h>
+#include <fcntl.h>
+#include <stdio.h>
+
+/*
+ * On Solaris, xend runs under a contract as an smf(5) service.  As a
+ * result, when spawning long-running children such as a domain's
+ * qemu-dm instantiation, we have to make sure it's in a separate
+ * contract. Before we fork, we must activate a separate process
+ * contract template to place the child processes in a new contract.
+ */
+
+static PyObject *
+pyprocess_activate(PyObject *o, PyObject *args, PyObject *kwargs)
+{
+       static char *kwlist[] = { "name", NULL };
+       char *name = NULL;
+       int flags;
+       int cfd;
+
+       if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|s", kwlist, &name))
+               return (NULL);
+
+       cfd = open64("/system/contract/process/template", O_RDWR);
+
+       if (cfd == -1)
+               goto err;
+
+       if ((flags = fcntl(cfd, F_GETFD, 0)) == -1)
+               goto err;
+       
+       if (fcntl(cfd, F_SETFD, flags | FD_CLOEXEC) == -1)
+               goto err;
+
+       if (name != NULL)
+               ct_pr_tmpl_set_svc_aux(cfd, name);
+
+       if (ct_tmpl_activate(cfd))
+               goto err;
+
+       return (PyInt_FromLong((long)cfd));
+
+err:
+       if (cfd != -1)
+               close(cfd);
+       PyErr_SetFromErrno(PyExc_OSError);
+       return (NULL);
+}
+
+static PyObject *
+pyprocess_clear(PyObject *o, PyObject *args, PyObject *kwargs)
+{
+       static char *kwlist[] = { "contract", NULL };
+       int cfd;
+
+       if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i", kwlist, &cfd))
+               return (NULL);
+
+       if (ct_tmpl_clear(cfd) != 0) {
+               PyErr_SetFromErrno(PyExc_OSError);
+               return (NULL);
+       }
+
+       close(cfd);
+
+       Py_INCREF(Py_None);
+       return (Py_None);
+}
+
+static PyObject *
+pyprocess_abandon_latest(PyObject *o, PyObject *args, PyObject *kwargs)
+{
+       static char *kwlist[] = { NULL };
+       static char path[PATH_MAX];
+       ct_stathdl_t st;
+       ctid_t latest;
+       int cfd;
+
+       if (!PyArg_ParseTupleAndKeywords(args, kwargs, "", kwlist))
+               return (NULL);
+
+       cfd = open64("/system/contract/process/latest", O_RDONLY);
+       if (cfd == -1)
+               goto err;
+
+       ct_status_read(cfd, CTD_COMMON, &st);
+       latest = ct_status_get_id(st);
+       ct_status_free(st);
+       close(cfd);
+
+       snprintf(path, PATH_MAX, "/system/contract/process/%ld/ctl",
+           (long)latest);
+
+       if ((cfd = open64(path, O_WRONLY)) < 0) 
+               goto err;
+       if (ct_ctl_abandon(cfd))
+               goto err;
+       close(cfd);
+
+       Py_INCREF(Py_None);
+       return (Py_None);
+err:
+       PyErr_SetFromErrno(PyExc_OSError);
+       return (NULL);
+}
+
+PyDoc_STRVAR(pyprocess_activate__doc__,
+    "activate(name)\n"
+    "\n"
+    "Activate a new process contract template. If name is given,\n"
+    "it is used as the template's auxiliary value.\n"
+    "Returns the new contract template.\n");
+PyDoc_STRVAR(pyprocess_clear__doc__,
+    "clear(contract)\n"
+    "\n"
+    "Clear and close the given contract template.\n");
+
+PyDoc_STRVAR(pyprocess_abandon_latest__doc__,
+    "abandon_latest()\n"
+    "\n"
+    "Abandon the latest contract created by this thread.\n");
+
+static struct PyMethodDef pyprocess_module_methods[] = {
+    { "activate", (PyCFunction) pyprocess_activate,
+      METH_VARARGS|METH_KEYWORDS, pyprocess_activate__doc__ },
+    { "clear", (PyCFunction) pyprocess_clear,
+      METH_VARARGS|METH_KEYWORDS, pyprocess_clear__doc__ },
+    { "abandon_latest", (PyCFunction) pyprocess_abandon_latest,
+      METH_VARARGS|METH_KEYWORDS, pyprocess_abandon_latest__doc__ },
+    { NULL, NULL, 0, NULL }    
+};
+
+PyMODINIT_FUNC
+initprocess(void)
+{
+       Py_InitModule("process", pyprocess_module_methods);
+}
index 59a16b3ee7bf4d7659b6c02c3b4086f123bc9c0d..2c5096fe69c19606283949a2c75988ba0d0e2aea 100644 (file)
@@ -678,19 +678,22 @@ static PyObject *pyxc_get_device_group(XcObject *self,
 
     if ( rc < 0 )
     {
-      free(sdev_array); 
-      return pyxc_error_to_exception();
+        free(sdev_array); 
+        return pyxc_error_to_exception();
     }
 
     if ( !num_sdevs )
     {
-       free(sdev_array);
-       return Py_BuildValue("s", "");
+        free(sdev_array);
+        return Py_BuildValue("s", "");
     }
 
     group_str = calloc(num_sdevs, sizeof(dev_str));
     if (group_str == NULL)
+    {
+        free(sdev_array);
         return PyErr_NoMemory();
+    }
 
     for ( i = 0; i < num_sdevs; i++ )
     {
@@ -887,36 +890,37 @@ static PyObject *pyxc_hvm_build(XcObject *self,
     int i;
 #endif
     char *image;
-    int memsize, vcpus = 1, acpi = 0, apic = 1;
+    int memsize, target=-1, vcpus = 1, acpi = 0, apic = 1;
 
     static char *kwd_list[] = { "domid",
-                                "memsize", "image", "vcpus", "acpi",
+                                "memsize", "image", "target", "vcpus", "acpi",
                                 "apic", NULL };
-    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iis|iii", kwd_list,
-                                      &dom, &memsize,
-                                      &image, &vcpus, &acpi, &apic) )
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iis|iiii", kwd_list,
+                                      &dom, &memsize, &image, &target, &vcpus,
+                                      &acpi, &apic) )
         return NULL;
 
-    if ( xc_hvm_build(self->xc_handle, dom, memsize, image) != 0 )
+    if ( target == -1 )
+        target = memsize;
+
+    if ( xc_hvm_build_target_mem(self->xc_handle, dom, memsize,
+                                 target, image) != 0 )
         return pyxc_error_to_exception();
 
 #if !defined(__ia64__)
-    /* Set up the HVM info table. */
+    /* Fix up the HVM info table. */
     va_map = xc_map_foreign_range(self->xc_handle, dom, XC_PAGE_SIZE,
                                   PROT_READ | PROT_WRITE,
                                   HVM_INFO_PFN);
     if ( va_map == NULL )
         return PyErr_SetFromErrno(xc_error_obj);
     va_hvm = (struct hvm_info_table *)(va_map + HVM_INFO_OFFSET);
-    memset(va_hvm, 0, sizeof(*va_hvm));
-    strncpy(va_hvm->signature, "HVM INFO", 8);
-    va_hvm->length       = sizeof(struct hvm_info_table);
     va_hvm->acpi_enabled = acpi;
     va_hvm->apic_mode    = apic;
     va_hvm->nr_vcpus     = vcpus;
     for ( i = 0, sum = 0; i < va_hvm->length; i++ )
         sum += ((uint8_t *)va_hvm)[i];
-    va_hvm->checksum = -sum;
+    va_hvm->checksum -= sum;
     munmap(va_map, XC_PAGE_SIZE);
 #endif
 
@@ -1332,6 +1336,24 @@ static PyObject *pyxc_domain_setmaxmem(XcObject *self, PyObject *args)
     return zero;
 }
 
+static PyObject *pyxc_domain_set_target_mem(XcObject *self, PyObject *args)
+{
+    uint32_t dom;
+    unsigned int mem_kb, mem_pages;
+
+    if (!PyArg_ParseTuple(args, "ii", &dom, &mem_kb))
+        return NULL;
+
+    mem_pages = mem_kb / 4; 
+
+    if (xc_domain_memory_set_pod_target(self->xc_handle, dom, mem_pages,
+                                        NULL, NULL, NULL) != 0)
+        return pyxc_error_to_exception();
+    
+    Py_INCREF(zero);
+    return zero;
+}
+
 static PyObject *pyxc_domain_set_memmap_limit(XcObject *self, PyObject *args)
 {
     uint32_t dom;
@@ -1812,6 +1834,14 @@ static PyMethodDef pyxc_methods[] = {
       " maxmem_kb [int]: .\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 
+    { "domain_set_target_mem", 
+      (PyCFunction)pyxc_domain_set_target_mem, 
+      METH_VARARGS, "\n"
+      "Set a domain's memory target\n"
+      " dom [int]: Identifier of domain.\n"
+      " mem_kb [int]: .\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
     { "domain_set_memmap_limit", 
       (PyCFunction)pyxc_domain_set_memmap_limit, 
       METH_VARARGS, "\n"
index 6497126d2de73c9db51a4ac0b07c92b8c5926d81..ad47a279a4451a9b746021e5b94a3030edc9c643 100644 (file)
@@ -336,15 +336,19 @@ static PyObject *xspy_set_permissions(XsHandle *self, PyObject *args)
        xs_set_error(EINVAL);
         goto exit;
     }
+
     xsperms_n = PyList_Size(perms);
-    xsperms = calloc(xsperms_n, sizeof(struct xs_permissions));
+    /* NB. alloc +1 so we can change the owner if necessary. */
+    xsperms = calloc(xsperms_n + 1, sizeof(struct xs_permissions));
     if (!xsperms) {
        xs_set_error(ENOMEM);
         goto exit;
     }
+
     tuple0 = PyTuple_New(0);
     if (!tuple0)
         goto exit;
+
     for (i = 0; i < xsperms_n; i++) {
         /* Read/write perms. Set these. */
         int p_read = 0, p_write = 0;
@@ -357,6 +361,17 @@ static PyObject *xspy_set_permissions(XsHandle *self, PyObject *args)
         if (p_write)
             xsperms[i].perms |= XS_PERM_WRITE;
     }
+
+    /*
+     * Is the caller trying to restrict access to the first specified
+     * domain? If so then it cannot be owner, so we force dom0 as owner.
+     */
+    if (xsperms_n && xsperms[0].perms && xsperms[0].id) {
+        memmove(&xsperms[1], &xsperms[0], xsperms_n * sizeof(*xsperms));
+        xsperms[0].id = xsperms[0].perms = 0;
+        xsperms_n++;
+    }
+
     Py_BEGIN_ALLOW_THREADS
     result = xs_set_permissions(xh, th, path, xsperms, xsperms_n);
     Py_END_ALLOW_THREADS
index 74935774a62910c53d25904dbd795cea77216be4..29b23fb7dd1fce0cb18831f7b1fdff78b8ce3979 100644 (file)
@@ -33,7 +33,7 @@ def blkdev_name_to_number(name):
         major = scsi_major[((ord(n[7:8]) - ord('a') + 1) * 26 + (ord(n[8:9]) - ord('a'))) / 16 ]
         minor = (((ord(n[7:8]) - ord('a') + 1 ) * 26 + (ord(n[8:9]) - ord('a'))) % 16) * 16 + int(n[9:] or 0)
         devnum = major * 256 + minor
-    elif re.match( '/dev/hd[a-t]([1-9]|[1-5][0-9]|6[0-3])?', n):
+    elif re.match( '/dev/hd[a-t]([1-9]|[1-5][0-9]|6[0-3])?$', n):
         ide_majors = [ 3, 22, 33, 34, 56, 57, 88, 89, 90, 91 ]
         major = ide_majors[(ord(n[7:8]) - ord('a')) / 2]
         minor = ((ord(n[7:8]) - ord('a')) % 2) * 64 + int(n[8:] or 0)
index cd4361404c3174f039f154e6768fe1712679dc4d..e31f396e730dfeaf408fdc7018ed1faa67db25ad 100644 (file)
@@ -23,7 +23,7 @@ from xen.xend import sxp
 from xen.xend.XendClient import server
 from xen.xend.XendError import XendError
 from xen.xend.xenstore.xstransact import xstransact
-from xen.xend.server import DevController
+from xen.xend.server import DevConstants
 
 import xen.xend.XendProtocol
 
@@ -169,7 +169,7 @@ def diagnose_hotplugging():
 
 
 def stateString(state):
-    return state and DevController.xenbusState[int(state)] or '<None>'
+    return state and DevConstants.xenbusState[int(state)] or '<None>'
 
 
 def main(argv = None):
index b5a0a2a371dc5e3fb97e488f9d88133ee9e4196f..3efc43d57fdaa0e89c7999e7db152380f705c115 100644 (file)
@@ -5,7 +5,7 @@ def fcntl_setfd_cloexec(file, bool):
         f = fcntl.fcntl(file, fcntl.F_GETFD)
         if bool: f |= fcntl.FD_CLOEXEC
         else: f &= ~fcntl.FD_CLOEXEC
-        fcntl.fcntl(file, fcntl.F_SETFD)
+        fcntl.fcntl(file, fcntl.F_SETFD, f)
 
 def waitstatus_description(st):
         if os.WIFEXITED(st):
index aeb25d85666307e315d2e05290ff30e810748f82..bb7720bb4e53530d90c58c01ecae9a49644eecb3 100644 (file)
@@ -12,6 +12,7 @@ import re
 import types
 import struct
 import time
+import threading
 from xen.util import utils
 
 PROC_PCI_PATH = '/proc/bus/pci/devices'
@@ -66,9 +67,10 @@ PCI_EXP_DEVCTL_FLR = (0x1 << 15)
 
 PCI_CAP_ID_PM = 0x01
 PCI_PM_CTRL = 4
-PCI_PM_CTRL_NO_SOFT_RESET = 0x0004
+PCI_PM_CTRL_NO_SOFT_RESET = 0x0008
 PCI_PM_CTRL_STATE_MASK = 0x0003
 PCI_D3hot = 3
+PCI_D0hot = 0
 
 VENDOR_INTEL  = 0x8086
 PCI_CAP_ID_VENDOR_SPECIFIC_CAP = 0x09
@@ -96,6 +98,7 @@ MSIX_SIZE_MASK = 0x7ff
 
 # Global variable to store information from lspci
 lspci_info = None
+lspci_info_lock = threading.RLock()
 
 #Calculate PAGE_SHIFT: number of bits to shift an address to get the page number
 PAGE_SIZE = resource.getpagesize()
@@ -173,12 +176,16 @@ def get_all_pci_devices():
 
     return pci_devs
 
-def create_lspci_info():
+def _create_lspci_info():
+    """Execute 'lspci' command and parse the result.
+    If the command does not exist, lspci_info will be kept blank ({}).
+
+    Expects to be protected by lspci_info_lock.
+    """
     global lspci_info
+    
     lspci_info = {}
 
-    # Execute 'lspci' command and parse the result.
-    # If the command does not exist, lspci_info will be kept blank ({}).
     for paragraph in os.popen(LSPCI_CMD + ' -vmm').read().split('\n\n'):
         device_name = None
         device_info = {}
@@ -194,6 +201,14 @@ def create_lspci_info():
         if device_name is not None:
             lspci_info[device_name] = device_info
 
+def create_lspci_info():
+    global lspci_info_lock
+    lspci_info_lock.acquire()
+    try:
+        _create_lspci_info()
+    finally:
+        lspci_info_lock.release()
+
 def save_pci_conf_space(devs_string):
     pci_list = []
     cfg_list = []
@@ -234,7 +249,7 @@ def find_all_devices_owned_by_pciback():
     return dev_list
 
 def transform_list(target, src):
-    ''' src: its element is pci string (Format: xxxx:xx:xx:x).
+    ''' src: its element is pci string (Format: xxxx:xx:xx.x).
         target: its element is pci string, or a list of pci string.
 
         If all the elements in src are in target, we remove them from target
@@ -276,7 +291,7 @@ def check_FLR_capability(dev_list):
                     coassigned_pci_list = dev.find_all_the_multi_functions()
                     need_transform = True
                 elif dev.dev_type == DEV_TYPE_PCI and not dev.pci_af_flr:
-                    coassigned_pci_list = dev.find_coassigned_devices(True)
+                    coassigned_pci_list = dev.find_coassigned_pci_devices(True)
                     del coassigned_pci_list[0]
                     need_transform = True
 
@@ -434,7 +449,7 @@ class PciDevice:
                 list = list + [dev.name]
         return list
         
-    def find_coassigned_devices(self, ignore_bridge = True):
+    def find_coassigned_pci_devices(self, ignore_bridge = True):
         ''' Here'self' is a PCI device, we need find the uppermost PCI/PCI-X
             bridge, and all devices behind it must be co-assigned to the same
             guest.
@@ -467,12 +482,12 @@ class PciDevice:
         os.lseek(fd, PCI_CB_BRIDGE_CONTROL, 0)
         br_cntl |= PCI_BRIDGE_CTL_BUS_RESET
         os.write(fd, struct.pack('H', br_cntl))
-        time.sleep(0.200)
+        time.sleep(0.100)
         # De-assert Secondary Bus Reset
         os.lseek(fd, PCI_CB_BRIDGE_CONTROL, 0)
         br_cntl &= ~PCI_BRIDGE_CTL_BUS_RESET
         os.write(fd, struct.pack('H', br_cntl))
-        time.sleep(0.200)
+        time.sleep(0.100)
         os.close(fd)
 
         # Restore the config spaces
@@ -483,18 +498,25 @@ class PciDevice:
         if pos == 0:
             return False
         
+        # No_Soft_Reset - When set 1, this bit indicates that
+        # devices transitioning from D3hot to D0 because of
+        # PowerState commands do not perform an internal reset.
+        pm_ctl = self.pci_conf_read32(pos + PCI_PM_CTRL)
+        if (pm_ctl & PCI_PM_CTRL_NO_SOFT_RESET) == 1:
+            return False
+
         (pci_list, cfg_list) = save_pci_conf_space([self.name])
         
-        # Enter D3hot without soft reset
-        pm_ctl = self.pci_conf_read32(pos + PCI_PM_CTRL)
-        pm_ctl |= PCI_PM_CTRL_NO_SOFT_RESET
+        # Enter D3hot
         pm_ctl &= ~PCI_PM_CTRL_STATE_MASK
         pm_ctl |= PCI_D3hot
         self.pci_conf_write32(pos + PCI_PM_CTRL, pm_ctl)
         time.sleep(0.010)
 
         # From D3hot to D0
-        self.pci_conf_write32(pos + PCI_PM_CTRL, 0)
+        pm_ctl &= ~PCI_PM_CTRL_STATE_MASK
+        pm_ctl |= PCI_D0hot
+        self.pci_conf_write32(pos + PCI_PM_CTRL, pm_ctl)
         time.sleep(0.010)
 
         restore_pci_conf_space((pci_list, cfg_list))
@@ -516,7 +538,7 @@ class PciDevice:
         (pci_list, cfg_list) = save_pci_conf_space([self.name])
 
         self.pci_conf_write8(pos + PCI_USB_FLRCTRL, 1)
-        time.sleep(0.010)
+        time.sleep(0.100)
 
         restore_pci_conf_space((pci_list, cfg_list))
 
@@ -532,6 +554,16 @@ class PciDevice:
         funcs = re.findall(p, pci_names)
         return funcs
 
+    def find_coassigned_devices(self):
+        if self.dev_type == DEV_TYPE_PCIe_ENDPOINT and not self.pcie_flr:
+            return self.find_all_the_multi_functions()
+        elif self.dev_type == DEV_TYPE_PCI and not self.pci_af_flr:
+            coassigned_pci_list = self.find_coassigned_pci_devices(True)
+            del coassigned_pci_list[0]
+            return coassigned_pci_list
+        else:
+            return [self.name]
+
     def find_cap_offset(self, cap):
         path = find_sysfs_mnt()+SYSFS_PCI_DEVS_PATH+'/'+ \
                self.name+SYSFS_PCI_DEV_CONFIG_PATH
@@ -626,7 +658,7 @@ class PciDevice:
                 self.dev_type = DEV_TYPE_PCI_BRIDGE
             else:
                 creg = self.pci_conf_read16(pos + PCI_EXP_FLAGS)
-                if ((creg & PCI_EXP_TYPE_PCI_BRIDGE) >> 4) == \
+                if ((creg & PCI_EXP_FLAGS_TYPE) >> 4) == \
                     PCI_EXP_TYPE_PCI_BRIDGE:
                     self.dev_type = DEV_TYPE_PCI_BRIDGE
                 else:
@@ -691,7 +723,7 @@ class PciDevice:
                 pos = self.find_cap_offset(PCI_CAP_ID_EXP)
                 self.pci_conf_write32(pos + PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_FLR)
                 # We must sleep at least 100ms for the completion of FLR
-                time.sleep(0.200)
+                time.sleep(0.100)
                 restore_pci_conf_space((pci_list, cfg_list))
             else:
                 if self.bus == 0:
@@ -712,13 +744,13 @@ class PciDevice:
                 # We use Advanced Capability to do FLR.
                 pos = self.find_cap_offset(PCI_CAP_ID_AF)
                 self.pci_conf_write8(pos + PCI_AF_CTL, PCI_AF_CTL_FLR)
-                time.sleep(0.200)
+                time.sleep(0.100)
                 restore_pci_conf_space((pci_list, cfg_list))
             else:
                 if self.bus == 0:
                     self.do_FLR_for_integrated_device()
                 else:
-                    devs = self.find_coassigned_devices(False)
+                    devs = self.find_coassigned_pci_devices(False)
                     # Remove the element 0 which is a bridge
                     target_bus = devs[0]
                     del devs[0]
@@ -893,22 +925,27 @@ class PciDevice:
         Since we cannot obtain these data from sysfs, use 'lspci' command.
         """
         global lspci_info
+        global lspci_info_lock
 
-        if lspci_info is None:
-            create_lspci_info()
-
+        lspci_info_lock.acquire()
         try:
-            device_info = lspci_info[self.name]
-            self.revision = int(device_info['Rev'], 16)
-            self.vendorname = device_info['Vendor']
-            self.devicename = device_info['Device']
-            self.classname = device_info['Class']
-            self.subvendorname = device_info['SVendor']
-            self.subdevicename = device_info['SDevice']
-        except KeyError:
-            pass
+            if lspci_info is None:
+                _create_lspci_info()
 
-        return True
+            try:
+                device_info = lspci_info[self.name]
+                self.revision = int(device_info['Rev'], 16)
+                self.vendorname = device_info['Vendor']
+                self.devicename = device_info['Device']
+                self.classname = device_info['Class']
+                self.subvendorname = device_info['SVendor']
+                self.subdevicename = device_info['SDevice']
+            except KeyError:
+                pass
+
+            return True
+        finally:
+            lspci_info_lock.release()
 
     def __str__(self):
         str = "PCI Device %s\n" % (self.name)
diff --git a/tools/python/xen/util/rwlock.py b/tools/python/xen/util/rwlock.py
new file mode 100644 (file)
index 0000000..e79a82f
--- /dev/null
@@ -0,0 +1,137 @@
+""" Reader-writer lock implementation based on a condition variable """
+
+#============================================================================
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#============================================================================
+# Copyright (C) 2008 International Business Machines Corp.
+# Author: Stefan Berger <stefanb@us.ibm.com>
+#============================================================================
+
+from threading import Condition
+
+class RWLock:
+
+    RWLOCK_STATE_WRITER = -1
+    RWLOCK_STATE_UNUSED = 0
+
+    def __init__(self):
+        self.__condition = Condition()
+        self.__state = RWLock.RWLOCK_STATE_UNUSED
+        self.__blocked_writers = 0
+
+    def acquire_reader(self):
+        self.__condition.acquire()
+        while True:
+            if self.__state == RWLock.RWLOCK_STATE_WRITER:
+                self.__condition.wait()
+            else:
+                break
+        self.__state += 1
+        self.__condition.release()
+
+    def acquire_writer(self):
+        self.__condition.acquire()
+        self.__acquire_writer(RWLock.RWLOCK_STATE_UNUSED)
+        self.__condition.release()
+
+    def __acquire_writer(self, wait_for_state):
+        while True:
+            if self.__state == wait_for_state:
+                self.__state = RWLock.RWLOCK_STATE_WRITER
+                break
+            else:
+                self.__blocked_writers += 1
+                self.__condition.wait()
+                self.__blocked_writers -= 1
+
+    def release(self):
+        self.__condition.acquire()
+        if self.__state == RWLock.RWLOCK_STATE_WRITER:
+            self.__state = RWLock.RWLOCK_STATE_UNUSED
+        elif self.__state == RWLock.RWLOCK_STATE_UNUSED:
+            assert False, 'Lock not in use.'
+        else:
+            self.__state -= 1
+        self.__condition.notifyAll()
+        self.__condition.release()
+
+
+if __name__ == '__main__':
+    from threading import Thread
+    from time import sleep
+
+    rwlock = RWLock()
+
+    class Base(Thread):
+        def __init__(self, name, timeout):
+            self.name = name
+            self.timeout = timeout
+            Thread.__init__(self)
+
+    class Reader(Base):
+        def __init__(self, name = 'Reader', timeout = 10):
+            Base.__init__(self, name, timeout)
+
+        def run(self):
+            print '%s begin' % self.name
+            rwlock.acquire_reader()
+            print '%s acquired' % self.name
+            sleep(self.timeout)
+            rwlock.release()
+            print '%s end' % self.name
+
+    class ReaderTwice(Base):
+        def __init__(self, name = 'Reader', timeout = 10):
+            Base.__init__(self, name, timeout)
+
+        def run(self):
+            print '%s begin' % self.name
+            rwlock.acquire_reader()
+            print '%s acquired once' % self.name
+            sleep(self.timeout)
+            rwlock.acquire_reader()
+            print '%s acquired twice' % self.name
+            sleep(self.timeout)
+            rwlock.release()
+            rwlock.release()
+            print '%s end' % self.name
+
+    class Writer(Base):
+        def __init__(self, name = 'Writer', timeout = 10):
+            Base.__init__(self, name, timeout)
+
+        def run(self):
+            print '%s begin' % self.name
+            rwlock.acquire_writer()
+            print '%s acquired' % self.name
+            sleep(self.timeout)
+            rwlock.release()
+            print '%s end' % self.name
+
+    def run_test(threadlist, msg):
+        print msg
+        for t in threadlist:
+            t.start()
+            sleep(1)
+        for t in threads:
+            t.join()
+        print 'Done\n\n'
+
+    threads = []
+    threads.append( Reader('R1', 4) )
+    threads.append( Reader('R2', 4) )
+    threads.append( Writer('W1', 4) )
+    threads.append( Reader('R3', 4) )
+    run_test(threads,
+             'Test: readers may bypass blocked writers')
index 42137bc6c8ff084e22ad50945f3cc1a6f6aef211..3b8d0536a82325142d4a565278f8043eb1036955 100644 (file)
@@ -36,28 +36,27 @@ SYSFS_SCSI_DEV_TYPEID_PATH = '/type'
 SYSFS_SCSI_DEV_REVISION_PATH = '/rev'
 SYSFS_SCSI_DEV_SCSILEVEL_PATH = '/scsi_level'
 
-def _vscsi_hctl_block(name, scsi_devices):
-    """ block-device name is convert into hctl. (e.g., '/dev/sda',
-    '0:0:0:0')"""
+def _vscsi_get_devname_by(name, scsi_devices):
+    """A device name is gotten by the HCTL.
+    (e.g., '0:0:0:0' to '/dev/sda')
+    """
+
     try:
         search = re.compile(r'' + name + '$', re.DOTALL)
     except Exception, e:
         raise VmError("vscsi: invalid expression. " + str(e))
-    chk = 0
-    for hctl, block, sg, scsi_id in scsi_devices:
+
+    for hctl, devname, sg, scsi_id in scsi_devices:
         if search.match(hctl):
-            chk = 1
-            break
+            return (hctl, devname)
 
-    if chk:
-        return (hctl, block)
-    else:
-        return (None, None)
+    return (None, None)
 
 
-def _vscsi_block_scsiid_to_hctl(phyname, scsi_devices):
-    """ block-device name is convert into hctl. (e.g., '/dev/sda',
-    '0:0:0:0')"""
+def _vscsi_get_hctl_by(phyname, scsi_devices):
+    """An HCTL is gotten by the device name or the scsi_id.
+    (e.g., '/dev/sda' to '0:0:0:0')
+    """
     
     if re.match('/dev/sd[a-z]+([1-9]|1[0-5])?$', phyname):
         # sd driver
@@ -72,73 +71,98 @@ def _vscsi_block_scsiid_to_hctl(phyname, scsi_devices):
         # scsi_id -gu
         name = phyname
 
-    chk = 0
-    for hctl, block, sg, scsi_id in scsi_devices:
-        if block == name:
-            chk = 1
-            break
-        elif sg == name:
-            chk = 1
-            break
-        elif scsi_id == name:
-            chk = 1
-            break
-
-    if chk:
-        return (hctl, block)
-    else:
-        return (None, None)
+    for hctl, devname, sg, scsi_id in scsi_devices:
+        if name in [devname, sg, scsi_id]:
+            return (hctl, devname)
 
+    return (None, None)
+
+
+def _vscsi_get_scsiid(sg):
+    scsi_id = os.popen('/sbin/scsi_id -gu -s /class/scsi_generic/' + sg).read().split()
+    if len(scsi_id):
+        return scsi_id[0]
+    return None
 
-def vscsi_get_scsidevices():
-    """ get all scsi devices"""
+
+def _vscsi_get_scsidevices_by_lsscsi(option = ""):
+    """ get all scsi devices information by lsscsi """
 
     devices = []
-    sysfs_mnt = utils.find_sysfs_mount() 
+
+    for scsiinfo in os.popen('{ lsscsi -g %s; } 2>/dev/null' % option).readlines():
+        s = scsiinfo.split()
+        hctl = s[0][1:-1]
+        try:
+            devname = s[-2].split('/dev/')[1]
+        except IndexError:
+            devname = None
+        try:
+            sg = s[-1].split('/dev/')[1]
+            scsi_id = _vscsi_get_scsiid(sg)
+        except IndexError:
+            sg = None
+            scsi_id = None
+        devices.append([hctl, devname, sg, scsi_id])
+
+    return devices
+
+
+def _vscsi_get_scsidevices_by_sysfs():
+    """ get all scsi devices information by sysfs """
+
+    devices = []
+    try:
+        sysfs_mnt = utils.find_sysfs_mount() 
+    except:
+        return devices
 
     for dirpath, dirnames, files in os.walk(sysfs_mnt + SYSFS_SCSI_PATH):
         for hctl in dirnames:
             paths = os.path.join(dirpath, hctl)
-            block = "-"
+            devname = None
+            sg = None
+            scsi_id = None
             for f in os.listdir(paths):
-                if re.match('^block', f):
-                    os.chdir(os.path.join(paths, f))
-                    block = os.path.basename(os.getcwd())
-                elif re.match('^tape', f):
-                    os.chdir(os.path.join(paths, f))
-                    block = os.path.basename(os.getcwd())
-                elif re.match('^scsi_changer', f):
-                    os.chdir(os.path.join(paths, f))
-                    block = os.path.basename(os.getcwd())
-                elif re.match('^onstream_tape', f):
-                    os.chdir(os.path.join(paths, f))
-                    block = os.path.basename(os.getcwd())
+                realpath = os.path.realpath(os.path.join(paths, f))
+                if  re.match('^block', f) or \
+                    re.match('^tape', f) or \
+                    re.match('^scsi_changer', f) or \
+                    re.match('^onstream_tape', f):
+                    devname = os.path.basename(realpath)
 
                 if re.match('^scsi_generic', f):
-                    os.chdir(os.path.join(paths, f))
-                    sg = os.path.basename(os.getcwd())
-                    lines = os.popen('/sbin/scsi_id -gu -s /class/scsi_generic/' + sg).read().split()
-                    if len(lines) == 0:
-                        scsi_id = '-'
-                    else:
-                        scsi_id = lines[0]
-
-            devices.append([hctl, block, sg, scsi_id])
+                    sg = os.path.basename(realpath)
+                    scsi_id = _vscsi_get_scsiid(sg)
+            devices.append([hctl, devname, sg, scsi_id])
 
     return devices
 
 
-def vscsi_search_hctl_and_block(device):
-
-    scsi_devices = vscsi_get_scsidevices()
-
-    tmp = device.split(':')
-    if len(tmp) == 4:
-        (hctl, block) = _vscsi_hctl_block(device, scsi_devices)
+def vscsi_get_scsidevices():
+    """ get all scsi devices information """
+
+    devices = _vscsi_get_scsidevices_by_lsscsi("")
+    if devices:
+        return devices
+    return _vscsi_get_scsidevices_by_sysfs()
+
+
+def vscsi_get_hctl_and_devname_by(target, scsi_devices = None):
+    if scsi_devices is None:
+        if len(target.split(':')) == 4:
+            scsi_devices = _vscsi_get_scsidevices_by_lsscsi(target)
+        elif target.startswith('/dev/'): 
+            scsi_devices = _vscsi_get_scsidevices_by_lsscsi("| grep %s" % target)
+        else:
+            scsi_devices = _vscsi_get_scsidevices_by_lsscsi("")
+        if not scsi_devices:
+            scsi_devices = _vscsi_get_scsidevices_by_sysfs()
+
+    if len(target.split(':')) == 4:
+        return _vscsi_get_devname_by(target, scsi_devices)
     else:
-        (hctl, block) = _vscsi_block_scsiid_to_hctl(device, scsi_devices)
-
-    return (hctl, block)
+        return _vscsi_get_hctl_by(target, scsi_devices)
 
 
 def get_scsi_vendor(pHCTL):
@@ -212,9 +236,9 @@ def get_all_scsi_devices():
             'sg_name': scsi_info[2],
             'scsi_id': None
         }
-        if scsi_info[1] != '-':
+        if scsi_info[1] is not None:
             scsi_dev['dev_name'] = scsi_info[1] 
-        if scsi_info[3] != '-':
+        if scsi_info[3] is not None:
             scsi_dev['scsi_id'] = scsi_info[3] 
 
         scsi_dev['vendor_name'] = \
@@ -229,7 +253,7 @@ def get_all_scsi_devices():
             get_scsi_scsilevel(scsi_dev['physical_HCTL'])
 
         try:
-            lsscsi_info = os.popen('lsscsi ' + scsi_dev['physical_HCTL']).read().split()
+            lsscsi_info = os.popen('lsscsi %s 2>/dev/null' % scsi_dev['physical_HCTL']).read().split()
             scsi_dev['type'] = lsscsi_info[1]
         except:
             scsi_dev['type'] = None
index 7d8001ad1e6479b8ed28af08da1d777882ab45cb..807da69b3a728181e494a6ff3f19516e836adef1 100644 (file)
@@ -20,7 +20,7 @@ import types
 from xen.xend import sxp
 from xen.xend import PrettyPrint
 from xen.xend.Args import ArgError
-from xen.xend.XendError import XendError
+from xen.xend.XendError import XendError, XendInvalidDomain
 #from xen.xend.XendLogging import log
 
 import resource
@@ -71,6 +71,8 @@ class SrvDir(SrvBase):
             val = self.get(x)
         except XendError, ex:
             return self.noChild(str(ex))
+        except XendInvalidDomain, ex:
+            return self.noChild(str(ex))
         if val is None:
             return self.noChild('Not found: ' + str(x))
         else:
index 7d5702fce066489dc8b62c021186aa2a81fdf114..e507323e9155ebbe4b279470d4dd72867ad1e420 100644 (file)
@@ -292,3 +292,40 @@ def hostAllowed(addrport, hosts_allowed):
                 return True
         log.warn("Rejected connection from %s (%s).", addrport[0], fqdn)
         return False
+
+
+class SocketDgramListener:
+    """A connectionless server socket, running listen in a thread.
+    """
+
+    def __init__(self, protocol_class):
+        self.protocol = protocol_class()
+        self.sock = self.createSocket()
+        threading.Thread(target=self.main).start()
+
+
+    def close(self):
+        try:
+            self.sock.close()
+        except:
+            pass
+
+
+    def createSocket(self):
+        raise NotImplementedError()
+
+
+    def main(self):
+        try:
+            while True:
+                try:
+                    data = self.sock.recv(BUFFER_SIZE)
+                    self.protocol.dataReceived(data)
+                except socket.error, ex:
+                    if ex.args[0] not in (EWOULDBLOCK, EAGAIN, EINTR):
+                        break
+        finally:
+            try:
+                self.close()
+            except:
+                pass
index 12b6e9694edb69401b25cf12222d392648a12526..180c0858ebd0c91c3b5957fbe411d573976f8f42 100644 (file)
@@ -27,16 +27,19 @@ from xen.util import mkdir
 import connection
 
 
-def bind(path):
-    """Create a Unix socket, and bind it to the given path.  The socket is
-created such that only the current user may access it."""
+def bind(path, type = socket.SOCK_STREAM):
+    """Create a Unix socket, and bind it to the given path.
+    The socket is created such that only the current user may access it."""
 
-    parent = os.path.dirname(path)
-    mkdir.parents(parent, stat.S_IRWXU, True)
-    if os.path.exists(path):
-        os.unlink(path)
+    if path[0] == '\0': # Abstract namespace is used for the path
+        pass
+    else:
+        parent = os.path.dirname(path)
+        mkdir.parents(parent, stat.S_IRWXU, True)
+        if os.path.exists(path):
+            os.unlink(path)
 
-    sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+    sock = socket.socket(socket.AF_UNIX, type)
     sock.bind(path)
     return sock
 
@@ -48,8 +51,19 @@ class UnixListener(connection.SocketListener):
 
 
     def createSocket(self):
-        return bind(self.path)
+        return bind(self.path, socket.SOCK_STREAM)
 
 
     def acceptConnection(self, sock, _):
         connection.SocketServerConnection(sock, self.protocol_class)
+
+
+class UnixDgramListener(connection.SocketDgramListener):
+    def __init__(self, path, protocol_class):
+        self.path = path
+        connection.SocketDgramListener.__init__(self, protocol_class)
+
+
+    def createSocket(self):
+        return bind(self.path, socket.SOCK_DGRAM)
+
index 42e131be37cbef16fc276f2b88be71ae9fec0315..b4a33c4be148daa8adcf1700cbb6d248d27e8609 100644 (file)
@@ -29,6 +29,7 @@ import xmlrpclib
 import XendDomain, XendDomainInfo, XendNode, XendDmesg
 import XendLogging, XendTaskManager, XendAPIStore
 
+from xen.xend import uuid as genuuid
 from XendAPIVersion import *
 from XendAuthSessions import instance as auth_manager
 from XendError import *
@@ -431,7 +432,7 @@ def valid_object(class_name):
            lambda *args, **kwargs: \
            _check_ref(lambda r: \
                           XendAPIStore.get(r, class_name) is not None,
-                      'PIF', func, *args, **kwargs)
+                      class_name, func, *args, **kwargs)
 
 # -----------------------------
 # Bridge to Legacy XM API calls
@@ -1867,7 +1868,7 @@ class XendAPI(object):
         dom = xendom.get_vm_by_uuid(vbd_struct['VM'])
         vdi = xennode.get_vdi_by_uuid(vbd_struct['VDI'])
         if not vdi:
-            return xen_api_error(['HANDLE_INVALID', 'VDI', vdi_ref])
+            return xen_api_error(['HANDLE_INVALID', 'VDI', vbd_struct['VDI']])
 
         # new VBD via VDI/SR
         vdi_image = vdi.get_location()
@@ -2392,7 +2393,7 @@ class XendAPI(object):
             tpmif.destroy_vtpmstate(dom.getName())
             return xen_api_success_void()
         else:
-            return xen_api_error(['HANDLE_INVALID', 'VM', vtpm_struct['VM']])
+            return xen_api_error(['HANDLE_INVALID', 'VTPM', vtpm_ref])
 
     # class methods
     def VTPM_create(self, session, vtpm_struct):
@@ -2614,7 +2615,7 @@ class XendAPI(object):
         return xen_api_success_void()
 
     def event_unregister(self, session, unreg_classes):
-        event_unregister(session, reg_classes)
+        event_unregister(session, unreg_classes)
         return xen_api_success_void()
 
     def event_next(self, session):
@@ -2641,7 +2642,7 @@ class XendAPI(object):
         return xen_api_error(['DEBUG_FAIL', session])
 
     def debug_create(self, session):
-        debug_uuid = uuid.createString()
+        debug_uuid = genuuid.createString()
         self._debug[debug_uuid] = None
         return xen_api_success(debug_uuid)
 
index 372509d695034621282e4294df6cfcfa3a479aae..6876f2e6fb9e5005e6a617b0e83c30d4411c7561 100644 (file)
@@ -25,35 +25,59 @@ You must register both the uuid and type, and get objects
 by type, to ensure safety
 """
 
+import threading
+
 __classes = {}
+__classes_lock = threading.RLock()
 
 def register(uuid, type, inst):
-    __classes[(uuid, type)] = inst
-    return inst
+    __classes_lock.acquire()
+    try:
+        __classes[(uuid, type)] = inst
+        return inst
+    finally:
+        __classes_lock.release()
 
 def deregister(uuid, type):
-    old = get(uuid, type)
-    del __classes[(uuid, type)]
-    return old
+    __classes_lock.acquire()
+    try:
+        old = get(uuid, type)
+        if old is not None:
+            del __classes[(uuid, type)]
+        return old
+    finally:
+        __classes_lock.release()
 
 def get(uuid, type):
     """
     Get the instances by uuid and type
     """
-    return __classes.get((uuid, type), None)
+    __classes_lock.acquire()
+    try:
+        return __classes.get((uuid, type), None)
+    finally:
+        __classes_lock.release()
 
 def get_all(all_type):
     """
     Get all instances by type
     """
-    return [inst
-            for ((uuid, t), inst) in __classes.items()
-            if t == all_type]        
+    __classes_lock.acquire()
+    try:
+        return [inst
+                for ((uuid, t), inst) in __classes.items()
+                if t == all_type]        
+    finally:
+        __classes_lock.release()
 
 def get_all_uuid(all_type):
     """
     Get all uuids by type
     """
-    return [uuid
-            for (uuid, t) in __classes.keys()
-            if t == all_type]
+    __classes_lock.acquire()
+    try:
+        return [uuid
+                for (uuid, t) in __classes.keys()
+                if t == all_type]
+    finally:
+        __classes_lock.release()
index 60e876140d2dba4f218d1c685ef67988f8c82bf4..d2e5761fa2e384ceb0f96bc282d3661bb1ac1428 100644 (file)
@@ -67,9 +67,23 @@ def bootloader(blexec, disk, dom, quiet = False, blargs = '', kernel = '',
     # listening on the bootloader's fifo for the results.
 
     (m1, s1) = pty.openpty()
-    tty.setraw(m1);
-    fcntl.fcntl(m1, fcntl.F_SETFL, os.O_NDELAY);
-    os.close(s1)
+
+    # On Solaris, the pty master side will get cranky if we try
+    # to write to it while there is no slave. To work around this,
+    # keep the slave descriptor open until we're done. Set it
+    # to raw terminal parameters, otherwise it will echo back
+    # characters, which will confuse the I/O loop below.
+    # Furthermore, a raw master pty device has no terminal
+    # semantics on Solaris, so don't try to set any attributes
+    # for it.
+    if os.uname()[0] != 'SunOS' and os.uname()[0] != 'NetBSD':
+        tty.setraw(m1)
+        os.close(s1)
+    else:
+        tty.setraw(s1)
+
+    fcntl.fcntl(m1, fcntl.F_SETFL, os.O_NDELAY)
+
     slavename = ptsname.ptsname(m1)
     dom.storeDom("console/tty", slavename)
 
@@ -108,7 +122,11 @@ def bootloader(blexec, disk, dom, quiet = False, blargs = '', kernel = '',
     # record that this domain is bootloading
     dom.bootloader_pid = child
 
-    tty.setraw(m2);
+    # On Solaris, the master pty side does not have terminal semantics,
+    # so don't try to set any attributes, as it will fail.
+    if os.uname()[0] != 'SunOS':
+        tty.setraw(m2);
+
     fcntl.fcntl(m2, fcntl.F_SETFL, os.O_NDELAY);
     while True:
         try:
@@ -117,32 +135,55 @@ def bootloader(blexec, disk, dom, quiet = False, blargs = '', kernel = '',
             if e.errno == errno.EINTR:
                 continue
         break
+
+    fcntl.fcntl(r, fcntl.F_SETFL, os.O_NDELAY);
+
     ret = ""
     inbuf=""; outbuf="";
+    # filedescriptors:
+    #   r - input from the bootloader (bootstring output)
+    #   m1 - input/output from/to xenconsole
+    #   m2 - input/output from/to pty that controls the bootloader
+    # The filedescriptors are NDELAY, so it's ok to try to read
+    # bigger chunks than may be available, to keep e.g. curses
+    # screen redraws in the bootloader efficient. m1 is the side that
+    # gets xenconsole input, which will be keystrokes, so a small number
+    # is sufficient. m2 is pygrub output, which will be curses screen
+    # updates, so a larger number (1024) is appropriate there.
+    #
+    # For writeable descriptors, only include them in the set for select
+    # if there is actual data to write, otherwise this would loop too fast,
+    # eating up CPU time.
+
     while True:
-        sel = select.select([r, m1, m2], [m1, m2], [])
+        wsel = []
+        if len(outbuf) != 0:
+            wsel = wsel + [m1]
+        if len(inbuf) != 0:
+            wsel = wsel + [m2]
+        sel = select.select([r, m1, m2], wsel, [])
         try: 
             if m1 in sel[0]:
-                s = os.read(m1, 1)
+                s = os.read(m1, 16)
                 inbuf += s
-            if m2 in sel[1] and len(inbuf) != 0:
-                os.write(m2, inbuf[0])
-                inbuf = inbuf[1:]
+            if m2 in sel[1]:
+                n = os.write(m2, inbuf)
+                inbuf = inbuf[n:]
         except OSError, e:
             if e.errno == errno.EIO:
                 pass
         try:
             if m2 in sel[0]:
-                s = os.read(m2, 1)
+                s = os.read(m2, 1024)
                 outbuf += s
-            if m1 in sel[1] and len(outbuf) != 0:
-                os.write(m1, outbuf[0])
-                outbuf = outbuf[1:]
+            if m1 in sel[1]:
+                n = os.write(m1, outbuf)
+                outbuf = outbuf[n:]
         except OSError, e:
             if e.errno == errno.EIO:
                 pass
         if r in sel[0]:
-            s = os.read(r, 1)
+            s = os.read(r, 128)
             ret = ret + s
             if len(s) == 0:
                 break
@@ -152,6 +193,8 @@ def bootloader(blexec, disk, dom, quiet = False, blargs = '', kernel = '',
     os.close(r)
     os.close(m2)
     os.close(m1)
+    if os.uname()[0] == 'SunOS' or os.uname()[0] == 'NetBSD':
+        os.close(s1)
     os.unlink(fifo)
 
     # Re-acquire the lock to cover the changes we're about to make
index af76bdc7e1859f54b2795b937be15e012fa57468..a0ea01166575da97a6409be87c488cff64bc126f 100644 (file)
@@ -66,6 +66,13 @@ def insert_after(list, pred, value):
 
 
 def save(fd, dominfo, network, live, dst, checkpoint=False, node=-1):
+    try:
+        if not os.path.isdir("/var/lib/xen"):
+            os.makedirs("/var/lib/xen")
+    except Exception, exn:
+        log.exception("Can't create directory '/var/lib/xen'")
+        raise XendError("Can't create directory '/var/lib/xen'")
+
     write_exact(fd, SIGNATURE, "could not write guest state file: signature")
 
     sxprep = dominfo.sxpr()
@@ -107,7 +114,7 @@ def save(fd, dominfo, network, live, dst, checkpoint=False, node=-1):
             if line == "suspend":
                 log.debug("Suspending %d ...", dominfo.getDomid())
                 dominfo.shutdown('suspend')
-                dominfo.waitForShutdown()
+                dominfo.waitForSuspend()
             if line in ('suspend', 'suspended'):
                 dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP2,
                                        domain_name)
@@ -166,6 +173,13 @@ def save(fd, dominfo, network, live, dst, checkpoint=False, node=-1):
 
 
 def restore(xd, fd, dominfo = None, paused = False, relocating = False):
+    try:
+        if not os.path.isdir("/var/lib/xen"):
+            os.makedirs("/var/lib/xen")
+    except Exception, exn:
+        log.exception("Can't create directory '/var/lib/xen'")
+        raise XendError("Can't create directory '/var/lib/xen'")
+
     signature = read_exact(fd, len(SIGNATURE),
         "not a valid guest state file: signature read")
     if signature != SIGNATURE:
@@ -253,7 +267,7 @@ def restore(xd, fd, dominfo = None, paused = False, relocating = False):
         # set memory limit
         xc.domain_setmaxmem(dominfo.getDomid(), maxmem)
 
-        balloon.free(memory + shadow)
+        balloon.free(memory + shadow, dominfo)
 
         shadow_cur = xc.shadow_mem_control(dominfo.getDomid(), shadow / 1024)
         dominfo.info['shadow_memory'] = shadow_cur
index b16a90710ecf7476836a3205559abf36a042c090..7bb6255aff4a8447614c178592f929f2eb83a1c3 100644 (file)
@@ -149,6 +149,7 @@ XENAPI_PLATFORM_CFG_TYPES = {
     'opengl': int,
     'soundhw': str,
     'stdvga': int,
+    'videoram': int,
     'usb': int,
     'usbdevice': str,
     'hpet': int,
@@ -157,6 +158,7 @@ XENAPI_PLATFORM_CFG_TYPES = {
     'vncdisplay': int,
     'vnclisten': str,
     'timer_mode': int,
+    'vpt_align': int,
     'viridian': int,
     'vncpasswd': str,
     'vncunused': int,
@@ -166,11 +168,14 @@ XENAPI_PLATFORM_CFG_TYPES = {
     'guest_os_type': str,
     'hap': int,
     'xen_extended_power_mgmt': int,
+    'pci_msitranslate': int,
+    'pci_power_mgmt': int,
+    'xen_platform_pci': int,
 }
 
 # Xen API console 'other_config' keys.
 XENAPI_CONSOLE_OTHER_CFG = ['vncunused', 'vncdisplay', 'vnclisten',
-                            'vncpasswd', 'type', 'display', 'xauthority',
+                            'vncpasswd', 'sdl', 'vnc', 'display', 'xauthority',
                             'keymap', 'opengl']
 
 # List of XendConfig configuration keys that have no direct equivalent
@@ -213,6 +218,7 @@ XENAPI_CFG_TYPES = {
     'cpuid_check' : dict,
     'machine_address_size': int,
     'suppress_spurious_page_faults': bool0,
+    's3_integrity' : int,
 }
 
 # List of legacy configuration keys that have no equivalent in the
@@ -256,6 +262,8 @@ LEGACY_CFG_TYPES = {
     'on_xend_start': str,
     'online_vcpus':  int,
     'rtc/timeoffset': str,
+    'bootloader':    str,
+    'bootloader_args': str,
 }
 
 # Values that should be stored in xenstore's /vm/<uuid> that is used
@@ -276,6 +284,8 @@ LEGACY_XENSTORE_VM_PARAMS = [
     'on_reboot',
     'on_xend_start',
     'on_xend_stop',
+    'bootloader',
+    'bootloader_args',
 ]
 
 ##
@@ -424,6 +434,8 @@ class XendConfig(dict):
     def _vcpus_sanity_check(self):
         if 'VCPUs_max' in self and 'vcpu_avail' not in self:
             self['vcpu_avail'] = (1 << self['VCPUs_max']) - 1
+        if 'online_vcpus' in self:
+            self['VCPUs_live'] = self['online_vcpus']
 
     def _uuid_sanity_check(self):
         """Make sure UUID is in proper string format with hyphens."""
@@ -453,6 +465,10 @@ class XendConfig(dict):
                 self['platform']['rtc_timeoffset'] = 0
             if 'hpet' not in self['platform']:
                 self['platform']['hpet'] = 0
+            if 'xen_platform_pci' not in self['platform']:
+                self['platform']['xen_platform_pci'] = 1
+            if 'vpt_align' not in self['platform']:
+                self['platform']['vpt_align'] = 1
             if 'loader' not in self['platform']:
                 # Old configs may have hvmloader set as PV_kernel param
                 if self.has_key('PV_kernel') and self['PV_kernel'] != '':
@@ -852,7 +868,7 @@ class XendConfig(dict):
             # add vfb device if it isn't there already
             if not self.has_rfb():
                 dev_config = ['vfb']
-                dev_config.append(['type', 'vnc'])
+                dev_config.append(['vnc', '1'])
                 # copy VNC related params from platform config to vfb dev conf
                 for key in ['vncpasswd', 'vncunused', 'vncdisplay',
                             'vnclisten']:
@@ -1032,8 +1048,6 @@ class XendConfig(dict):
                     sxpr.append([name, s])
 
         for xenapi, legacy in XENAPI_CFG_TO_LEGACY_CFG.items():
-            if legacy in ('cpus'): # skip this
-                continue
             if self.has_key(xenapi) and self[xenapi] not in (None, []):
                 if type(self[xenapi]) == bool:
                     # convert booleans to ints before making an sxp item
@@ -1144,7 +1158,7 @@ class XendConfig(dict):
                     return None
         return devid
     
-    def device_duplicate_check(self, dev_type, dev_info, defined_config):
+    def device_duplicate_check(self, dev_type, dev_info, defined_config, config):
         defined_devices_sxpr = self.all_devices_sxpr(target = defined_config)
         
         if dev_type == 'vbd' or dev_type == 'tap':
@@ -1163,9 +1177,34 @@ class XendConfig(dict):
                         if blkdev_file == o_blkdev_file:
                             raise XendConfigError('The file "%s" is already used' %
                                                   blkdev_file)
+                    if dev_uname == o_dev_uname:
+                        raise XendConfigError('The uname "%s" is already defined' %
+                                             dev_uname)
                     o_blkdev_name = sxp.child_value(o_dev_info, 'dev')
                     o_devid = self._blkdev_name_to_number(o_blkdev_name)
                     if o_devid != None and devid == o_devid:
+                        name_array = blkdev_name.split(':', 2)
+                        if len(name_array) == 2 and name_array[1] == 'cdrom':
+                            #
+                            # Since the device is a cdrom, we are most likely
+                            # inserting, changing, or removing a cd.  We can
+                            # update the old device instead of creating a new
+                            # one.
+                            #
+                            if o_dev_uname != None and dev_uname == None:
+                                #
+                                # We are removing a cd.  We can simply update
+                                # the uname on the existing device.
+                                #
+                                merge_sxp = sxp.from_string("('vbd' ('uname' ''))")
+                            else:
+                                merge_sxp = config
+
+                            dev_uuid = sxp.child_value(o_dev_info, 'uuid')
+                            if dev_uuid != None and \
+                               self.device_update(dev_uuid, cfg_sxp = merge_sxp):
+                                return dev_uuid
+
                         raise XendConfigError('The device "%s" is already defined' %
                                               blkdev_name)
                     
@@ -1177,6 +1216,7 @@ class XendConfig(dict):
                     if dev_mac.lower() == sxp.child_value(o_dev_info, 'mac').lower():
                         raise XendConfigError('The mac "%s" is already defined' %
                                               dev_mac)
+        return None
     
     def device_add(self, dev_type, cfg_sxp = None, cfg_xenapi = None,
                    target = None):
@@ -1245,6 +1285,11 @@ class XendConfig(dict):
                         'PPCI': ppci_uuid,
                         'hotplug_slot': pci_dev.get('vslot', 0)
                     }
+
+                    dpci_opts = pci_dev.get('opts')
+                    if dpci_opts and len(dpci_opts) > 0:
+                        dpci_record['options'] = dpci_opts
+
                     XendDPCI(dpci_uuid, dpci_record)
 
                 target['devices'][pci_devs_uuid] = (dev_type,
@@ -1260,6 +1305,8 @@ class XendConfig(dict):
                                                   uuid.createString())
                 vscsi_dict = self.vscsi_convert_sxp_to_dict(config)
                 vscsi_devs = vscsi_dict['devs']
+                vscsi_mode = vscsi_dict['feature-host']
+                vscsi_be = vscsi_dict.get('backend', None)
 
                 # create XenAPI DSCSI objects.
                 for vscsi_dev in vscsi_devs:
@@ -1274,9 +1321,16 @@ class XendConfig(dict):
                     }
                     XendDSCSI(dscsi_uuid, dscsi_record)
 
-                target['devices'][vscsi_devs_uuid] = \
-                    (dev_type, {'devs': vscsi_devs, 'uuid': vscsi_devs_uuid} )
-                log.debug("XendConfig: reading device: %s" % vscsi_devs)
+                vscsi_info = {
+                    'devs': vscsi_devs,
+                    'feature-host': vscsi_mode,
+                    'uuid': vscsi_devs_uuid
+                }
+                if vscsi_be is not None:
+                    vscsi_info['backend'] = vscsi_be
+                target['devices'][vscsi_devs_uuid] = (dev_type, vscsi_info)
+                log.debug("XendConfig: reading device: %s,%s" % \
+                          (vscsi_devs, vscsi_mode))
                 return vscsi_devs_uuid
 
             for opt_val in config[1:]:
@@ -1287,7 +1341,6 @@ class XendConfig(dict):
                     pass
 
             if dev_type == 'vbd':
-                dev_info['bootable'] = 0
                 if dev_info.get('dev', '').startswith('ioemu:'):
                     dev_info['driver'] = 'ioemu'
                 else:
@@ -1302,7 +1355,9 @@ class XendConfig(dict):
                 if not dev_info.get('mac'):
                     dev_info['mac'] = randomMAC()
 
-            self.device_duplicate_check(dev_type, dev_info, target)
+            ret_uuid = self.device_duplicate_check(dev_type, dev_info, target, config)
+            if ret_uuid != None:
+                return ret_uuid
 
             if dev_type == 'vif':
                 if dev_info.get('policy') and dev_info.get('label'):
@@ -1323,7 +1378,7 @@ class XendConfig(dict):
                 if param not in target:
                     target[param] = []
                 if dev_uuid not in target[param]:
-                    if dev_type == 'vbd':
+                    if dev_type == 'vbd' and 'bootable' not in dev_info:
                         # Compat hack -- mark first disk bootable
                         dev_info['bootable'] = int(not target[param])
                     target[param].append(dev_uuid)
@@ -1331,8 +1386,9 @@ class XendConfig(dict):
                 if 'vbd_refs' not in target:
                     target['vbd_refs'] = []
                 if dev_uuid not in target['vbd_refs']:
-                    # Compat hack -- mark first disk bootable
-                    dev_info['bootable'] = int(not target['vbd_refs'])
+                    if 'bootable' not in dev_info:
+                        # Compat hack -- mark first disk bootable
+                        dev_info['bootable'] = int(not target['vbd_refs'])
                     target['vbd_refs'].append(dev_uuid)
                     
             elif dev_type == 'vfb':
@@ -1445,7 +1501,8 @@ class XendConfig(dict):
                     # collapse other config into devinfo for things
                     # such as vncpasswd, vncunused, etc.                    
                     dev_info.update(console_other_config)
-                    dev_info['type'] = console_other_config.get('type', 'vnc') 
+                    dev_info['vnc'] = console_other_config.get('vnc', '0')
+                    dev_info['sdl'] = console_other_config.get('sdl', '0')
                     target['devices'][dev_uuid] = ('vfb', dev_info)
                     target['console_refs'].append(dev_uuid)
 
@@ -1566,7 +1623,7 @@ class XendConfig(dict):
                 try:
                     opt, val = opt_val
                     pci_dev_info[opt] = val
-                except TypeError:
+                except (TypeError, ValueError):
                     pass
             # append uuid for each pci device.
             dpci_uuid = pci_dev_info.get('uuid', uuid.createString())
@@ -1600,23 +1657,27 @@ class XendConfig(dict):
         #
         # [device,
         #   [vscsi,
+        #     [feature-host, 0],
+        #     [backend, 0],
         #     [dev,
         #       [devid, 0], [p-devname, sdb], [p-dev, 1:0:0:1],
-        #       [v-dev, 0:0:0:0], [state, Initialising]
+        #       [v-dev, 0:0:0:0], [state, 1]
         #     ],
         #     [dev,
         #       [devid, 0], [p-devname, sdc], [p-dev, 1:0:0:2],
-        #       [v-dev, 0:0:0:1], [satet, Initialising]
+        #       [v-dev, 0:0:0:1], [satet, 1]
         #     ]
         #   ],
         #   [vscsi,
+        #     [feature-host, 1],
+        #     [backend, 0],
         #     [dev,
         #       [devid, 1], [p-devname, sdg], [p-dev, 2:0:0:0],
-        #       [v-dev, 1:0:0:0], [state, Initialising]
+        #       [v-dev, 1:0:0:0], [state, 1]
         #     ],
         #     [dev,
         #       [devid, 1], [p-devname, sdh], [p-dev, 2:0:0:1],
-        #       [v-dev, 1:0:0:1], [satet, Initialising]
+        #       [v-dev, 1:0:0:1], [satet, 1]
         #     ]
         #   ]
         # ]
@@ -1630,20 +1691,24 @@ class XendConfig(dict):
         #
         # [device,
         #   [vscsi,
+        #     [feature-host, 0],
+        #     [backend, 0],
         #     [dev,
         #       [devid, 0], [p-devname, sdd], [p-dev, 1:0:0:3],
-        #       [v-dev, 0:0:0:2], [state, Initialising]
+        #       [v-dev, 0:0:0:2], [state, 1]
         #     ]
         #   ]
         # ]
         #
-        # state 'Initialising' indicates that the device is being attached,
-        # while state 'Closing' indicates that the device is being detached.
+        # state xenbusState['Initialising'] indicates that the device is 
+        # being attached, while state xenbusState['Closing'] indicates 
+        # that the device is being detached.
         #
         # The Dict looks like this:
         #
         # { devs: [ {devid: 0, p-devname: sdd, p-dev: 1:0:0:3,
-        #            v-dev: 0:0:0:2, state: Initialising} ] }
+        #            v-dev: 0:0:0:2, state: 1} ],
+        #   feature-host: 1 , backend: 0 }
 
         dev_config = {}
 
@@ -1662,6 +1727,14 @@ class XendConfig(dict):
             vscsi_devs.append(vscsi_dev_info)
         dev_config['devs'] = vscsi_devs 
 
+        vscsi_mode = sxp.children(dev_sxp, 'feature-host')[0]
+        dev_config['feature-host'] = vscsi_mode[1]
+        try:
+            vscsi_be = sxp.children(dev_sxp, 'backend')[0]
+            dev_config['backend'] = vscsi_be[1]
+        except IndexError:
+            pass
+
         return dev_config
 
     def console_add(self, protocol, location, other_config = {}):
@@ -1759,6 +1832,11 @@ class XendConfig(dict):
                         'PPCI': ppci_uuid,
                         'hotplug_slot': pci_dev.get('vslot', 0)
                     }
+
+                    dpci_opts = pci_dev.get('opts')
+                    if dpci_opts and len(dpci_opts) > 0:
+                        dpci_record['options'] = dpci_opts
+
                     XendDPCI(dpci_uuid, dpci_record)
 
                 self['devices'][dev_uuid] = (dev_type,
@@ -1769,10 +1847,15 @@ class XendConfig(dict):
             if dev_type == 'vscsi': # Special case for vscsi
                 vscsi_dict = self.vscsi_convert_sxp_to_dict(config)
                 vscsi_devs = vscsi_dict['devs']
+                vscsi_mode = vscsi_dict['feature-host']
+                vscsi_be = vscsi_dict.get('backend', None)
 
                 # destroy existing XenAPI DSCSI objects
+                vscsi_devid = int(dev_info['devs'][0]['devid'])
                 for dscsi_uuid in XendDSCSI.get_by_VM(self['uuid']):
-                    XendAPIStore.deregister(dscsi_uuid, "DSCSI")
+                    dscsi_inst = XendAPIStore.get(dscsi_uuid, 'DSCSI')
+                    if vscsi_devid == dscsi_inst.get_virtual_host():
+                        XendAPIStore.deregister(dscsi_uuid, "DSCSI")
 
                 # create XenAPI DSCSI objects.
                 for vscsi_dev in vscsi_devs:
@@ -1787,8 +1870,14 @@ class XendConfig(dict):
                     }
                     XendDSCSI(dscsi_uuid, dscsi_record)
 
-                self['devices'][dev_uuid] = \
-                    (dev_type, {'devs': vscsi_devs, 'uuid': dev_uuid} )
+                vscsi_info = { 
+                    'devs': vscsi_devs,
+                    'feature-host': vscsi_mode,
+                    'uuid': dev_uuid
+                }
+                if vscsi_be is not None:
+                    vscsi_info['backend'] = vscsi_be
+                self['devices'][dev_uuid] = (dev_type, vscsi_info)
                 return True
                 
             for opt_val in config[1:]:
@@ -1865,7 +1954,6 @@ class XendConfig(dict):
     def all_devices_sxpr(self, target = None):
         """Returns the SXPR for all devices in the current configuration."""
         sxprs = []
-        pci_devs = []
 
         if target == None:
             target = self
@@ -1880,7 +1968,10 @@ class XendConfig(dict):
                 if dev_type == 'pci':
                     sxpr = ['pci', ['uuid', dev_info['uuid']]]
                 elif dev_type == 'vscsi':
-                    sxpr = ['vscsi', ['uuid', dev_info['uuid']]]
+                    sxpr = ['vscsi', ['uuid', dev_info['uuid']],
+                                     ['feature-host', dev_info['feature-host']]]
+                    if dev_info.has_key('backend'):
+                        sxpr.append(['backend', dev_info['backend']])
                 for pci_dev_info in dev_info['devs']:
                     pci_dev_sxpr = ['dev']
                     for opt, val in pci_dev_info.items():
index 13e046a0868c25532ce3cc858ca085e5852d7879..b1c2957a7afca71ea3136dbce1d8c7ac3fd36300 100644 (file)
@@ -50,6 +50,7 @@ HVM_PARAM_VIRIDIAN     = 9 # x86
 HVM_PARAM_TIMER_MODE   = 10
 HVM_PARAM_HPET_ENABLED = 11
 HVM_PARAM_ACPI_S_STATE = 14
+HVM_PARAM_VPT_ALIGN    = 16
 
 restart_modes = [
     "restart",
@@ -95,7 +96,7 @@ SHUTDOWN_TIMEOUT = (60.0 * 5)
 ZOMBIE_PREFIX = 'Zombie-'
 
 """Minimum time between domain restarts in seconds."""
-MINIMUM_RESTART_TIME = 20
+MINIMUM_RESTART_TIME = 60
 
 RESTART_IN_PROGRESS = 'xend/restart_in_progress'
 DUMPCORE_IN_PROGRESS = 'xend/dumpcore_in_progress'
@@ -134,3 +135,6 @@ VTPM_DELETE_SCRIPT = '/etc/xen/scripts/vtpm-delete'
 
 XS_VMROOT = "/vm/"
 
+NR_PCI_DEV = 32
+AUTO_PHP_SLOT = NR_PCI_DEV
+AUTO_PHP_SLOT_STR = "%02x" % NR_PCI_DEV
index c5dc920cfb620a00765ef37ca66002ce1808a4f6..0a564d7c2306982afcf64f5fdc7d773420c1cb81 100644 (file)
@@ -41,7 +41,8 @@ class XendDPCI(XendBase):
                   'virtual_name',
                   'VM',
                   'PPCI',
-                  'hotplug_slot']
+                  'hotplug_slot',
+                  'options']
         return XendBase.getAttrRO() + attrRO
 
     def getAttrRW(self):
@@ -119,6 +120,8 @@ class XendDPCI(XendBase):
         self.VM = record['VM']
         self.PPCI = record['PPCI']
         self.hotplug_slot = record['hotplug_slot']
+        if 'options' in record.keys():
+            self.options = record['options']
 
     def destroy(self):
         xendom = XendDomain.instance()
@@ -152,3 +155,5 @@ class XendDPCI(XendBase):
     def get_hotplug_slot(self):
         return self.hotplug_slot
 
+    def get_options(self):
+        return self.options
index 9faebe95aaa0ff0663f481de92d8e3cdb1f9b058..b624f786ad160c2cc35b5ebbd66b3cf561ddf390 100644 (file)
@@ -50,7 +50,7 @@ from xen.xend.XendAPIConstants import *
 
 from xen.xend.xenstore.xstransact import xstransact
 from xen.xend.xenstore.xswatch import xswatch
-from xen.util import mkdir
+from xen.util import mkdir, rwlock
 from xen.xend import uuid
 
 xc = xen.lowlevel.xc.xc()
@@ -93,6 +93,8 @@ class XendDomain:
         self.managed_domains = {}
         self.domains_lock = threading.RLock()
 
+        self.policy_lock = rwlock.RWLock()
+
         # xen api instance vars
         # TODO: nothing uses this at the moment
         self._allow_new_domains = True
@@ -421,7 +423,7 @@ class XendDomain:
                     log.exception("Unable to recreate domain")
                     try:
                         xc.domain_pause(domid)
-                        do_FLR(domid)
+                        XendDomainInfo.do_FLR(domid)
                         xc.domain_destroy(domid)
                     except:
                         log.exception("Hard destruction of domain failed: %d" %
@@ -1139,16 +1141,21 @@ class XendDomain:
         """
 
         try:
-            return XendCheckpoint.restore(self, fd, paused=paused, relocating=relocating)
-        except XendError, e:
-            log.exception("Restore failed")
-            raise
-        except:
-            # I don't really want to log this exception here, but the error
-            # handling in the relocation-socket handling code (relocate.py) is
-            # poor, so we need to log this for debugging.
-            log.exception("Restore failed")
-            raise XendError("Restore failed")
+            self.policy_lock.acquire_reader()
+
+            try:
+                return XendCheckpoint.restore(self, fd, paused=paused, relocating=relocating)
+            except XendError, e:
+                log.exception("Restore failed")
+                raise
+            except:
+                # I don't really want to log this exception here, but the error
+                # handling in the relocation-socket handling code (relocate.py) is
+                # poor, so we need to log this for debugging.
+                log.exception("Restore failed")
+                raise XendError("Restore failed")
+        finally:
+            self.policy_lock.release()
  
     def domain_unpause(self, domid):
         """Unpause domain execution.
@@ -1216,7 +1223,7 @@ class XendDomain:
             log.exception("domain_pause")
             raise XendError(str(ex))
 
-    def domain_dump(self, domid, filename, live, crash):
+    def domain_dump(self, domid, filename=None, live=False, crash=False, reset=False):
         """Dump domain core."""
 
         dominfo = self.domain_lookup_nr(domid)
@@ -1230,13 +1237,25 @@ class XendDomain:
                              POWER_STATE_NAMES[DOM_STATE_PAUSED],
                              POWER_STATE_NAMES[dominfo._stateGet()])
 
+        dopause = (not live and dominfo._stateGet() == DOM_STATE_RUNNING)
+        if dopause:
+            dominfo.pause()
+
         try:
-            log.info("Domain core dump requested for domain %s (%d) "
-                     "live=%d crash=%d.",
-                     dominfo.getName(), dominfo.getDomid(), live, crash)
-            return dominfo.dumpCore(filename)
-        except Exception, ex:
-            raise XendError(str(ex))
+            try:
+                log.info("Domain core dump requested for domain %s (%d) "
+                         "live=%d crash=%d reset=%d.",
+                         dominfo.getName(), dominfo.getDomid(), live, crash, reset)
+                dominfo.dumpCore(filename)
+                if crash:
+                    self.domain_destroy(domid)
+                elif reset:
+                    self.domain_reset(domid)
+            except Exception, ex:
+                raise XendError(str(ex))
+        finally:
+            if dopause and not crash and not reset:
+                dominfo.unpause()
 
     def domain_destroy(self, domid):
         """Terminate domain immediately.
@@ -1257,7 +1276,7 @@ class XendDomain:
         else:
             try:
                 xc.domain_pause(int(domid))
-                do_FLR(int(domid))
+                XendDomainInfo.do_FLR(int(domid))
                 val = xc.domain_destroy(int(domid))
             except ValueError:
                 raise XendInvalidDomain(domid)
index 9784f47c9d5f4c971d99c283e727f7258b917d24..ea68657c88e1051a1f8a6fc880600a7642af2e1d 100644 (file)
@@ -52,6 +52,7 @@ from xen.xend.xenstore.xsutil import GetDomainPath, IntroduceDomain, SetTarget,
 from xen.xend.xenstore.xswatch import xswatch
 from xen.xend.XendConstants import *
 from xen.xend.XendAPIConstants import *
+from xen.xend.server.DevConstants import xenbusState
 
 from xen.xend.XendVMMetrics import XendVMMetrics
 
@@ -289,19 +290,21 @@ def dom_get(dom):
         log.trace("domain_getinfo(%d) failed, ignoring: %s", dom, str(err))
     return None
 
-def do_FLR(domid):
-    from xen.xend.server.pciif import parse_pci_name, PciDevice
+def get_assigned_pci_devices(domid):
+    dev_str_list = []
     path = '/local/domain/0/backend/pci/%u/0/' % domid
     num_devs = xstransact.Read(path + 'num_devs');
     if num_devs is None or num_devs == "":
-        return;
-
-    num_devs = int(xstransact.Read(path + 'num_devs'));
-
-    dev_str_list = []
+        return dev_str_list
+    num_devs = int(num_devs);
     for i in range(num_devs):
         dev_str = xstransact.Read(path + 'dev-%i' % i)
         dev_str_list = dev_str_list + [dev_str]
+    return dev_str_list 
+
+def do_FLR(domid):
+    from xen.xend.server.pciif import parse_pci_name, PciDevice
+    dev_str_list = get_assigned_pci_devices(domid)
 
     for dev_str in dev_str_list:
         (dom, b, d, f) = parse_pci_name(dev_str)
@@ -478,6 +481,14 @@ class XendDomainInfo:
         if state in (DOM_STATE_SUSPENDED, DOM_STATE_HALTED):
             try:
                 self._constructDomain()
+
+                try:
+                    self._setCPUAffinity()
+                except:
+                    # usually a CPU we want to set affinity to does not exist
+                    # we just ignore it so that the domain can still be restored
+                    log.warn("Cannot restore CPU affinity")
+
                 self._storeVmDetails()
                 self._createChannels()
                 self._createDevices()
@@ -508,7 +519,8 @@ class XendDomainInfo:
         # HVM domain shuts itself down only if it has PV drivers
         if self.info.is_hvm():
             hvm_pvdrv = xc.hvm_get_param(self.domid, HVM_PARAM_CALLBACK_IRQ)
-            if not hvm_pvdrv:
+            hvm_s_state = xc.hvm_get_param(self.domid, HVM_PARAM_ACPI_S_STATE)
+            if not hvm_pvdrv or hvm_s_state != 0:
                 code = REVERSE_DOMAIN_SHUTDOWN_REASONS[reason]
                 log.info("HVM save:remote shutdown dom %d!", self.domid)
                 xc.domain_shutdown(self.domid, code)
@@ -635,10 +647,66 @@ class XendDomainInfo:
                           " already been assigned to other domain, or maybe"
                           " it doesn't exist." % (bus, dev, func))
 
-        bdf_str = "%s:%s:%s.%s@%s" % (new_dev['domain'],
+        # Here, we duplicate some checkings (in some cases, we mustn't allow
+        # a device to be hot-plugged into an HVM guest) that are also done in
+        # pci_device_configure()'s self.device_create(dev_sxp) or
+        # dev_control.reconfigureDevice(devid, dev_config).
+        # We must make the checkings before sending the command 'pci-ins' to
+        # ioemu.
+
+        # Test whether the device is owned by pciback. For instance, we can't
+        # hotplug a device being used by Dom0 itself to an HVM guest.
+        from xen.xend.server.pciif import PciDevice, parse_pci_name
+        domain = int(new_dev['domain'],16)
+        bus    = int(new_dev['bus'],16)
+        dev    = int(new_dev['slot'],16)
+        func   = int(new_dev['func'],16)
+        try:
+            pci_device = PciDevice(domain, bus, dev, func)
+        except Exception, e:
+            raise VmError("pci: failed to locate device and "+
+                    "parse it's resources - "+str(e))
+        if pci_device.driver!='pciback':
+            raise VmError(("pci: PCI Backend does not own device "+ \
+                    "%s\n"+ \
+                    "See the pciback.hide kernel "+ \
+                    "command-line parameter or\n"+ \
+                    "bind your slot/device to the PCI backend using sysfs" \
+                    )%(pci_device.name))
+
+        # Check non-page-aligned MMIO BAR.
+        if pci_device.has_non_page_aligned_bar and arch.type != "ia64":
+            raise VmError("pci: %s: non-page-aligned MMIO BAR found." % \
+                pci_device.name)
+
+        # Check the co-assignment.
+        # To pci-attach a device D to domN, we should ensure each of D's
+        # co-assignment devices hasn't been assigned, or has been assigned to
+        # domN.
+        coassignment_list = pci_device.find_coassigned_devices()
+        assigned_pci_device_str_list = get_assigned_pci_devices(self.domid)
+        for pci_str in coassignment_list:
+            (domain, bus, dev, func) = parse_pci_name(pci_str) 
+            dev_str =  '0x%x,0x%x,0x%x,0x%x' % (domain, bus, dev, func)
+            if xc.test_assign_device(self.domid, dev_str) == 0:
+                continue
+            if not pci_str in assigned_pci_device_str_list:
+                raise VmError(("pci: failed to pci-attach %s to dom%d"
+                    " because one of its co-assignment device %s has been"
+                    " assigned to other domain."
+                    )% (pci_device.name, self.domid, pci_str))
+
+        opts = ''
+        if 'opts' in new_dev and len(new_dev['opts']) > 0:
+            config_opts = new_dev['opts']
+            config_opts = map(lambda (x, y): x+'='+y, config_opts)
+            opts = ',' + reduce(lambda x, y: x+','+y, config_opts)
+
+        bdf_str = "%s:%s:%s.%s%s@%s" % (new_dev['domain'],
                 new_dev['bus'],
                 new_dev['slot'],
                 new_dev['func'],
+                opts,
                 new_dev['vslt'])
         self.image.signalDeviceModel('pci-ins', 'pci-inserted', bdf_str)
 
@@ -665,7 +733,7 @@ class XendDomainInfo:
                 if dev_type == 'pci':
                     for dev in dev_config_dict['devs']:
                         XendAPIStore.deregister(dev['uuid'], 'DPCI')
-                if dev_type == 'vscsi':
+                elif dev_type == 'vscsi':
                     for dev in dev_config_dict['devs']:
                         XendAPIStore.deregister(dev['uuid'], 'DSCSI')
                 elif dev_type == 'tap':
@@ -725,7 +793,7 @@ class XendDomainInfo:
                 existing_dev_uuid = sxp.child_value(existing_dev_info, 'uuid')
                 existing_pci_conf = self.info['devices'][existing_dev_uuid][1]
                 existing_pci_devs = existing_pci_conf['devs']
-                vslt = '0x0'
+                vslt = AUTO_PHP_SLOT_STR
                 for x in existing_pci_devs:
                     if ( int(x['domain'], 16) == int(dev['domain'], 16) and
                          int(x['bus'], 16) == int(dev['bus'], 16) and
@@ -733,7 +801,7 @@ class XendDomainInfo:
                          int(x['func'], 16) == int(dev['func'], 16) ):
                         vslt = x['vslt']
                         break
-                if vslt == '0x0':
+                if vslt == AUTO_PHP_SLOT_STR:
                     raise VmError("Device %04x:%02x:%02x.%01x is not connected"
                                   % (int(dev['domain'],16), int(dev['bus'],16),
                                      int(dev['slot'],16), int(dev['func'],16)))
@@ -787,44 +855,122 @@ class XendDomainInfo:
         """Configure an existing vscsi device.
             quoted pci funciton
         """
+        def _is_vscsi_defined(dev_info, p_devs = None, v_devs = None):
+            if not dev_info:
+                return False
+            for dev in sxp.children(dev_info, 'dev'):
+                if p_devs is not None:
+                    if sxp.child_value(dev, 'p-dev') in p_devs:
+                        return True
+                if v_devs is not None:
+                    if sxp.child_value(dev, 'v-dev') in v_devs:
+                        return True
+            return False
+
+        def _vscsi_be(be):
+            be_xdi = xen.xend.XendDomain.instance().domain_lookup_nr(be)
+            if be_xdi is not None:
+                be_domid = be_xdi.getDomid()
+                if be_domid is not None:
+                    return str(be_domid)
+            return str(be)
+
         dev_class = sxp.name(dev_sxp)
         if dev_class != 'vscsi':
             return False
 
         dev_config = self.info.vscsi_convert_sxp_to_dict(dev_sxp)
-        dev = dev_config['devs'][0]
-        req_devid = int(dev['devid'])
-        existing_dev_info = self._getDeviceInfo_vscsi(req_devid, dev['v-dev'])
-        state = dev['state']
+        devs = dev_config['devs']
+        v_devs = [d['v-dev'] for d in devs]
+        state = devs[0]['state']
+        req_devid = int(devs[0]['devid'])
+        cur_dev_sxp = self._getDeviceInfo_vscsi(req_devid)
 
-        if state == 'Initialising':
+        if state == xenbusState['Initialising']:
             # new create
             # If request devid does not exist, create and exit.
-            if existing_dev_info is None:
+            p_devs = [d['p-dev'] for d in devs]
+            for dev_type, dev_info in self.info.all_devices_sxpr():
+                if dev_type != 'vscsi':
+                    continue
+                if _is_vscsi_defined(dev_info, p_devs = p_devs):
+                    raise XendError('The physical device "%s" is already defined' % \
+                                    p_devs[0])
+            if cur_dev_sxp is None:
                 self.device_create(dev_sxp)
                 return True
-            elif existing_dev_info == "exists":
-                raise XendError("The virtual device %s is already defined" % dev['v-dev'])
 
-        elif state == 'Closing':
-            if existing_dev_info is None:
+            if _is_vscsi_defined(cur_dev_sxp, v_devs = v_devs):
+                raise XendError('The virtual device "%s" is already defined' % \
+                                v_devs[0])
+
+            if int(dev_config['feature-host']) != \
+               int(sxp.child_value(cur_dev_sxp, 'feature-host')):
+                raise XendError('The physical device "%s" cannot define '
+                                'because mode is different' % devs[0]['p-dev'])
+
+            new_be = dev_config.get('backend', None)
+            if new_be is not None:
+                cur_be = sxp.child_value(cur_dev_sxp, 'backend', None)
+                if cur_be is None:
+                    cur_be = xen.xend.XendDomain.DOM0_ID
+                new_be_dom = _vscsi_be(new_be)
+                cur_be_dom = _vscsi_be(cur_be)
+                if new_be_dom != cur_be_dom:
+                    raise XendError('The physical device "%s" cannot define '
+                                    'because backend is different' % devs[0]['p-dev'])
+
+        elif state == xenbusState['Closing']:
+            if not _is_vscsi_defined(cur_dev_sxp, v_devs = v_devs):
                 raise XendError("Cannot detach vscsi device does not exist")
 
-        # use DevController.reconfigureDevice to change device config
-        dev_control = self.getDeviceController(dev_class)
-        dev_uuid = dev_control.reconfigureDevice(req_devid, dev_config)
-        dev_control.waitForDevice_reconfigure(req_devid)
-        num_devs = dev_control.cleanupDevice(req_devid)
+        if self.domid is not None:
+            # use DevController.reconfigureDevice to change device config
+            dev_control = self.getDeviceController(dev_class)
+            dev_uuid = dev_control.reconfigureDevice(req_devid, dev_config)
+            dev_control.waitForDevice_reconfigure(req_devid)
+            num_devs = dev_control.cleanupDevice(req_devid)
+
+            # update XendConfig with new device info
+            if dev_uuid:
+                new_dev_sxp = dev_control.configuration(req_devid)
+                self.info.device_update(dev_uuid, new_dev_sxp)
+
+            # If there is no device left, destroy vscsi and remove config.
+            if num_devs == 0:
+                self.destroyDevice('vscsi', req_devid)
+                del self.info['devices'][dev_uuid]
 
-        # update XendConfig with new device info
-        if dev_uuid:
-            new_dev_sxp = dev_control.configuration(req_devid)
+        else:
+            new_dev_sxp = ['vscsi']
+            cur_mode = sxp.children(cur_dev_sxp, 'feature-host')[0]
+            new_dev_sxp.append(cur_mode)
+            try:
+                cur_be = sxp.children(cur_dev_sxp, 'backend')[0]
+                new_dev_sxp.append(cur_be)
+            except IndexError:
+                pass
+
+            for cur_dev in sxp.children(cur_dev_sxp, 'dev'):
+                if state == xenbusState['Closing']:
+                    if int(cur_mode[1]) == 1:
+                        continue
+                    if sxp.child_value(cur_dev, 'v-dev') in v_devs:
+                        continue
+                new_dev_sxp.append(cur_dev)
+
+            if state == xenbusState['Initialising']:
+                for new_dev in sxp.children(dev_sxp, 'dev'):
+                    new_dev_sxp.append(new_dev)
+
+            dev_uuid = sxp.child_value(cur_dev_sxp, 'uuid')
             self.info.device_update(dev_uuid, new_dev_sxp)
 
-        # If there is no device left, destroy vscsi and remove config.
-        if num_devs == 0:
-            self.destroyDevice('vscsi', req_devid)
-            del self.info['devices'][dev_uuid]
+            # If there is only 'vscsi' in new_dev_sxp, remove the config.
+            if len(sxp.children(new_dev_sxp, 'dev')) == 0:
+                del self.info['devices'][dev_uuid]
+
+        xen.xend.XendDomain.instance().managed_config_save(self)
 
         return True
 
@@ -902,6 +1048,31 @@ class XendDomainInfo:
         if vslot == 0:
             raise VmError("Device @ vslot 0x%x do not support hotplug." % (vslot))
 
+        # Check the co-assignment.
+        # To pci-detach a device D from domN, we should ensure: for each DD in the
+        # list of D's co-assignment devices, DD is not assigned (to domN).
+        # 
+        from xen.xend.server.pciif import PciDevice
+        domain = int(x['domain'],16)
+        bus    = int(x['bus'],16)
+        dev    = int(x['slot'],16)
+        func   = int(x['func'],16)
+        try:
+            pci_device = PciDevice(domain, bus, dev, func)
+        except Exception, e:
+            raise VmError("pci: failed to locate device and "+
+                    "parse it's resources - "+str(e))
+        coassignment_list = pci_device.find_coassigned_devices()
+        coassignment_list.remove(pci_device.name)
+        assigned_pci_device_str_list = get_assigned_pci_devices(self.domid)
+        for pci_str in coassignment_list:
+            if pci_str in assigned_pci_device_str_list:
+                raise VmError(("pci: failed to pci-detach %s from dom%d"
+                    " because one of its co-assignment device %s is still"
+                    " assigned to the domain."
+                    )% (pci_device.name, self.domid, pci_str))
+
+
         bdf_str = "%s:%s:%s.%s" % (x['domain'], x['bus'], x['slot'], x['func'])
         log.info("hvm_destroyPCIDevice:%s:%s!", x, bdf_str)
 
@@ -986,7 +1157,29 @@ class XendDomainInfo:
             sxprs = []
             dev_num = 0
             for dev_type, dev_info in self.info.all_devices_sxpr():
-                if dev_type == deviceClass:
+                if (deviceClass == 'vbd' and dev_type not in ['vbd', 'tap']) or \
+                   (deviceClass != 'vbd' and dev_type != deviceClass):
+                    continue
+
+                if deviceClass == 'vscsi':
+                    vscsi_devs = ['devs', []]
+                    for vscsi_dev in sxp.children(dev_info, 'dev'):
+                        vscsi_dev.append(['frontstate', None])
+                        vscsi_devs[1].append(vscsi_dev)
+                        dev_num = int(sxp.child_value(vscsi_dev, 'devid'))
+                    vscsi_mode = sxp.children(dev_info, 'feature-host')[0]
+                    sxprs.append([dev_num, [vscsi_devs, vscsi_mode]])
+                elif deviceClass == 'vbd':
+                    dev = sxp.child_value(dev_info, 'dev')
+                    if 'ioemu:' in dev:
+                        (_, dev) = dev.split(':', 1)
+                    try:
+                        (dev_name, _) = dev.split(':', 1)  # Remove ":disk" or ":cdrom"
+                    except ValueError:
+                        dev_name = dev
+                    dev_num = self.getDeviceController('vbd').convertToDeviceNumber(dev_name)
+                    sxprs.append([dev_num, dev_info])
+                else:
                     sxprs.append([dev_num, dev_info])
                     dev_num += 1
             return sxprs
@@ -1023,23 +1216,14 @@ class XendDomainInfo:
             return dev_info
         return None
 
-    def _getDeviceInfo_vscsi(self, devid, vdev):
+    def _getDeviceInfo_vscsi(self, devid):
         devid = int(devid)
         for dev_type, dev_info in self.info.all_devices_sxpr():
             if dev_type != 'vscsi':
                 continue
-            existing_dev_uuid = sxp.child_value(dev_info, 'uuid')
-            existing_conf = self.info['devices'][existing_dev_uuid][1]
-            existing_dev = existing_conf['devs'][0]
-            existing_devid = int(existing_dev['devid'])
-            existing_vdev = existing_dev['v-dev']
-
-            if vdev == existing_vdev:
-                return "exists"
-
-            if devid == existing_devid:
+            devs = sxp.children(dev_info, 'dev')
+            if devid == int(sxp.child_value(devs[0], 'devid')):
                 return dev_info
-
         return None
 
     def setMemoryTarget(self, target):
@@ -1050,10 +1234,10 @@ class XendDomainInfo:
                   self.info['name_label'], str(self.domid), target)
         
         MiB = 1024 * 1024
+        memory_cur = self.get_memory_dynamic_max() / MiB
 
         if self.domid == 0:
             dom0_min_mem = xoptions.get_dom0_min_mem()
-            memory_cur = self.get_memory_dynamic_max() / MiB
             if target < memory_cur and dom0_min_mem > target:
                 raise XendError("memory_dynamic_max too small")
 
@@ -1061,8 +1245,12 @@ class XendDomainInfo:
         self._safe_set_memory('memory_dynamic_max', target * MiB)
 
         if self.domid >= 0:
+            if target > memory_cur:
+                balloon.free((target - memory_cur) * 1024, self)
             self.storeVm("memory", target)
             self.storeDom("memory/target", target << 10)
+            xc.domain_set_target_mem(self.domid,
+                                     (target * 1024))
         xen.xend.XendDomain.instance().managed_config_save(self)
 
     def setMemoryMaximum(self, limit):
@@ -1222,7 +1410,8 @@ class XendDomainInfo:
             for dev_uuid, (dev_type, dev_info) in self.info['devices'].items():
                 if dev_type == 'vfb':
                     old_location = dev_info.get('location')
-                    listen_host = dev_info.get('vnclisten', 'localhost')
+                    listen_host = dev_info.get('vnclisten', \
+                                XendOptions.instance().get_vnclisten_address())
                     new_location = '%s:%s' % (listen_host, str(vnc_port))
                     if old_location == new_location:
                         break
@@ -1307,7 +1496,8 @@ class XendDomainInfo:
         t.mkdir()
         t.set_permissions({'dom' : self.domid, 'read' : True})
         t.write('vm', self.vmpath)
-        for i in [ 'device', 'control', 'error', 'memory' ]:
+        # NB. Solaris guests use guest/ and hvmpv/ xenstore directories
+        for i in [ 'device', 'control', 'error', 'memory', 'guest', 'hvmpv' ]:
             t.mkdir(i)
             t.set_permissions(i, {'dom' : self.domid})
 
@@ -1506,23 +1696,18 @@ class XendDomainInfo:
         return self.info['VCPUs_max']
 
     def setVCpuCount(self, vcpus):
-        if vcpus <= 0:
-            raise XendError('Invalid VCPUs')
+        def vcpus_valid(n):
+            if n <= 0:
+                raise XendError('Zero or less VCPUs is invalid')
+            if self.domid >= 0 and n > self.info['VCPUs_max']:
+                raise XendError('Cannot set vcpus greater than max vcpus on running domain')
+        vcpus_valid(vcpus)
         
         self.info['vcpu_avail'] = (1 << vcpus) - 1
         if self.domid >= 0:
             self.storeVm('vcpu_avail', self.info['vcpu_avail'])
-            # update dom differently depending on whether we are adjusting
-            # vcpu number up or down, otherwise _vcpuDomDetails does not
-            # disable the vcpus
-            if self.info['VCPUs_max'] > vcpus:
-                # decreasing
-                self._writeDom(self._vcpuDomDetails())
-                self.info['VCPUs_live'] = vcpus
-            else:
-                # same or increasing
-                self.info['VCPUs_live'] = vcpus
-                self._writeDom(self._vcpuDomDetails())
+            self._writeDom(self._vcpuDomDetails())
+            self.info['VCPUs_live'] = vcpus
         else:
             if self.info['VCPUs_max'] > vcpus:
                 # decreasing
@@ -1532,7 +1717,7 @@ class XendDomainInfo:
                 for c in range(self.info['VCPUs_max'], vcpus):
                     self.info['cpus'].append(list())
             self.info['VCPUs_max'] = vcpus
-            xen.xend.XendDomain.instance().managed_config_save(self)
+        xen.xend.XendDomain.instance().managed_config_save(self)
         log.info("Set VCPU count on domain %s to %d", self.info['name_label'],
                  vcpus)
 
@@ -1844,26 +2029,31 @@ class XendDomainInfo:
         @raise: XendError if core dumping failed.
         """
         
-        try:
-            if not corefile:
-                this_time = time.strftime("%Y-%m%d-%H%M.%S", time.localtime())
-                corefile = "/var/xen/dump/%s-%s.%s.core" % (this_time,
-                                  self.info['name_label'], self.domid)
+        if not corefile:
+            this_time = time.strftime("%Y-%m%d-%H%M.%S", time.localtime())
+            corefile = "/var/xen/dump/%s-%s.%s.core" % (this_time,
+                              self.info['name_label'], self.domid)
                 
-            if os.path.isdir(corefile):
-                raise XendError("Cannot dump core in a directory: %s" %
-                                corefile)
-            
-            self._writeVm(DUMPCORE_IN_PROGRESS, 'True')
-            xc.domain_dumpcore(self.domid, corefile)
-            self._removeVm(DUMPCORE_IN_PROGRESS)
-        except RuntimeError, ex:
-            corefile_incomp = corefile+'-incomplete'
-            os.rename(corefile, corefile_incomp)
+        if os.path.isdir(corefile):
+            raise XendError("Cannot dump core in a directory: %s" %
+                            corefile)
+
+        try:
+            try:
+                self._writeVm(DUMPCORE_IN_PROGRESS, 'True')
+                xc.domain_dumpcore(self.domid, corefile)
+            except RuntimeError, ex:
+                corefile_incomp = corefile+'-incomplete'
+                try:
+                    os.rename(corefile, corefile_incomp)
+                except:
+                    pass
+
+                log.error("core dump failed: id = %s name = %s: %s",
+                          self.domid, self.info['name_label'], str(ex))
+                raise XendError("Failed to dump core: %s" %  str(ex))
+        finally:
             self._removeVm(DUMPCORE_IN_PROGRESS)
-            log.exception("XendDomainInfo.dumpCore failed: id = %s name = %s",
-                          self.domid, self.info['name_label'])
-            raise XendError("Failed to dump core: %s" %  str(ex))
 
     #
     # Device creation/deletion functions
@@ -1953,13 +2143,21 @@ class XendDomainInfo:
             for devclass in XendDevices.valid_devices():
                 for dev in t.list(devclass):
                     try:
+                        true_devclass = devclass
+                        if devclass == 'vbd':
+                            # In the case of "vbd", the true device class
+                            # may possibly be "tap". Just in case, verify
+                            # device class.
+                            devid = dev.split('/')[-1]
+                            true_devclass = self.getBlockDeviceClass(devid)
                         log.debug("Removing %s", dev);
-                        self.destroyDevice(devclass, dev, False);
+                        self.destroyDevice(true_devclass, dev, False);
                     except:
                         # Log and swallow any exceptions in removal --
                         # there's nothing more we can do.
                         log.exception("Device release failed: %s; %s; %s",
-                                      self.info['name_label'], devclass, dev)
+                                      self.info['name_label'],
+                                      true_devclass, dev)
         finally:
             t.abort()
 
@@ -2048,7 +2246,7 @@ class XendDomainInfo:
         # overhead is greater for some types of domain than others. For
         # example, an x86 HVM domain will have a default shadow-pagetable
         # allocation of 1MB. We free up 2MB here to be on the safe side.
-        balloon.free(2*1024) # 2MB should be plenty
+        balloon.free(2*1024, self) # 2MB should be plenty
 
         ssidref = 0
         if security.on() == xsconstants.XS_POLICY_USE:
@@ -2056,12 +2254,17 @@ class XendDomainInfo:
             if security.has_authorization(ssidref) == False:
                 raise VmError("VM is not authorized to run.")
 
+        s3_integrity = 0
+        if self.info.has_key('s3_integrity'):
+            s3_integrity = self.info['s3_integrity']
+        flags = (int(hvm) << 0) | (int(hap) << 1) | (int(s3_integrity) << 2)
+
         try:
             self.domid = xc.domain_create(
                 domid = 0,
                 ssidref = ssidref,
                 handle = uuid.fromString(self.info['uuid']),
-                flags = (int(hvm) << 0) | (int(hap) << 1),
+                flags = flags,
                 target = self.info.target())
         except Exception, e:
             # may get here if due to ACM the operation is not permitted
@@ -2093,11 +2296,21 @@ class XendDomainInfo:
             xc.hvm_set_param(self.domid, HVM_PARAM_HPET_ENABLED,
                              long(hpet))
 
+        # Optionally enable periodic vpt aligning
+        vpt_align = self.info["platform"].get("vpt_align")
+        if hvm and vpt_align is not None:
+            xc.hvm_set_param(self.domid, HVM_PARAM_VPT_ALIGN,
+                             long(vpt_align))
+
         # Set maximum number of vcpus in domain
         xc.domain_max_vcpus(self.domid, int(self.info['VCPUs_max']))
 
         # Test whether the devices can be assigned with VT-d
-        pci_str = str(self.info["platform"].get("pci"))
+        pci = self.info["platform"].get("pci")
+        pci_str = ''
+        if pci and len(pci) > 0:
+            pci = map(lambda x: x[0:4], pci)  # strip options 
+            pci_str = str(pci)
         if hvm and pci_str:
             bdf = xc.test_assign_device(self.domid, pci_str)
             if bdf != 0:
@@ -2137,6 +2350,64 @@ class XendDomainInfo:
             raise XendError(str(exn))
 
 
+    def _setCPUAffinity(self):
+        """ Repin domain vcpus if a restricted cpus list is provided
+        """
+
+        def has_cpus():
+            if self.info['cpus'] is not None:
+                for c in self.info['cpus']:
+                    if c:
+                        return True
+            return False
+
+        if has_cpus():
+            for v in range(0, self.info['VCPUs_max']):
+                if self.info['cpus'][v]:
+                    xc.vcpu_setaffinity(self.domid, v, self.info['cpus'][v])
+        else:
+            def find_relaxed_node(node_list):
+                import sys
+                nr_nodes = info['nr_nodes']
+                if node_list is None:
+                    node_list = range(0, nr_nodes)
+                nodeload = [0]
+                nodeload = nodeload * nr_nodes
+                from xen.xend import XendDomain
+                doms = XendDomain.instance().list('all')
+                for dom in filter (lambda d: d.domid != self.domid, doms):
+                    cpuinfo = dom.getVCPUInfo()
+                    for vcpu in sxp.children(cpuinfo, 'vcpu'):
+                        if sxp.child_value(vcpu, 'online') == 0: continue
+                        cpumap = list(sxp.child_value(vcpu,'cpumap'))
+                        for i in range(0, nr_nodes):
+                            node_cpumask = info['node_to_cpu'][i]
+                            for j in node_cpumask:
+                                if j in cpumap:
+                                    nodeload[i] += 1
+                                    break
+                for i in range(0, nr_nodes):
+                    if len(info['node_to_cpu'][i]) > 0 and i in node_list:
+                        nodeload[i] = int(nodeload[i] * 16 / len(info['node_to_cpu'][i]))
+                    else:
+                        nodeload[i] = sys.maxint
+                index = nodeload.index( min(nodeload) )    
+                return index
+
+            info = xc.physinfo()
+            if info['nr_nodes'] > 1:
+                node_memory_list = info['node_to_memory']
+                needmem = self.image.getRequiredAvailableMemory(self.info['memory_dynamic_max']) / 1024
+                candidate_node_list = []
+                for i in range(0, info['nr_nodes']):
+                    if node_memory_list[i] >= needmem and len(info['node_to_cpu'][i]) > 0:
+                        candidate_node_list.append(i)
+                index = find_relaxed_node(candidate_node_list)
+                cpumask = info['node_to_cpu'][index]
+                for v in range(0, self.info['VCPUs_max']):
+                    xc.vcpu_setaffinity(self.domid, v, cpumask)
+
+
     def _initDomain(self):
         log.debug('XendDomainInfo.initDomain: %s %s',
                   self.domid,
@@ -2156,58 +2427,7 @@ class XendDomainInfo:
             # repin domain vcpus if a restricted cpus list is provided
             # this is done prior to memory allocation to aide in memory
             # distribution for NUMA systems.
-            def has_cpus():
-                if self.info['cpus'] is not None:
-                    for c in self.info['cpus']:
-                        if c:
-                            return True
-                return False
-
-            if has_cpus():
-                for v in range(0, self.info['VCPUs_max']):
-                    if self.info['cpus'][v]:
-                        xc.vcpu_setaffinity(self.domid, v, self.info['cpus'][v])
-            else:
-                def find_relaxed_node(node_list):
-                    import sys
-                    nr_nodes = info['nr_nodes']
-                    if node_list is None:
-                        node_list = range(0, nr_nodes)
-                    nodeload = [0]
-                    nodeload = nodeload * nr_nodes
-                    from xen.xend import XendDomain
-                    doms = XendDomain.instance().list('all')
-                    for dom in filter (lambda d: d.domid != self.domid, doms):
-                        cpuinfo = dom.getVCPUInfo()
-                        for vcpu in sxp.children(cpuinfo, 'vcpu'):
-                            if sxp.child_value(vcpu, 'online') == 0: continue
-                            cpumap = list(sxp.child_value(vcpu,'cpumap'))
-                            for i in range(0, nr_nodes):
-                                node_cpumask = info['node_to_cpu'][i]
-                                for j in node_cpumask:
-                                    if j in cpumap:
-                                        nodeload[i] += 1
-                                        break
-                    for i in range(0, nr_nodes):
-                        if len(info['node_to_cpu'][i]) > 0 and i in node_list:
-                            nodeload[i] = int(nodeload[i] * 16 / len(info['node_to_cpu'][i]))
-                        else:
-                            nodeload[i] = sys.maxint
-                    index = nodeload.index( min(nodeload) )    
-                    return index
-
-                info = xc.physinfo()
-                if info['nr_nodes'] > 1:
-                    node_memory_list = info['node_to_memory']
-                    needmem = self.image.getRequiredAvailableMemory(self.info['memory_dynamic_max']) / 1024
-                    candidate_node_list = []
-                    for i in range(0, info['nr_nodes']):
-                        if node_memory_list[i] >= needmem and len(info['node_to_cpu'][i]) > 0:
-                            candidate_node_list.append(i)
-                    index = find_relaxed_node(candidate_node_list)
-                    cpumask = info['node_to_cpu'][index]
-                    for v in range(0, self.info['VCPUs_max']):
-                        xc.vcpu_setaffinity(self.domid, v, cpumask)
+            self._setCPUAffinity()
 
             # Use architecture- and image-specific calculations to determine
             # the various headrooms necessary, given the raw configured
@@ -2235,7 +2455,7 @@ class XendDomainInfo:
             vtd_mem = ((vtd_mem + 1023) / 1024) * 1024
 
             # Make sure there's enough RAM available for the domain
-            balloon.free(memory + shadow + vtd_mem)
+            balloon.free(memory + shadow + vtd_mem, self)
 
             # Set up the shadow memory
             shadow_cur = xc.shadow_mem_control(self.domid, shadow / 1024)
@@ -2331,6 +2551,31 @@ class XendDomainInfo:
         finally:
             self.state_updated.release()
 
+    def waitForSuspend(self):
+        """Wait for the guest to respond to a suspend request by
+        shutting down.  If the guest hasn't re-written control/shutdown
+        after a certain amount of time, it's obviously not listening and
+        won't suspend, so we give up.  HVM guests with no PV drivers
+        should already be shut down.
+        """
+        state = "suspend"
+        nr_tries = 60
+
+        self.state_updated.acquire()
+        try:
+            while self._stateGet() in (DOM_STATE_RUNNING,DOM_STATE_PAUSED):
+                self.state_updated.wait(1.0)
+                if state == "suspend":
+                    if nr_tries == 0:
+                        msg = ('Timeout waiting for domain %s to suspend'
+                            % self.domid)
+                        self._writeDom('control/shutdown', '')
+                        raise XendError(msg)
+                    state = self.readDom('control/shutdown')
+                    nr_tries -= 1
+        finally:
+            self.state_updated.release()
+
     #
     # TODO: recategorise - called from XendCheckpoint
     # 
@@ -2385,11 +2630,10 @@ class XendDomainInfo:
             time.sleep(2)
         for paths in plist:
             if paths.find('backend') != -1:
-                from xen.xend.server import DevController
                 # Modify online status /before/ updating state (latter is watched by
                 # drivers, so this ordering avoids a race).
                 xstransact.Write(paths, 'online', "0")
-                xstransact.Write(paths, 'state', str(DevController.xenbusState['Closing']))
+                xstransact.Write(paths, 'state', str(xenbusState['Closing']))
             # force
             xstransact.Remove(paths)
 
@@ -2653,7 +2897,7 @@ class XendDomainInfo:
             # The domain might already have some shadow memory
             overhead_kb -= xc.shadow_mem_control(self.domid) * 1024
         if overhead_kb > 0:
-            balloon.free(overhead_kb)
+            balloon.free(overhead_kb, self)
 
     def _unwatchVm(self):
         """Remove the watch on the VM path, if any.  Idempotent.  Nothrow
@@ -2676,7 +2920,9 @@ class XendDomainInfo:
         while True:
             test = 0
             diff = time.time() - start
-            for i in self.getDeviceController('vbd').deviceIDs():
+            vbds = self.getDeviceController('vbd').deviceIDs()
+            taps = self.getDeviceController('tap').deviceIDs()
+            for i in vbds + taps:
                 test = 1
                 log.info("Dev %s still active, looping...", i)
                 time.sleep(0.1)
@@ -2983,64 +3229,69 @@ class XendDomainInfo:
         if not xspol:
             xspol = poladmin.get_policy_by_name(policy)
 
-        if state in [ DOM_STATE_RUNNING, DOM_STATE_PAUSED ]:
-            #if domain is running or paused try to relabel in hypervisor
-            if not xspol:
-                return (-xsconstants.XSERR_POLICY_NOT_LOADED, "", "", 0)
-
-            if typ != xspol.get_type_name() or \
-               policy != xspol.get_name():
-                return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
+        try:
+            xen.xend.XendDomain.instance().policy_lock.acquire_writer()
 
-            if typ == xsconstants.ACM_POLICY_ID:
-                new_ssidref = xspol.vmlabel_to_ssidref(label)
-                if new_ssidref == xsconstants.INVALID_SSIDREF:
-                    return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
+            if state in [ DOM_STATE_RUNNING, DOM_STATE_PAUSED ]:
+                #if domain is running or paused try to relabel in hypervisor
+                if not xspol:
+                    return (-xsconstants.XSERR_POLICY_NOT_LOADED, "", "", 0)
 
-                # Check that all used resources are accessible under the
-                # new label
-                if not is_policy_update and \
-                   not security.resources_compatible_with_vmlabel(xspol,
-                          self, label):
+                if typ != xspol.get_type_name() or \
+                   policy != xspol.get_name():
                     return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
 
-                #Check label against expected one. Can only do this
-                # if the policy hasn't changed underneath in the meantime
-                if xspol_old == None:
-                    old_label = self.get_security_label()
-                    if old_label != old_seclab:
-                        log.info("old_label != old_seclab: %s != %s" %
-                                 (old_label, old_seclab))
+                if typ == xsconstants.ACM_POLICY_ID:
+                    new_ssidref = xspol.vmlabel_to_ssidref(label)
+                    if new_ssidref == xsconstants.INVALID_SSIDREF:
                         return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
 
-                # relabel domain in the hypervisor
-                rc, errors = security.relabel_domains([[domid, new_ssidref]])
-                log.info("rc from relabeling in HV: %d" % rc)
-            else:
-                return (-xsconstants.XSERR_POLICY_TYPE_UNSUPPORTED, "", "", 0)
+                    # Check that all used resources are accessible under the
+                    # new label
+                    if not is_policy_update and \
+                       not security.resources_compatible_with_vmlabel(xspol,
+                              self, label):
+                        return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
 
-        if rc == 0:
-            # HALTED, RUNNING or PAUSED
-            if domid == 0:
-                if xspol:
-                    self.info['security_label'] = seclab
-                    ssidref = poladmin.set_domain0_bootlabel(xspol, label)
+                    #Check label against expected one. Can only do this
+                    # if the policy hasn't changed underneath in the meantime
+                    if xspol_old == None:
+                        old_label = self.get_security_label()
+                        if old_label != old_seclab:
+                            log.info("old_label != old_seclab: %s != %s" %
+                                     (old_label, old_seclab))
+                            return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
+
+                    # relabel domain in the hypervisor
+                    rc, errors = security.relabel_domains([[domid, new_ssidref]])
+                    log.info("rc from relabeling in HV: %d" % rc)
                 else:
-                    return (-xsconstants.XSERR_POLICY_NOT_LOADED, "", "", 0)
-            else:
-                if self.info.has_key('security_label'):
-                    old_label = self.info['security_label']
-                    # Check label against expected one, unless wildcard
-                    if old_label != old_seclab:
-                        return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
+                    return (-xsconstants.XSERR_POLICY_TYPE_UNSUPPORTED, "", "", 0)
+
+            if rc == 0:
+                # HALTED, RUNNING or PAUSED
+                if domid == 0:
+                    if xspol:
+                        self.info['security_label'] = seclab
+                        ssidref = poladmin.set_domain0_bootlabel(xspol, label)
+                    else:
+                        return (-xsconstants.XSERR_POLICY_NOT_LOADED, "", "", 0)
+                else:
+                    if self.info.has_key('security_label'):
+                        old_label = self.info['security_label']
+                        # Check label against expected one, unless wildcard
+                        if old_label != old_seclab:
+                            return (-xsconstants.XSERR_BAD_LABEL, "", "", 0)
 
-                self.info['security_label'] = seclab
+                    self.info['security_label'] = seclab
 
-                try:
-                    xen.xend.XendDomain.instance().managed_config_save(self)
-                except:
-                    pass
-        return (rc, errors, old_label, new_ssidref)
+                    try:
+                        xen.xend.XendDomain.instance().managed_config_save(self)
+                    except:
+                        pass
+            return (rc, errors, old_label, new_ssidref)
+        finally:
+            xen.xend.XendDomain.instance().policy_lock.release()
 
     def get_on_shutdown(self):
         after_shutdown = self.info.get('actions_after_shutdown')
@@ -3379,6 +3630,11 @@ class XendDomainInfo:
 
         dpci_uuid = uuid.createString()
 
+        dpci_opts = []
+        opts_dict = xenapi_pci.get('options')
+        for k in opts_dict.keys():
+            dpci_opts.append([k, opts_dict[k]])
+
         # Convert xenapi to sxp
         ppci = XendAPIStore.get(xenapi_pci.get('PPCI'), 'PPCI')
 
@@ -3390,6 +3646,7 @@ class XendDomainInfo:
                     ['slot', '0x%02x' % ppci.get_slot()],
                     ['func', '0x%1x' % ppci.get_func()],
                     ['vslt', '0x%02x' % xenapi_pci.get('hotplug_slot')],
+                    ['opts', dpci_opts],
                     ['uuid', dpci_uuid]
                 ],
                 ['state', 'Initialising']
@@ -3444,14 +3701,15 @@ class XendDomainInfo:
                     ['p-devname', pscsi.get_dev_name()],
                     ['p-dev', pscsi.get_physical_HCTL()],
                     ['v-dev', xenapi_dscsi.get('virtual_HCTL')],
-                    ['state', 'Initialising'],
+                    ['state', xenbusState['Initialising']],
                     ['uuid', dscsi_uuid]
-                ]
+                ],
+                ['feature-host', 0]
             ]
 
         if self._stateGet() != XEN_API_VM_POWER_STATE_RUNNING:
 
-            cur_vscsi_sxp = self._getDeviceInfo_vscsi(devid, None)
+            cur_vscsi_sxp = self._getDeviceInfo_vscsi(devid)
 
             if cur_vscsi_sxp is None:
                 dev_uuid = self.info.device_add('vscsi', cfg_sxp = target_vscsi_sxp)
@@ -3459,7 +3717,7 @@ class XendDomainInfo:
                     raise XendError('Failed to create device')
 
             else:
-                new_vscsi_sxp = ['vscsi']
+                new_vscsi_sxp = ['vscsi', ['feature-host', 0]]
                 for existing_dev in sxp.children(cur_vscsi_sxp, 'dev'):
                     new_vscsi_sxp.append(existing_dev)
                 new_vscsi_sxp.append(sxp.child0(target_vscsi_sxp, 'dev'))
@@ -3549,11 +3807,11 @@ class XendDomainInfo:
         dscsi = XendAPIStore.get(dev_uuid, 'DSCSI')
         devid = dscsi.get_virtual_host()
         vHCTL = dscsi.get_virtual_HCTL()
-        cur_vscsi_sxp = self._getDeviceInfo_vscsi(devid, None)
+        cur_vscsi_sxp = self._getDeviceInfo_vscsi(devid)
         dev_uuid = sxp.child_value(cur_vscsi_sxp, 'uuid')
 
         target_dev = None
-        new_vscsi_sxp = ['vscsi']
+        new_vscsi_sxp = ['vscsi', ['feature-host', 0]]
         for dev in sxp.children(cur_vscsi_sxp, 'dev'):
             if vHCTL == sxp.child_value(dev, 'v-dev'):
                 target_dev = dev
@@ -3563,8 +3821,8 @@ class XendDomainInfo:
         if target_dev is None:
             raise XendError('Failed to destroy device')
 
-        target_dev.append(['state', 'Closing'])
-        target_vscsi_sxp = ['vscsi', target_dev]
+        target_dev.append(['state', xenbusState['Closing']])
+        target_vscsi_sxp = ['vscsi', target_dev, ['feature-host', 0]]
 
         if self._stateGet() != XEN_API_VM_POWER_STATE_RUNNING:
 
index 72db2974a96c2a81f1778b36dfce13010e8ce67b..61779fa77efe08c528c91d9531bc3b4436d31df6 100644 (file)
@@ -18,6 +18,7 @@
 
 import os
 import socket
+import time
 import xen.lowlevel.xc
 
 from xen.util import Brctl
@@ -145,6 +146,18 @@ class XendNode:
 
         self.srs = {}
 
+        self._init_networks()
+        self._init_PIFs()
+
+        self._init_SRs()
+        self._init_PBDs()
+
+        self._init_PPCIs()
+
+        self._init_PSCSIs()
+
+
+    def _init_networks(self):
         # Initialise networks
         # First configure ones off disk
         saved_networks = self.state_store.load_state('network')
@@ -157,7 +170,16 @@ class XendNode:
                 
         # Next discover any existing bridges and check
         # they are not already configured
-        bridges = Brctl.get_state().keys()
+
+        # 'tmpbridge' is a temporary bridge created by network-bridge script.
+        # Wait a couple of seconds for it to be renamed.
+        for i in xrange(20):
+            bridges = Brctl.get_state().keys()
+            if 'tmpbridge' in bridges:
+                time.sleep(0.1)
+            else:
+                break
+            
         configured_bridges = [XendAPIStore.get(
                                   network_uuid, "network")
                                       .get_name_label()
@@ -166,8 +188,10 @@ class XendNode:
                                 for bridge in bridges
                                 if bridge not in configured_bridges]
         for unconfigured_bridge in unconfigured_bridges:
-            XendNetwork.create_phy(unconfigured_bridge)
+            if unconfigured_bridge != 'tmpbridge':
+                XendNetwork.create_phy(unconfigured_bridge)
 
+    def _init_PIFs(self):
         # Initialise PIFs
         # First configure ones off disk
         saved_pifs = self.state_store.load_state('pif')
@@ -210,7 +234,8 @@ class XendNode:
                     log.debug("Cannot find network for bridge %s "
                               "when configuring PIF %s",
                               (bridge_name, name))     
-        
+
+    def _init_SRs(self):
         # initialise storage
         saved_srs = self.state_store.load_state('sr')
         if saved_srs:
@@ -229,6 +254,7 @@ class XendNode:
             qcow_sr_uuid = uuid.createString()
             self.srs[qcow_sr_uuid] = XendQCoWStorageRepo(qcow_sr_uuid)
 
+    def _init_PBDs(self):
         saved_pbds = self.state_store.load_state('pbd')
         if saved_pbds:
             for pbd_uuid, pbd_cfg in saved_pbds.items():
@@ -237,8 +263,7 @@ class XendNode:
                 except CreateUnspecifiedAttributeError:
                     log.warn("Error recreating PBD %s", pbd_uuid) 
 
-
-        # Initialise PPCIs
+    def _init_PPCIs(self):
         saved_ppcis = self.state_store.load_state('ppci')
         saved_ppci_table = {}
         if saved_ppcis:
@@ -271,7 +296,7 @@ class XendNode:
             ppci_uuid = saved_ppci_table.get(pci_dev.name, uuid.createString())
             XendPPCI(ppci_uuid, ppci_record)
 
-
+    def _init_PSCSIs(self):
         # Initialise PSCSIs
         saved_pscsis = self.state_store.load_state('pscsi')
         saved_pscsi_table = {}
@@ -290,6 +315,75 @@ class XendNode:
                 XendPSCSI(pscsi_uuid, pscsi_record)
 
 
+    def add_network(self, interface):
+        # TODO
+        log.debug("add_network(): Not implemented.")
+
+
+    def remove_network(self, interface):
+        # TODO
+        log.debug("remove_network(): Not implemented.")
+
+
+    def add_PPCI(self, pci_name):
+        # Update lspci info
+        PciUtil.create_lspci_info()
+
+        # Initialise the PPCI
+        saved_ppcis = self.state_store.load_state('ppci')
+        saved_ppci_table = {}
+        if saved_ppcis:
+            for ppci_uuid, ppci_record in saved_ppcis.items():
+                try:
+                    saved_ppci_table[ppci_record['name']] = ppci_uuid
+                except KeyError:
+                    pass
+
+        (domain, bus, slot, func) = PciUtil.parse_pci_name(pci_name)
+        pci_dev = PciUtil.PciDevice(domain, bus, slot, func)
+        ppci_record = {
+            'domain':                   pci_dev.domain,
+            'bus':                      pci_dev.bus,
+            'slot':                     pci_dev.slot,
+            'func':                     pci_dev.func,
+            'vendor_id':                pci_dev.vendor,
+            'vendor_name':              pci_dev.vendorname,
+            'device_id':                pci_dev.device,
+            'device_name':              pci_dev.devicename,
+            'revision_id':              pci_dev.revision,
+            'class_code':               pci_dev.classcode,
+            'class_name':               pci_dev.classname,
+            'subsystem_vendor_id':      pci_dev.subvendor,
+            'subsystem_vendor_name':    pci_dev.subvendorname,
+            'subsystem_id':             pci_dev.subdevice,
+            'subsystem_name':           pci_dev.subdevicename,
+            'driver':                   pci_dev.driver
+            }
+        # If saved uuid exists, use it. Otherwise create one.
+        ppci_uuid = saved_ppci_table.get(pci_dev.name, uuid.createString())
+        XendPPCI(ppci_uuid, ppci_record)
+
+
+    def remove_PPCI(self, pci_name):
+        # Update lspci info
+        PciUtil.create_lspci_info()
+
+        # Remove the PPCI
+        (domain, bus, slot, func) = PciUtil.parse_pci_name(pci_name)
+        ppci_ref = XendPPCI.get_by_sbdf(domain, bus, slot, func)
+        XendAPIStore.get(ppci_ref, "PPCI").destroy()
+
+
+    def add_PSCSI(self):
+        # TODO
+        log.debug("add_PSCSI(): Not implemented.")
+
+
+    def remove_PSCSI(self):
+        # TODO
+        log.debug("remove_PSCSI(): Not implemented.")
+
+
 ##    def network_destroy(self, net_uuid):
  ##       del self.networks[net_uuid]
   ##      self.save_networks()
index 350f20736c878f50a982418143f738684061fdc6..b507f9b3b710627051f3ea25f91b6fae51e34ffc 100644 (file)
@@ -75,6 +75,9 @@ class XendOptions:
     """Default for the flag indicating whether xend should run a ssl relocation server."""
     xend_relocation_ssl_server_default = 'no'
 
+    """Default for the flag indicating whether xend should run a udev event server."""
+    xend_udev_event_server_default = 'no'
+
     """Default interface address the xend relocation server listens at. """
     xend_relocation_address_default = ''
 
@@ -216,6 +219,10 @@ class XendOptions:
     def get_xend_relocation_server_ssl_cert_file(self):
         return self.get_config_string("xend-relocation-server-ssl-cert-file")
 
+    def get_xend_udev_event_server(self):
+        return self.get_config_bool("xend-udev-event-server",
+                                    self.xend_udev_event_server_default)
+
     def get_xend_port(self):
         """Get the port xend listens at for its HTTP interface.
         """
index 9ea61218d9fe2ec3549f84f5fba176831fed7ca3..e6248b714a9c641ee2b036105a1a59b76825b00a 100644 (file)
@@ -20,6 +20,8 @@ from xen.xend.XendBase import XendBase
 from xen.xend.XendBase import XendAPIStore
 from xen.xend import uuid as genuuid
 
+from xen.util.pci import parse_hex
+
 class XendPPCI(XendBase):
     """Representation of a physical PCI device."""
 
@@ -72,10 +74,10 @@ class XendPPCI(XendBase):
  
     def get_by_sbdf(self, domain, bus, slot, func):
         for ppci in XendAPIStore.get_all("PPCI"):
-            if ppci.get_domain() == int(domain, 16) and \
-               ppci.get_bus() == int(bus, 16) and \
-               ppci.get_slot() == int(slot, 16) and \
-               ppci.get_func() == int(func, 16):
+            if ppci.get_domain() == parse_hex(domain) and \
+               ppci.get_bus() == parse_hex(bus) and \
+               ppci.get_slot() == parse_hex(slot) and \
+               ppci.get_func() == parse_hex(func):
                 return ppci.get_uuid()
         return None
 
index 828e1bc024da9c383819728d3fde81349d943519..b31398c7450e513081c1c7e3de006953027dc8f3 100644 (file)
@@ -67,7 +67,7 @@ def get_dom0_target_alloc():
         raise VmError('Failed to query target memory allocation of dom0.')
     return kb
 
-def free(need_mem):
+def free(need_mem, dominfo):
     """Balloon out memory from the privileged domain so that there is the
     specified required amount (in KiB) free.
     """
@@ -122,6 +122,40 @@ def free(need_mem):
         if need_mem >= max_free_mem:
             retries = rlimit
 
+        # Check whether the current machine is a NUMA system and the newly
+        # created HVM guest has all its vcpus in the same node; if all the
+        # conditions above are met, we will wait until all the pages
+        # in the scrub list are freed (if the waiting time goes beyond 20s,
+        # we will stop waiting for it.)
+        if physinfo['nr_nodes'] > 1 and retries == 0:
+            oldnode = -1
+            waitscrub = 1
+            vcpus = dominfo.info['cpus'][0]
+            for vcpu in vcpus:
+                nodenum = 0
+                for node in physinfo['node_to_cpu']:
+                    for cpu in node:
+                        if vcpu == cpu:
+                            if oldnode == -1:
+                                oldnode = nodenum
+                            elif oldnode != nodenum:
+                                waitscrub = 0
+                    nodenum = nodenum + 1
+
+            if waitscrub == 1 and scrub_mem > 0:
+                log.debug("wait for scrub %s", scrub_mem)
+                while scrub_mem > 0 and retries < rlimit:
+                    time.sleep(sleep_time)
+                    physinfo = xc.physinfo()
+                    free_mem = physinfo['free_memory']
+                    scrub_mem = physinfo['scrub_memory']
+                    retries += 1
+                    sleep_time += SLEEP_TIME_GROWTH
+                log.debug("scrub for %d times", retries)
+
+            retries = 0
+            sleep_time = SLEEP_TIME_GROWTH
+
         while retries < rlimit:
             physinfo = xc.physinfo()
             free_mem = physinfo['free_memory']
index e101665a150fae7af9bc4407fbc0f66387c0dd32..04689c3eee4c45eb9206c30546d61ed30c6bd187 100644 (file)
@@ -28,6 +28,7 @@ import sys
 import errno
 import glob
 import traceback
+import platform
 
 import xen.lowlevel.xc
 from xen.xend.XendConstants import *
@@ -40,6 +41,7 @@ from xen.xend import arch
 from xen.xend import XendOptions
 from xen.util import oshelp
 from xen.util import utils
+from xen.xend import osdep
 
 xc = xen.lowlevel.xc.xc()
 
@@ -226,23 +228,23 @@ class ImageHandler:
         if self.device_model is None:
             return
 
-        # If we use a device model, the pipes for communication between
-        # blktapctrl and ioemu must be present before the devices are 
-        # created (blktapctrl must access them for new block devices)
+        if platform.system() != 'SunOS':
+            # If we use a device model, the pipes for communication between
+            # blktapctrl and ioemu must be present before the devices are 
+            # created (blktapctrl must access them for new block devices)
 
-        # mkdir throws an exception if the path already exists
-        try:
-            os.mkdir('/var/run/tap', 0755)
-        except:
-            pass
+            try:
+                os.makedirs('/var/run/tap', 0755)
+            except:
+                pass
 
-        try:
-            os.mkfifo('/var/run/tap/qemu-read-%d' % domid, 0600)
-            os.mkfifo('/var/run/tap/qemu-write-%d' % domid, 0600)
-        except OSError, e:
-            log.warn('Could not create blktap pipes for domain %d' % domid)
-            log.exception(e)
-            pass
+            try:
+                os.mkfifo('/var/run/tap/qemu-read-%d' % domid, 0600)
+                os.mkfifo('/var/run/tap/qemu-write-%d' % domid, 0600)
+            except OSError, e:
+                log.warn('Could not create blktap pipes for domain %d' % domid)
+                log.exception(e)
+                pass
 
 
     # Return a list of cmd line args to the device models based on the
@@ -265,6 +267,10 @@ class ImageHandler:
             ret.append('-nographic')
             return ret
 
+        vram = str(vmConfig['platform'].get('videoram',4))
+        ret.append('-videoram')
+        ret.append(vram)
+
         vnc_config = {}
         has_vnc = int(vmConfig['platform'].get('vnc', 0)) != 0
         has_sdl = int(vmConfig['platform'].get('sdl', 0)) != 0
@@ -275,15 +281,16 @@ class ImageHandler:
             if dev_type == 'vfb':
                 if 'keymap' in dev_info:
                     keymap = dev_info.get('keymap',{})
-                vfb_type = dev_info.get('type', {})
-                if vfb_type == 'sdl':
+                if int(dev_info.get('vnc', 0)) != 0 :
+                    has_vnc = True
+                if int(dev_info.get('sdl', 0)) != 0 :
+                    has_sdl = True
+                if has_sdl:
                     self.display = dev_info.get('display', {})
                     self.xauthority = dev_info.get('xauthority', {})
                     opengl = int(dev_info.get('opengl', opengl))
-                    has_sdl = True
-                else:
+                if has_vnc:
                     vnc_config = dev_info.get('other_config', {})
-                    has_vnc = True
                 break
 
         if keymap:
@@ -331,11 +338,12 @@ class ImageHandler:
             if int(vnc_config.get('vncunused', 1)) != 0:
                 ret.append('-vncunused')
 
-        elif has_sdl:
-            # SDL is default in QEMU.
+        if has_sdl:
+            ret.append('-sdl')
             if int(vmConfig['platform'].get('opengl', opengl)) != 1 :
                 ret.append('-disable-opengl')
-        else:
+
+        if not has_sdl and not has_vnc :
             ret.append('-nographic')
 
         if int(vmConfig['platform'].get('monitor', 0)) != 0:
@@ -368,8 +376,6 @@ class ImageHandler:
             env['DISPLAY'] = self.display
         if self.xauthority:
             env['XAUTHORITY'] = self.xauthority
-        if self.vncconsole:
-            args = args + ([ "-vncviewer" ])
         unique_id = "%i-%i" % (self.vm.getDomid(), time.time())
         sentinel_path = sentinel_path_prefix + unique_id
         sentinel_path_fifo = sentinel_path + '.fifo'
@@ -403,9 +409,12 @@ class ImageHandler:
         logfd = os.open(self.logfile, logfile_mode)
         
         sys.stderr.flush()
+        contract = osdep.prefork("%s:%d" %
+                                 (self.vm.getName(), self.vm.getDomid()))
         pid = os.fork()
         if pid == 0: #child
             try:
+                osdep.postfork(contract)
                 os.dup2(null, 0)
                 os.dup2(logfd, 1)
                 os.dup2(logfd, 2)
@@ -422,6 +431,7 @@ class ImageHandler:
             except:
                 os._exit(127)
         else:
+            osdep.postfork(contract, abandon=True)
             self.pid = pid
             os.close(null)
             os.close(logfd)
@@ -478,11 +488,7 @@ class ImageHandler:
 
     def _dmfailed(self, message):
         log.warning("domain %s: %s", self.vm.getName(), message)
-        # ideally we would like to forcibly crash the domain with
-        # something like
-        #    xc.domain_shutdown(self.vm.getDomid(), DOMAIN_CRASH)
-        # but this can easily lead to very rapid restart loops against
-        # which we currently have no protection
+        xc.domain_shutdown(self.vm.getDomid(), DOMAIN_CRASH)
 
     def recreate(self):
         if self.device_model is None:
@@ -554,24 +560,30 @@ class ImageHandler:
                     os.kill(self.pid, signal.SIGHUP)
                 except OSError, exn:
                     log.exception(exn)
-                try:
-                    # Try to reap the child every 100ms for 10s. Then SIGKILL it.
-                    for i in xrange(100):
+                # Try to reap the child every 100ms for 10s. Then SIGKILL it.
+                for i in xrange(100):
+                    try:
                         (p, rv) = os.waitpid(self.pid, os.WNOHANG)
                         if p == self.pid:
                             break
-                        time.sleep(0.1)
-                    else:
-                        log.warning("DeviceModel %d took more than 10s "
-                                    "to terminate: sending SIGKILL" % self.pid)
+                    except OSError:
+                        # This is expected if Xend has been restarted within
+                        # the life of this domain.  In this case, we can kill
+                        # the process, but we can't wait for it because it's
+                        # not our child. We continue this loop, and after it is
+                        # terminated make really sure the process is going away
+                        # (SIGKILL).
+                        pass
+                    time.sleep(0.1)
+                else:
+                    log.warning("DeviceModel %d took more than 10s "
+                                "to terminate: sending SIGKILL" % self.pid)
+                    try:
                         os.kill(self.pid, signal.SIGKILL)
                         os.waitpid(self.pid, 0)
-                except OSError, exn:
-                    # This is expected if Xend has been restarted within the
-                    # life of this domain.  In this case, we can kill the process,
-                    # but we can't wait for it because it's not our child.
-                    # We just make really sure it's going away (SIGKILL) first.
-                    os.kill(self.pid, signal.SIGKILL)
+                    except OSError:
+                        # This happens if the process doesn't exist.
+                        pass
                 state = xstransact.Remove("/local/domain/0/device-model/%i"
                                           % self.vm.getDomid())
             finally:
@@ -629,6 +641,8 @@ class LinuxImageHandler(ImageHandler):
 
     def configure(self, vmConfig):
         ImageHandler.configure(self, vmConfig)
+        self.vramsize = int(vmConfig['platform'].get('videoram',4)) * 1024
+        self.is_stubdom = (self.kernel.find('stubdom') >= 0)
 
     def buildDomain(self):
         store_evtchn = self.vm.getStorePort()
@@ -660,6 +674,17 @@ class LinuxImageHandler(ImageHandler):
                               flags          = self.flags,
                               vhpt           = self.vhpt)
 
+    def getRequiredAvailableMemory(self, mem_kb):
+        if self.is_stubdom :
+            mem_kb += self.vramsize
+        return mem_kb
+
+    def getRequiredInitialReservation(self):
+        return self.vm.getMemoryTarget()
+
+    def getRequiredMaximumReservation(self):
+        return self.vm.getMemoryMaximum()
+
     def parseDeviceModelArgs(self, vmConfig):
         ret = ImageHandler.parseDeviceModelArgs(self, vmConfig)
         # Equivalent to old xenconsoled behaviour. Should make
@@ -691,19 +716,32 @@ class HVMImageHandler(ImageHandler):
         if 'hvm' not in info['xen_caps']:
             raise HVMRequired()
 
+        xen_platform_pci = int(vmConfig['platform'].get('xen_platform_pci',1))
         rtc_timeoffset = vmConfig['platform'].get('rtc_timeoffset')
 
+        if not self.display :
+            self.display = ''
         self.vm.storeVm(("image/dmargs", " ".join(self.dmargs)),
                         ("image/device-model", self.device_model),
                         ("image/display", self.display))
         self.vm.permissionsVm("image/dmargs", { 'dom': self.vm.getDomid(), 'read': True } )
+
+        if xen_platform_pci == 0:
+            disable_pf = 1
+            log.info("No need to create platform device.[domid:%d]", self.vm.getDomid())
+        else:
+            disable_pf = 0
+            log.info("Need to create platform device.[domid:%d]", self.vm.getDomid())
+
+        xstransact.Store("/local/domain/0/device-model/%i"%self.vm.getDomid(),
+                                      ('disable_pf', disable_pf))
         self.vm.storeVm(("rtc/timeoffset", rtc_timeoffset))
         self.vm.permissionsVm("rtc/timeoffset", { 'dom': self.vm.getDomid(), 'read': True } )
 
         self.apic = int(vmConfig['platform'].get('apic', 0))
         self.acpi = int(vmConfig['platform'].get('acpi', 0))
         self.guest_os_type = vmConfig['platform'].get('guest_os_type')
-           
+
 
     # Return a list of cmd line args to the device models based on the
     # xm config file
@@ -799,19 +837,22 @@ class HVMImageHandler(ImageHandler):
     def buildDomain(self):
         store_evtchn = self.vm.getStorePort()
 
+        memmax_mb = self.getRequiredMaximumReservation() / 1024
         mem_mb = self.getRequiredInitialReservation() / 1024
 
         log.debug("domid          = %d", self.vm.getDomid())
         log.debug("image          = %s", self.loader)
         log.debug("store_evtchn   = %d", store_evtchn)
-        log.debug("memsize        = %d", mem_mb)
+        log.debug("memsize        = %d", memmax_mb)
+        log.debug("target         = %d", mem_mb)
         log.debug("vcpus          = %d", self.vm.getVCpuCount())
         log.debug("acpi           = %d", self.acpi)
         log.debug("apic           = %d", self.apic)
 
         rc = xc.hvm_build(domid          = self.vm.getDomid(),
                           image          = self.loader,
-                          memsize        = mem_mb,
+                          memsize        = memmax_mb,
+                          target         = mem_mb,
                           vcpus          = self.vm.getVCpuCount(),
                           acpi           = self.acpi,
                           apic           = self.apic)
@@ -830,6 +871,7 @@ class IA64_HVM_ImageHandler(HVMImageHandler):
     def configure(self, vmConfig):
         HVMImageHandler.configure(self, vmConfig)
         self.vhpt = int(vmConfig['platform'].get('vhpt',  0))
+        self.vramsize = int(vmConfig['platform'].get('videoram',4)) * 1024
 
     def buildDomain(self):
         xc.nvram_init(self.vm.getName(), self.vm.getDomid())
@@ -844,8 +886,8 @@ class IA64_HVM_ImageHandler(HVMImageHandler):
         # buffer io page, buffer pio page and memmap info page
         extra_pages = 1024 + 5
         mem_kb += extra_pages * page_kb
-        # Add 8 MiB overhead for QEMU's video RAM.
-        return mem_kb + 8192
+        mem_kb += self.vramsize
+        return mem_kb
 
     def getRequiredInitialReservation(self):
         return self.vm.getMemoryTarget()
@@ -879,6 +921,7 @@ class X86_HVM_ImageHandler(HVMImageHandler):
     def configure(self, vmConfig):
         HVMImageHandler.configure(self, vmConfig)
         self.pae = int(vmConfig['platform'].get('pae',  0))
+        self.vramsize = int(vmConfig['platform'].get('videoram',4)) * 1024
 
     def buildDomain(self):
         xc.hvm_set_param(self.vm.getDomid(), HVM_PARAM_PAE_ENABLED, self.pae)
@@ -887,8 +930,7 @@ class X86_HVM_ImageHandler(HVMImageHandler):
         return rc
 
     def getRequiredAvailableMemory(self, mem_kb):
-        # Add 8 MiB overhead for QEMU's video RAM.
-        return mem_kb + 8192
+        return mem_kb + self.vramsize
 
     def getRequiredInitialReservation(self):
         return self.vm.getMemoryTarget()
index a026c85277fa54ce2d910ec6bfd92d9a210e8c6f..024eab216f6fa586c303b04e5bc4b8800b306541 100644 (file)
@@ -18,6 +18,7 @@
 # Use is subject to license terms.
 
 import os
+import commands
 
 _scripts_dir = {
     "Linux": "/etc/xen/scripts",
@@ -38,7 +39,10 @@ _vif_script = {
     "SunOS": "vif-vnic"
 }
 
-def _linux_balloon_stat(label):
+PROC_XEN_BALLOON = '/proc/xen/balloon'
+SYSFS_XEN_MEMORY = '/sys/devices/system/xen_memory/xen_memory0'
+
+def _linux_balloon_stat_proc(label):
     """Returns the value for the named label, or None if an error occurs."""
 
     xend2linux_labels = { 'current'      : 'Current allocation',
@@ -47,7 +51,6 @@ def _linux_balloon_stat(label):
                           'high-balloon' : 'High-mem balloon',
                           'limit'        : 'Xen hard limit' }
 
-    PROC_XEN_BALLOON = '/proc/xen/balloon'
     f = file(PROC_XEN_BALLOON, 'r')
     try:
         for line in f:
@@ -62,6 +65,29 @@ def _linux_balloon_stat(label):
     finally:
         f.close()
 
+def _linux_balloon_stat_sysfs(label):
+    sysfiles = { 'target'       : 'target_kb',
+                 'current'      : 'info/current_kb',
+                 'low-balloon'  : 'info/low_kb',
+                 'high-balloon' : 'info/high_kb',
+                 'limit'        : 'info/hard_limit_kb' }
+
+    name = os.path.join(SYSFS_XEN_MEMORY, sysfiles[label])
+    f = file(name, 'r')
+    val = f.read().strip()
+    f.close()
+    if val.isdigit():
+        return int(val)
+    return None
+
+def _linux_balloon_stat(label):
+    if os.access(PROC_XEN_BALLOON, os.F_OK):
+        return _linux_balloon_stat_proc(label)
+    elif os.access(SYSFS_XEN_MEMORY, os.F_OK):
+        return _linux_balloon_stat_sysfs(label)
+
+    return None
+
 def _solaris_balloon_stat(label):
     """Returns the value for the named label, or None if an error occurs."""
 
@@ -117,7 +143,79 @@ def _linux_get_cpuinfo():
     finally:
         f.close()
 
+def _solaris_get_cpuinfo():
+    cpuinfo = {}
+
+    # call kstat to extrace specific cpu_info output
+    cmd = "/usr/bin/kstat -p -c misc -m cpu_info"
+    kstatoutput = commands.getoutput (cmd)
+
+    # walk each line
+    for kstatline in kstatoutput.split('\n'):
+
+        # split the line on 
+        # module:cpu #:module#:name value
+        (module, cpunum, combo, namevalue) = kstatline.split (":")
+
+        # check to see if this cpunum is already a key.  If not,
+        # initialize an empty hash table
+        if not cpuinfo.has_key (int(cpunum)):
+            cpuinfo[int(cpunum)] = {}
+
+        # split the namevalue output on whitespace
+        data = namevalue.split()
+
+        # the key will be data[0]
+        key = data[0]
+
+        # check the length of the data list.  If it's larger than
+        # 2, join the rest of the list together with a space.
+        # Otherwise, value is just data[1]
+        if len (data) > 2:
+            value = ' '.join (data[1:])
+        else:
+            value = data[1]
+
+        # add this key/value pair to the cpuhash
+        cpuinfo[int(cpunum)][key] = value
+    
+    # Translate Solaris tokens into what Xend expects
+    for key in cpuinfo.keys():
+        cpuinfo[key]["flags"] = ""
+        cpuinfo[key]["model name"] = cpuinfo[key]["brand"]
+        cpuinfo[key]["cpu MHz"] = cpuinfo[key]["clock_MHz"]
+
+    # return the hash table
+    return cpuinfo
+
 _get_cpuinfo = {
+    "SunOS": _solaris_get_cpuinfo
+}
+
+def _default_prefork(name):
+    pass
+
+def _default_postfork(ct, abandon=False):
+    pass
+
+# call this for long-running processes that should survive a xend
+# restart
+def _solaris_prefork(name):
+    from xen.lowlevel import process
+    return process.activate(name)
+
+def _solaris_postfork(ct, abandon=False):
+    from xen.lowlevel import process
+    process.clear(ct)
+    if abandon:
+        process.abandon_latest()
+
+_get_prefork = {
+    "SunOS": _solaris_prefork
+}
+
+_get_postfork = {
+    "SunOS": _solaris_postfork
 }
 
 def _get(var, default=None):
@@ -129,3 +227,5 @@ pygrub_path = _get(_pygrub_path, "/usr/bin/pygrub")
 vif_script = _get(_vif_script, "vif-bridge")
 lookup_balloon_stat = _get(_balloon_stat, _linux_balloon_stat)
 get_cpuinfo = _get(_get_cpuinfo, _linux_get_cpuinfo)
+prefork = _get(_get_prefork, _default_prefork)
+postfork = _get(_get_postfork, _default_postfork)
index e143d36b204018ad8ea67bd00b2c5acbaf1cb0f9..36c1d0688e5dde2a9161fa995de2b5b50d59301a 100644 (file)
@@ -15,7 +15,8 @@ blktap_disk_types = [
     'qcow',
     'qcow2',
 
-    'ioemu'
+    'ioemu',
+    'tapdisk',
     ]
 
 class BlktapController(BlkifController):
diff --git a/tools/python/xen/xend/server/DevConstants.py b/tools/python/xen/xend/server/DevConstants.py
new file mode 100644 (file)
index 0000000..ba7abfc
--- /dev/null
@@ -0,0 +1,45 @@
+#============================================================================
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#============================================================================
+# Copyright (C) 2004, 2005 Mike Wray <mike.wray@hp.com>
+# Copyright (C) 2005 XenSource Ltd
+#============================================================================
+
+DEVICE_CREATE_TIMEOUT  = 100
+DEVICE_DESTROY_TIMEOUT = 100
+HOTPLUG_STATUS_NODE = "hotplug-status"
+HOTPLUG_ERROR_NODE  = "hotplug-error"
+HOTPLUG_STATUS_ERROR = "error"
+HOTPLUG_STATUS_BUSY  = "busy"
+
+Connected    = 1
+Error        = 2
+Missing      = 3
+Timeout      = 4
+Busy         = 5
+Disconnected = 6
+
+xenbusState = {
+    'Unknown'       : 0,
+    'Initialising'  : 1,
+    'InitWait'      : 2,
+    'Initialised'   : 3,
+    'Connected'     : 4,
+    'Closing'       : 5,
+    'Closed'        : 6,
+    'Reconfiguring' : 7,
+    'Reconfigured'  : 8,
+    }
+xenbusState.update(dict(zip(xenbusState.values(), xenbusState.keys())))
+
index 022ed1a5ab287b132f41c0e6d8a496b70f0fe5cb..6c2bb09ca638419681591e3c7b1ebdb9a58a802c 100644 (file)
@@ -23,42 +23,15 @@ from xen.xend import sxp, XendOptions
 from xen.xend.XendError import VmError
 from xen.xend.XendLogging import log
 import xen.xend.XendConfig
+from xen.xend.server.DevConstants import *
 
 from xen.xend.xenstore.xstransact import xstransact, complete
 from xen.xend.xenstore.xswatch import xswatch
 
 import os
 
-DEVICE_CREATE_TIMEOUT  = 100
-DEVICE_DESTROY_TIMEOUT = 100
-HOTPLUG_STATUS_NODE = "hotplug-status"
-HOTPLUG_ERROR_NODE  = "hotplug-error"
-HOTPLUG_STATUS_ERROR = "error"
-HOTPLUG_STATUS_BUSY  = "busy"
-
-Connected    = 1
-Error        = 2
-Missing      = 3
-Timeout      = 4
-Busy         = 5
-Disconnected = 6
-
-xenbusState = {
-    'Unknown'      : 0,
-    'Initialising' : 1,
-    'InitWait'     : 2,
-    'Initialised'  : 3,
-    'Connected'    : 4,
-    'Closing'      : 5,
-    'Closed'       : 6,
-    'Reconfiguring': 7,
-    'Reconfigured' : 8,
-    }
-
 xoptions = XendOptions.instance()
 
-xenbusState.update(dict(zip(xenbusState.values(), xenbusState.keys())))
-
 
 class DevController:
     """Abstract base class for a device controller.  Device controllers create
@@ -262,8 +235,8 @@ class DevController:
                 xstransact.Remove(backpath)
             xstransact.Remove(frontpath)
 
-        # xstransact.Remove(self.devicePath()) ?? Below is the same ?
-        self.vm._removeVm("device/%s/%d" % (self.deviceClass, dev))
+            # xstransact.Remove(self.devicePath()) ?? Below is the same ?
+            self.vm._removeVm("device/%s/%d" % (self.deviceClass, dev))
 
     def configurations(self, transaction = None):
         return map(lambda x: self.configuration(x, transaction), self.deviceIDs(transaction))
@@ -569,7 +542,7 @@ class DevController:
             xswatch(statusPath, hotplugStatusCallback, ev, result)
             ev.wait(DEVICE_CREATE_TIMEOUT)
             err = xstransact.Read(statusPath, HOTPLUG_ERROR_NODE)
-            if result['status'] != 'Connected':
+            if result['status'] != Connected:
                 return (result['status'], err)
             
         backpath = self.readVm(devid, "backend")
index 225b901168088cef6addf1c79d31fe00e0a44fe3..0a330f15b047b0f46e2585a21b33a11e49a7ef7e 100644 (file)
@@ -24,6 +24,7 @@ from xen.xend import osdep
 from xen.util import mkdir
 
 import relocate
+import udevevent
 import SrvServer
 from params import *
 
@@ -336,6 +337,7 @@ class Daemon:
             del xc
 
             relocate.listenRelocation()
+            udevevent.listenUdevEvent()
             servers = SrvServer.create()
             servers.start(status)
             del servers
index 531b02e5cf79771e813c863f28f2435463aab01d..b76c7b365235b79e9c5145990642591c3e32d61b 100644 (file)
@@ -104,7 +104,8 @@ class SrvDomain(SrvDir):
                     [['dom',         'int'],
                      ['file',        'str'],
                      ['live',        'int'],
-                     ['crash',       'int']])
+                     ['crash',       'int'],
+                     ['reset',       'int']])
         return fn(req.args, {'dom': self.dom.domid})
 
     def op_migrate(self, op, req):
index 28ddf5f95db85d120caa5699f93cbbadd4c69bd0..7c4cb8b6bb8c7db2fc680a87d57a527b76f0b819 100644 (file)
@@ -18,6 +18,7 @@
 
 import re
 import string
+import os
 
 from xen.util import blkif
 import xen.util.xsm.xsm as security
@@ -35,6 +36,13 @@ class BlkifController(DevController):
         """
         DevController.__init__(self, vm)
 
+    def _isValidProtocol(self, protocol):
+        if protocol in ('phy', 'file', 'tap'):
+            return True
+
+        return os.access('/etc/xen/scripts/block-%s' % protocol, os.X_OK)
+
+
     def getDeviceDetails(self, config):
         """@see DevController.getDeviceDetails"""
         uname = config.get('uname', '')
@@ -56,10 +64,8 @@ class BlkifController(DevController):
         else:
             try:
                 (typ, params) = string.split(uname, ':', 1)
-                if typ not in ('phy', 'file', 'tap'):
-                    raise VmError(
-                        'Block device must have "phy", "file" or "tap" '
-                        'specified to type')
+                if not self._isValidProtocol(typ):
+                    raise VmError('Block device type "%s" is invalid.' % typ)
             except ValueError:
                 raise VmError(
                     'Block device must have physical details specified')
@@ -78,6 +84,10 @@ class BlkifController(DevController):
         if uuid:
             back['uuid'] = uuid
 
+        bootable = config.get('bootable', None)
+        if bootable != None:
+            back['bootable'] = str(bootable)
+
         if security.on() == xsconstants.XS_POLICY_USE:
             self.do_access_control(config, uname)
 
@@ -143,11 +153,12 @@ class BlkifController(DevController):
         config = DevController.getDeviceConfiguration(self, devid, transaction)
         if transaction is None:
             devinfo = self.readBackend(devid, 'dev', 'type', 'params', 'mode',
-                                       'uuid')
+                                       'uuid', 'bootable')
         else:
             devinfo = self.readBackendTxn(transaction, devid,
-                                          'dev', 'type', 'params', 'mode', 'uuid')
-        dev, typ, params, mode, uuid = devinfo
+                                          'dev', 'type', 'params', 'mode', 'uuid',
+                                          'bootable')
+        dev, typ, params, mode, uuid, bootable = devinfo
         
         if dev:
             if transaction is None:
@@ -165,6 +176,8 @@ class BlkifController(DevController):
             config['mode'] = mode
         if uuid:
             config['uuid'] = uuid
+        if bootable != None:
+            config['bootable'] = int(bootable)
 
         proto = self.readFrontend(devid, 'protocol')
         if proto:
index 3b1a2637361ba8d51fc3f6f70375a1f2455843a3..4c6e61246544a1ed27b0c0d400a3baf7b183c0d8 100644 (file)
@@ -45,9 +45,22 @@ def parse_ioport(val):
 
 class IOPortsController(DevController):
 
+    valid_cfg = ['to', 'from', 'uuid']
+
     def __init__(self, vm):
         DevController.__init__(self, vm)
 
+    def getDeviceConfiguration(self, devid, transaction = None):
+        result = DevController.getDeviceConfiguration(self, devid, transaction)
+        if transaction is None:
+            devinfo = self.readBackend(devid, *self.valid_cfg)
+        else:
+            devinfo = self.readBackendTxn(transaction, devid, *self.valid_cfg)
+        config = dict(zip(self.valid_cfg, devinfo))
+        config = dict([(key, val) for key, val in config.items()
+                       if val != None])
+        return config
+
     def getDeviceDetails(self, config):
         """@see DevController.getDeviceDetails"""
 
@@ -81,4 +94,9 @@ class IOPortsController(DevController):
                 'ioports: Failed to configure legacy i/o range: %s - %s' %
                 (io_from, io_to))
 
-        return (None, {}, {})
+        back = dict([(k, config[k]) for k in self.valid_cfg if k in config])
+        return (self.allocateDeviceID(), back, {})
+
+    def waitForDevice(self, devid):
+        # don't wait for hotplug
+        return
index db4b1deedd6ebf7d837fa7c9228ea3fec76107f0..ae0b1ff4b6ef48626a484d91be6a9b9af12099d9 100644 (file)
@@ -39,6 +39,18 @@ class IRQController(DevController):
     def __init__(self, vm):
         DevController.__init__(self, vm)
 
+    valid_cfg = ['irq', 'uuid']
+
+    def getDeviceConfiguration(self, devid, transaction = None):
+        result = DevController.getDeviceConfiguration(self, devid, transaction)
+        if transaction is None:
+            devinfo = self.readBackend(devid, *self.valid_cfg)
+        else:
+            devinfo = self.readBackendTxn(transaction, devid, *self.valid_cfg)
+        config = dict(zip(self.valid_cfg, devinfo))
+        config = dict([(key, val) for key, val in config.items()
+                       if val != None])
+        return config
 
     def getDeviceDetails(self, config):
         """@see DevController.getDeviceDetails"""
@@ -75,4 +87,9 @@ class IRQController(DevController):
         if rc < 0:
             raise VmError(
                 'irq: Failed to map irq %x' % (pirq))
-        return (None, {}, {})
+        back = dict([(k, config[k]) for k in self.valid_cfg if k in config])
+        return (self.allocateDeviceID(), back, {})
+
+    def waitForDevice(self, devid):
+        # don't wait for hotplug
+        return
index 469818e8c9c68ece787420651da8c6f5c84701e5..1f0d2a047ba41d3ad397f72d1e2f4f92726c0440 100644 (file)
@@ -24,7 +24,7 @@ import os
 import random
 import re
 
-from xen.xend import XendOptions
+from xen.xend import XendOptions, sxp
 from xen.xend.server.DevController import DevController
 from xen.xend.XendError import VmError
 from xen.xend.XendXSPolicyAdmin import XSPolicyAdminInstance
@@ -196,3 +196,23 @@ class NetifController(DevController):
                 result[x] = y
 
         return result
+
+    # match a VIF ID from xenstore, or a MAC address stored in the domain config
+    def convertToDeviceNumber(self, devid):
+        try:
+            return int(devid)
+        except ValueError:
+            if type(devid) is not str:
+                raise VmError("devid %s is wrong type" % str(devid))
+            try:
+                dev = devid.split('/')[-1]
+                return (int(dev))
+            except ValueError:
+                devs = [d for d in self.vm.info.all_devices_sxpr()
+                    if d[0] == 'vif']
+                for nr in range(len(devs)):
+                    dev_type, dev_info = devs[nr]
+                    if (sxp.child_value(dev_info, 'mac').lower() ==
+                        devid.lower()):
+                        return nr
+                raise VmError("unknown devid %s" % str(devid))
index 326e9d60e73512999d75bede53ec7cbb3940a53a..e6ba4bc695a8ba2d4fe45666670c469fd7e26edc 100644 (file)
@@ -24,8 +24,10 @@ from xen.xend import sxp
 from xen.xend import arch
 from xen.xend.XendError import VmError
 from xen.xend.XendLogging import log
+from xen.xend.XendConstants import *
 
-from xen.xend.server.DevController import DevController, xenbusState
+from xen.xend.server.DevController import DevController
+from xen.xend.server.DevConstants import xenbusState
 
 import xen.lowlevel.xc
 
@@ -34,6 +36,8 @@ import resource
 import re
 
 from xen.xend.server.pciquirk import *
+from xen.xend.xenstore.xstransact import xstransact
+from xen.xend.xenstore.xswatch import xswatch
 
 xc = xen.lowlevel.xc.xc()
 
@@ -57,6 +61,7 @@ def parse_hex(val):
 class PciController(DevController):
 
     def __init__(self, vm):
+        self.aerStateWatch = None
         DevController.__init__(self, vm)
 
 
@@ -70,6 +75,13 @@ class PciController(DevController):
             bus = parse_hex(pci_config.get('bus', 0))
             slot = parse_hex(pci_config.get('slot', 0))
             func = parse_hex(pci_config.get('func', 0))            
+            vslot = parse_hex(pci_config.get('vslot', 0))
+
+            opts = pci_config.get('opts', '')
+            if len(opts) > 0:
+                opts = map(lambda (x, y): x+'='+y, opts)
+                opts = reduce(lambda x, y: x+','+y, opts)
+                back['opts-%i' % pcidevid] = opts
 
             vslt = pci_config.get('vslt')
             if vslt is not None:
@@ -85,6 +97,11 @@ class PciController(DevController):
 
         back['num_devs']=str(pcidevid)
         back['uuid'] = config.get('uuid','')
+        if 'pci_msitranslate' in self.vm.info['platform']:
+            back['msitranslate']=str(self.vm.info['platform']['pci_msitranslate'])
+        if 'pci_power_mgmt' in self.vm.info['platform']:
+            back['power_mgmt']=str(self.vm.info['platform']['pci_power_mgmt'])
+
         return (0, back, {})
 
 
@@ -104,6 +121,9 @@ class PciController(DevController):
                 dev = back['dev-%i' % i]
                 state = states[i]
                 uuid = back['uuid-%i' %i]
+                opts = ''
+                if 'opts-%i' % i in back:
+                    opts = back['opts-%i' % i]
             except:
                 raise XendError('Error reading config')
 
@@ -125,6 +145,8 @@ class PciController(DevController):
                 self.writeBackend(devid, 'state-%i' % (num_olddevs + i),
                                   str(xenbusState['Initialising']))
                 self.writeBackend(devid, 'uuid-%i' % (num_olddevs + i), uuid)
+                if len(opts) > 0:
+                    self.writeBackend(devid, 'opts-%i' % (num_olddevs + i), opts)
                 self.writeBackend(devid, 'num_devs', str(num_olddevs + i + 1))
 
                 # Update vslots
@@ -198,7 +220,7 @@ class PciController(DevController):
                     try:
                         dev_dict['vslt'] = slot_list[i]
                     except IndexError:
-                        dev_dict['vslt'] = '0x0'
+                        dev_dict['vslt'] = AUTO_PHP_SLOT_STR
 
                 pci_devs.append(dev_dict)
 
@@ -332,12 +354,6 @@ class PciController(DevController):
             if rc<0:
                 raise VmError(('pci: failed to configure I/O memory on device '+
                             '%s - errno=%d')%(dev.name,rc))
-            rc = xc.physdev_map_pirq(domid = fe_domid,
-                                   index = dev.irq,
-                                   pirq  = dev.irq)
-            if rc < 0:
-                raise VmError(('pci: failed to map irq on device '+
-                            '%s - errno=%d')%(dev.name,rc))
 
         if dev.msix:
             for (start, size) in dev.msix_iomem:
@@ -352,6 +368,12 @@ class PciController(DevController):
                 if rc<0:
                     raise VmError(('pci: failed to remove msi-x iomem'))
 
+        rc = xc.physdev_map_pirq(domid = fe_domid,
+                               index = dev.irq,
+                               pirq  = dev.irq)
+        if rc < 0:
+            raise VmError(('pci: failed to map irq on device '+
+                        '%s - errno=%d')%(dev.name,rc))
         if dev.irq>0:
             log.debug('pci: enabling irq %d'%dev.irq)
             rc = xc.domain_irq_permission(domid =  fe_domid, pirq = dev.irq,
@@ -413,7 +435,7 @@ class PciController(DevController):
                 else:
                     # All devices behind the uppermost PCI/PCI-X bridge must be\
                     # co-assigned to the same guest.
-                    devs_str = dev.find_coassigned_devices(True)
+                    devs_str = dev.find_coassigned_pci_devices(True)
                     # Remove the element 0 which is a bridge
                     del devs_str[0]
 
@@ -430,9 +452,23 @@ class PciController(DevController):
 
         for (domain, bus, slot, func) in pci_dev_list:
             self.setupOneDevice(domain, bus, slot, func)
-
+        wPath = '/local/domain/0/backend/pci/%u/0/aerState' % (self.getDomid())
+        self.aerStateWatch = xswatch(wPath, self._handleAerStateWatch)
+        log.debug('pci: register aer watch %s', wPath)
         return
 
+    def _handleAerStateWatch(self, _):
+        log.debug('XendDomainInfo.handleAerStateWatch')
+        if self.getDomid() == 0:
+            raise XendError('Domain 0 cannot be shutdown')
+        readPath = '/local/domain/0/backend/pci/%u/0/aerState' % (self.getDomid())
+        action = xstransact.Read(readPath)
+        if action and action=='aerfail':
+            log.debug('shutdown domain because of aer handle error')
+            self.vm.shutdown('poweroff')
+        return True
+
+
     def cleanupOneDevice(self, domain, bus, slot, func):
         """ Detach I/O resources for device from frontend domain
         """
@@ -522,6 +558,9 @@ class PciController(DevController):
                 self.removeBackend(devid, 'vdev-%i' % i)
                 self.removeBackend(devid, 'state-%i' % i)
                 self.removeBackend(devid, 'uuid-%i' % i)
+                tmpopts = self.readBackend(devid, 'opts-%i' % i)
+                if tmpopts is not None:
+                    self.removeBackend(devid, 'opts-%i' % i)
             else:
                 if new_num_devs != i:
                     tmpdev = self.readBackend(devid, 'dev-%i' % i)
@@ -538,12 +577,31 @@ class PciController(DevController):
                     tmpuuid = self.readBackend(devid, 'uuid-%i' % i)
                     self.writeBackend(devid, 'uuid-%i' % new_num_devs, tmpuuid)
                     self.removeBackend(devid, 'uuid-%i' % i)
+                    tmpopts = self.readBackend(devid, 'opts-%i' % i)
+                    if tmpopts is not None:
+                        self.removeBackend(devid, 'opts-%i' % i)
                 new_num_devs = new_num_devs + 1
 
         self.writeBackend(devid, 'num_devs', str(new_num_devs))
 
         return new_num_devs
 
+    def destroyDevice(self, devid, force):
+        DevController.destroyDevice(self, devid, True)
+        log.debug('pci: unregister aer watch')
+        self.unwatchAerState()
+
+    def unwatchAerState(self):
+        """Remove the watch on the domain's aerState node, if any."""
+        try:
+            try:
+                if self.aerStateWatch:
+                    self.aerStateWatch.unwatch()
+            finally:
+                self.aerStateWatch = None
+        except:
+            log.exception("Unwatching aerState failed.")
+  
     def waitForBackend(self,devid):
         return (0, "ok - no hotplug")
 
index c8f8f6d392d8d6f5841da23b6586dcd079fa76e9..702c3b2a50fd4e8bd541dafeecb00317e1febaaa 100644 (file)
@@ -123,7 +123,8 @@ class PCIQuirk:
             log.info("Config file does not exist: %s" % PERMISSIVE_CONFIG_FILE)
             self.pci_perm_dev_config = ['xend-pci-perm-devs']
 
-        devices = child_at(child(pci_perm_dev_config, 'unconstrained_dev_ids'),0)
+        devices = child_at(child(self.pci_perm_dev_config,
+                                 'unconstrained_dev_ids'),0)
         if self.__matchPCIdev( devices ):
             log.debug("Permissive mode enabled for PCI device [%s]" %
                       self.devid)
index 007884b46aca6e2ef7e41b29ebd42ca2fe46047c..6aa6f83d405ce6f6585cc6d3be92353400db29db 100644 (file)
@@ -122,6 +122,8 @@ class RelocationProtocol(protocol.Protocol):
         if self.transport:
             self.send_reply(["ready", name])
             p2cread, p2cwrite = os.pipe()
+            from xen.util import oshelp
+            oshelp.fcntl_setfd_cloexec(p2cwrite, True)
             threading.Thread(target=connection.SSLSocketServerConnection.recv2fd,
                              args=(self.transport.sock, p2cwrite)).start()
             try:
diff --git a/tools/python/xen/xend/server/udevevent.py b/tools/python/xen/xend/server/udevevent.py
new file mode 100644 (file)
index 0000000..b7ce26f
--- /dev/null
@@ -0,0 +1,68 @@
+import socket
+
+from xen.web import protocol, unix
+
+from xen.xend.XendLogging import log
+from xen.xend import XendNode
+from xen.xend import XendOptions
+
+UDEV_EVENT_PATH = '\0/org/xen/xend/udev_event'
+
+class UdevEventProtocol(protocol.Protocol):
+
+    def __init__(self):
+        protocol.Protocol.__init__(self)
+
+    def dataReceived(self, data):
+        udev_event = {}
+        for entry in data.split('\0'):
+            try:
+                opt, val = entry.split("=")
+                udev_event[opt] = val
+            except (TypeError, ValueError):
+                pass
+        if udev_event.get('ACTION', None) is None:
+            log.warn("Invalid udev event received")
+            return
+
+        log.debug("udev event received: %s", udev_event)
+
+        self._process_event(udev_event)
+
+    def _process_event(self, udev_event):
+        try:
+            if (udev_event.get('SUBSYSTEM', None) == 'pci'):
+                pci_name = udev_event.get('PCI_SLOT_NAME', None)
+                if (udev_event['ACTION'] == 'add'):
+                    log.info("Adding pci device %s", pci_name)
+                    XendNode.instance().add_PPCI(pci_name)
+                elif (udev_event['ACTION'] == 'remove'):
+                    log.info("Removing pci device %s", pci_name)
+                    XendNode.instance().remove_PPCI(pci_name)
+
+            elif (udev_event.get('SUBSYSTEMS', None) == 'scsi'):
+                if (udev_event['ACTION'] == 'add'):
+                    log.info("Adding scsi device")
+                    XendNode.instance().add_PSCSI()
+                elif (udev_event['ACTION'] == 'remove'):
+                    log.info("Removing scci device")
+                    XendNode.instance().remove_PSCSI()
+
+            elif (udev_event.get('SUBSYSTEM', None) == 'net'):
+                interface = udev_event.get('INTERFACE', None)
+                if (udev_event['ACTION'] == 'add'):
+                    log.info("Adding net device %s", interface)
+                    XendNode.instance().add_network(interface)
+                elif (udev_event['ACTION'] == 'remove'):
+                    log.info("Removing net device %s", interface)
+                    XendNode.instance().remove_network(interface)
+
+        except Exception, e:
+            log.warn("error while processing udev event(): %s" % str(e))
+
+
+def listenUdevEvent():
+    xoptions = XendOptions.instance()
+    if xoptions.get_xend_udev_event_server():
+        unix.UnixDgramListener(UDEV_EVENT_PATH, UdevEventProtocol)
+
index be2ea4c69873e1379c09a0c1a6c5fac815ff4878..9657e4f4a85f4a207af57eea6281a31c04db06d9 100644 (file)
@@ -28,7 +28,8 @@ from xen.xend import sxp
 from xen.xend.XendError import VmError
 from xen.xend.XendLogging import log
 
-from xen.xend.server.DevController import DevController, xenbusState
+from xen.xend.server.DevController import DevController
+from xen.xend.server.DevConstants import xenbusState
 from xen.xend.xenstore.xstransact import xstransact
 
 class VSCSIController(DevController):
@@ -67,6 +68,8 @@ class VSCSIController(DevController):
             vscsi_config.append(['devs', devs])
             state = self.readFrontend(devid, 'state')
             vscsi_config.append(['state', state])
+            hostmode = self.readBackend(devid, 'feature-host')
+            vscsi_config.append(['feature-host', hostmode])
             backid = self.readFrontend(devid, 'backend-id')
             vscsi_config.append(['backend-id', backid])
             backpath = self.readFrontend(devid, 'backend')
@@ -92,11 +95,13 @@ class VSCSIController(DevController):
             back[devpath + '/p-devname'] = pdevname
             vdev = vscsi_config.get('v-dev', '')
             back[devpath + '/v-dev'] = vdev
-            state = vscsi_config.get('state', '')
-            back[devpath + '/state'] = str(xenbusState[state])
+            state = vscsi_config.get('state', xenbusState['Unknown'])
+            back[devpath + '/state'] = str(state)
             devid = vscsi_config.get('devid', '')
             back[devpath + '/devid'] = str(devid)
 
+        host_mode = config.get('feature-host','')
+        back['feature-host'] = str(host_mode)
         back['uuid'] = config.get('uuid','')
         devid = int(devid)
         return (devid, back, {})
@@ -132,6 +137,7 @@ class VSCSIController(DevController):
             vscsi_devs.append(dev_dict)
 
         config['devs'] = vscsi_devs
+        config['feature-host'] = self.readBackend(devid, 'feature-host')
         config['uuid'] = self.readBackend(devid, 'uuid')
         return config
 
@@ -168,26 +174,34 @@ class VSCSIController(DevController):
         (devid, back, front) = self.getDeviceDetails(config)
         devid = int(devid)
         vscsi_config = config['devs'][0]
-        state = vscsi_config.get('state', '')
+        state = vscsi_config.get('state', xenbusState['Unknown'])
         driver_state = self.readBackend(devid, 'state')
+
         if str(xenbusState['Connected']) != driver_state:
             raise VmError("Driver status is not connected")
 
         uuid = self.readBackend(devid, 'uuid')
-        if state == 'Initialising':
+        if state == xenbusState['Initialising']:
             back['uuid'] = uuid
             self.writeBackend(devid, back)
 
-        elif state == 'Closing':
+        elif state == xenbusState['Closing']:
             found = False
             devs = self.readBackendList(devid, "vscsi-devs")
+            hostmode = int(self.readBackend(devid, 'feature-host'))
             vscsipath = "vscsi-devs/"
             vdev = vscsi_config.get('v-dev', '')
 
             for dev in devs:
                 devpath = vscsipath + dev
                 old_vdev = self.readBackend(devid, devpath + '/v-dev')
-                if vdev == old_vdev:
+
+                if hostmode == 1:
+                    # In host mode, every v-dev belonging to this devid is deleted.
+                    found = True
+                    self.writeBackend(devid, devpath + '/state', \
+                                    str(xenbusState['Closing']))
+                elif vdev == old_vdev:
                     found = True
                     self.writeBackend(devid, devpath + '/state', \
                                     str(xenbusState['Closing']))
@@ -198,7 +212,7 @@ class VSCSIController(DevController):
 
         else:
             raise XendError("Error configuring device invalid "
-                            "state '%s'" % state)
+                            "state '%s'" % xenbusState[state])
 
         self.writeBackend(devid, 'state', str(xenbusState['Reconfiguring']))
         return self.readBackend(devid, 'uuid')
index 961787a76eadc50fb3ed27b91165f9622b4bee3b..b4130cc2f2dd9b5003f6c336356e3731df1647e3 100644 (file)
@@ -64,12 +64,13 @@ def validate_config_file(configfile):
         return 0
 
     # sanity check on the data from the file
+    # requiring 'memory', 'name', and either 'kernel' or 'bootloader'
     count = 0
-    required = ['kernel', 'memory', 'name']
+    required = ['kernel', 'bootloader', 'memory', 'name']
     for (k, v) in locs.items():
         if k in required:
             count += 1
-    if count != 3:
+    if count < len(required) - 1:
         print "Invalid configuration file."
         return 0
     else:
index bae076afe75f2bc2b399eb17488041c5fb28a388..d3010b62482bee6bea493ae6e9fba0b39c8b3072 100644 (file)
@@ -47,6 +47,7 @@
                  other_config*)> 
 <!ATTLIST vm     is_a_template          CDATA #REQUIRED
                  auto_power_on          CDATA #REQUIRED
+                 s3_integrity           CDATA #REQUIRED
                  vcpus_max              CDATA #REQUIRED
                  vcpus_at_startup       CDATA #REQUIRED
                  actions_after_shutdown %NORMAL_EXIT; #REQUIRED 
 <!ELEMENT vtpm   (name*)>
 <!ATTLIST vtpm   backend         CDATA #REQUIRED>
 
-<!ELEMENT pci    EMPTY>
+<!ELEMENT pci    (pci_opt*)>
 <!ATTLIST pci    domain          CDATA #REQUIRED
                  bus             CDATA #REQUIRED
                  slot            CDATA #REQUIRED
                  func            CDATA #REQUIRED
+                 opts_str        CDATA #IMPLIED
                  vslt            CDATA #IMPLIED>
 
 <!ELEMENT vscsi  EMPTY>
 <!ATTLIST vcpu_param key   CDATA #REQUIRED
                      value CDATA #REQUIRED>
 
+<!ELEMENT pci_opt    EMPTY>
+<!ATTLIST pci_opt    key   CDATA #REQUIRED
+                     value CDATA #REQUIRED>
+
 <!ELEMENT other_config EMPTY>
 <!ATTLIST other_config key   CDATA #REQUIRED
                        value CDATA #REQUIRED>
index 11661e7e10506169489105ea10fde78771fffaa1..8ad0fe6dfdb79b17eb851e33afb40333d2f3f421 100644 (file)
@@ -1,4 +1,4 @@
-#============================================================================
+#============================================================================
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of version 2.1 of the GNU Lesser General Public
 # License as published by the Free Software Foundation.
@@ -32,6 +32,8 @@ from xen.xend import PrettyPrint as SXPPrettyPrint
 from xen.xend import osdep
 import xen.xend.XendClient
 from xen.xend.XendBootloader import bootloader
+from xen.xend.XendConstants import *
+from xen.xend.server.DevConstants import xenbusState
 from xen.util import blkif
 from xen.util import vscsi_util
 import xen.util.xsm.xsm as security
@@ -218,6 +220,10 @@ gopts.var('timer_mode', val='TIMER_MODE',
           use="""Timer mode (0=delay virtual time when ticks are missed;
           1=virtual time is always wallclock time.""")
 
+gopts.var('vpt_align', val='VPT_ALIGN',
+          fn=set_int, default=1,
+          use="Enable aligning all periodic vpt to reduce timer interrupts.")
+
 gopts.var('viridian', val='VIRIDIAN',
           fn=set_int, default=0,
           use="""Expose Viridian interface to x86 HVM guest?
@@ -317,11 +323,18 @@ gopts.var('disk', val='phy:DEV,VDEV,MODE[,DOM]',
           backend driver domain to use for the disk.
           The option may be repeated to add more than one disk.""")
 
-gopts.var('pci', val='BUS:DEV.FUNC',
+gopts.var('pci', val='BUS:DEV.FUNC[@VSLOT][,msitranslate=0|1][,power_mgmt=0|1]',
           fn=append_value, default=[],
           use="""Add a PCI device to a domain, using given params (in hex).
-         For example 'pci=c0:02.1'.
-         The option may be repeated to add more than one pci device.""")
+          For example 'pci=c0:02.1'.
+          If VSLOT is supplied the device will be inserted into that
+          virtual slot in the guest, else a free slot is selected.
+          If msitranslate is set, MSI-INTx translation is enabled if possible.
+          Guest that doesn't support MSI will get IO-APIC type IRQs
+          translated from physical MSI, HVM only. Default is 1.
+          The option may be repeated to add more than one pci device.
+          If power_mgmt is set, the guest OS will be able to program the power
+          states D0-D3hot of the device, HVM only. Default=0.""")
 
 gopts.var('vscsi', val='PDEV,VDEV[,DOM]',
           fn=append_value, default=[],
@@ -340,16 +353,16 @@ gopts.var('irq', val='IRQ',
          For example 'irq=7'.
          This option may be repeated to add more than one IRQ.""")
 
-gopts.var('vfb', val="type={vnc,sdl},vncunused=1,vncdisplay=N,vnclisten=ADDR,display=DISPLAY,xauthority=XAUTHORITY,vncpasswd=PASSWORD,opengl=1,keymap=FILE",
+gopts.var('vfb', val="vnc=1,sdl=1,vncunused=1,vncdisplay=N,vnclisten=ADDR,display=DISPLAY,xauthority=XAUTHORITY,vncpasswd=PASSWORD,opengl=1,keymap=FILE",
           fn=append_value, default=[],
           use="""Make the domain a framebuffer backend.
-          The backend type should be either sdl or vnc.
-          For type=vnc, connect an external vncviewer.  The server will listen
+          Both sdl=1 and vnc=1 can be enabled at the same time.
+          For vnc=1, connect an external vncviewer.  The server will listen
           on ADDR (default 127.0.0.1) on port N+5900.  N defaults to the
           domain id.  If vncunused=1, the server will try to find an arbitrary
           unused port above 5900.  vncpasswd overrides the XenD configured
           default password.
-          For type=sdl, a viewer will be started automatically using the
+          For sdl=1, a viewer will be started automatically using the
           given DISPLAY and XAUTHORITY, which default to the current user's
           ones.  OpenGL will be used by default unless opengl is set to 0.
           keymap overrides the XendD configured default layout file.""")
@@ -522,9 +535,9 @@ gopts.var('vncunused', val='',
           use="""Try to find an unused port for the VNC server.
           Only valid when vnc=1.""")
 
-gopts.var('videoram', val='',
-          fn=set_value, default=None,
-          use="""Maximum amount of videoram PV guest can allocate
+gopts.var('videoram', val='MEMORY',
+          fn=set_int, default=4,
+          use="""Maximum amount of videoram a guest can allocate
           for frame buffer.""")
 
 gopts.var('sdl', val='',
@@ -571,6 +584,11 @@ gopts.var('hap', val='HAP',
           use="""Hap status (0=hap is disabled;
           1=hap is enabled.""")
 
+gopts.var('s3_integrity', val='TBOOT_MEMORY_PROTECT',
+          fn=set_int, default=1,
+          use="""Should domain memory integrity be verified during S3?
+          (0=protection is disabled; 1=protection is enabled.""")
+
 gopts.var('cpuid', val="IN[,SIN]:eax=EAX,ebx=EBX,ecx=ECX,edx=EDX",
           fn=append_value, default=[],
           use="""Cpuid description.""")
@@ -587,6 +605,19 @@ gopts.var('suppress_spurious_page_faults', val='yes|no',
           fn=set_bool, default=None,
           use="""Do not inject spurious page faults into this guest""")
 
+gopts.var('pci_msitranslate', val='TRANSLATE',
+          fn=set_int, default=1,
+          use="""Global PCI MSI-INTx translation flag (0=disable;
+          1=enable.""")
+
+gopts.var('pci_power_mgmt', val='POWERMGMT',
+          fn=set_int, default=0,
+          use="""Global PCI Power Management flag (0=disable;1=enable).""")
+
+gopts.var('xen_platform_pci', val='0|1',
+           fn=set_int, default=1,
+           use="Is xen_platform_pci used?")
+
 def err(msg):
     """Print an error to stderr and exit.
     """
@@ -626,6 +657,8 @@ def configure_image(vals):
     if vals.root:
         cmdline_root = strip('root=', vals.root)
         config_image.append(['root', cmdline_root])
+    if vals.videoram:
+        config_image.append(['videoram', vals.videoram])
     if vals.extra:
         config_image.append(['args', vals.extra])
 
@@ -666,80 +699,102 @@ def configure_pci(config_devs, vals):
     """Create the config for pci devices.
     """
     config_pci = []
-    for (domain, bus, slot, func) in vals.pci:
-        config_pci.append(['dev', ['domain', domain], ['bus', bus], \
-                        ['slot', slot], ['func', func]])
+    for (domain, bus, slot, func, vslot, opts) in vals.pci:
+        config_pci_opts = []
+        d = comma_sep_kv_to_dict(opts)
+
+        def f(k):
+            if k not in ['msitranslate', 'power_mgmt']:
+                err('Invalid pci option: ' + k)
+
+            config_pci_opts.append([k, d[k]])
+
+        config_pci_bdf = ['dev', ['domain', domain], ['bus', bus], \
+                          ['slot', slot], ['func', func], ['vslot', vslot]]
+        map(f, d.keys())
+        if len(config_pci_opts)>0:
+            config_pci_bdf.append(['opts', config_pci_opts])
+
+        config_pci.append(config_pci_bdf)
 
     if len(config_pci)>0:
         config_pci.insert(0, 'pci')
         config_devs.append(['device', config_pci])
 
-def vscsi_convert_sxp_to_dict(dev_sxp):
-    dev_dict = {}
-    for opt_val in dev_sxp[1:]:
-        try:
-            opt, val = opt_val
-            dev_dict[opt] = val
-        except TypeError:
-            pass
-    return dev_dict
-
-def vscsi_lookup_devid(devlist, req_devid):
-    if len(devlist) == 0:
-        return 0
-    else:
-        for devid, backend in devlist:
-            if devid == req_devid:
-                return 1
-        return 0
-
 def configure_vscsis(config_devs, vals):
     """Create the config for vscsis (virtual scsi devices).
     """
-    devidlist = []
-    config_scsi = []
+
+    def get_devid(hctl):
+        return int(hctl.split(':')[0])
+
     if len(vals.vscsi) == 0:
         return 0
 
+    config_scsi = {}
+    pHCTL_list = []
+    vHCTL_list = []
+
     scsi_devices = vscsi_util.vscsi_get_scsidevices()
     for (p_dev, v_dev, backend) in vals.vscsi:
-        tmp = p_dev.split(':')
-        if len(tmp) == 4:
-            (p_hctl, block) = vscsi_util._vscsi_hctl_block(p_dev, scsi_devices)
-        else:
-            (p_hctl, block) = vscsi_util._vscsi_block_scsiid_to_hctl(p_dev, scsi_devices)
+        (p_hctl, devname) = \
+            vscsi_util.vscsi_get_hctl_and_devname_by(p_dev, scsi_devices)
 
         if p_hctl == None:
-            raise ValueError("Cannot find device \"%s\"" % p_dev)
-
-        for config in config_scsi:
-            dev = vscsi_convert_sxp_to_dict(config)
-            if dev['v-dev'] == v_dev:
-                raise ValueError('The virtual device "%s" is already defined' % v_dev)
-
-        v_hctl = v_dev.split(':')
-        devid = int(v_hctl[0])
-        config_scsi.append(['dev', \
-                        ['state', 'Initialising'], \
-                        ['devid', devid], \
-                        ['p-dev', p_hctl], \
-                        ['p-devname', block], \
-                        ['v-dev', v_dev] ])
-
-        if vscsi_lookup_devid(devidlist, devid) == 0:
-            devidlist.append([devid, backend])
-
-    for devid, backend in devidlist:
-        tmp = []
-        for config in config_scsi:
-            dev = vscsi_convert_sxp_to_dict(config)
-            if dev['devid'] == devid:
-                tmp.append(config)
-
-        tmp.insert(0, 'vscsi')
-        if backend:
-            tmp.append(['backend', backend])
-        config_devs.append(['device', tmp])
+            raise ValueError('Cannot find device "%s"' % p_dev)
+
+        feature_host = 0
+        if v_dev == 'host':
+            if serverType == SERVER_XEN_API:
+                # TODO
+                raise ValueError("SCSI devices assignment by HBA is not implemeted")
+            feature_host = 1
+            scsi_info = []
+            devid = get_devid(p_hctl)
+            for (pHCTL, devname, _, _) in scsi_devices:
+                if get_devid(pHCTL) == devid:
+                    scsi_info.append([devid, pHCTL, devname, pHCTL])
+        else:
+            scsi_info = [[get_devid(v_dev), p_hctl, devname, v_dev]]
+
+        devid_key = scsi_info[0][0]
+        try:
+            config = config_scsi[devid_key]
+        except KeyError:
+            config = {'feature-host': feature_host, 'backend': backend, 'devs': []}
+
+        devs = config['devs']
+        for (devid, pHCTL, devname, vHCTL) in scsi_info:
+            if pHCTL in pHCTL_list:
+                raise ValueError('The physical device "%s" is already defined' % pHCTL)
+            if vHCTL in vHCTL_list:
+                raise ValueError('The virtual device "%s" is already defined' % vHCTL)
+            pHCTL_list.append(pHCTL)
+            vHCTL_list.append(vHCTL)
+            devs.append(['dev', \
+                         ['state', xenbusState['Initialising']], \
+                         ['devid', devid], \
+                         ['p-dev', pHCTL], \
+                         ['p-devname', devname], \
+                         ['v-dev', vHCTL] ])
+
+        if config['feature-host'] != feature_host:
+            raise ValueError('The physical device "%s" cannot define '
+                             'because mode is different' % scsi_info[0][1])
+        if config['backend'] != backend:
+            raise ValueError('The physical device "%s" cannot define '
+                             'because backend is different' % scsi_info[0][1])
+
+        config['devs'] = devs
+        config_scsi[devid_key] = config
+
+    for config in config_scsi.values():
+        device = ['vscsi', ['feature-host', config['feature-host']]]
+        for dev in config['devs']:
+            device.append(dev)
+        if config['backend']:
+            device.append(['backend', config['backend']])
+        config_devs.append(['device', device])
 
 def configure_ioports(config_devs, vals):
     """Create the config for legacy i/o ranges.
@@ -759,11 +814,13 @@ def configure_vfbs(config_devs, vals):
     for f in vals.vfb:
         d = comma_sep_kv_to_dict(f)
         config = ['vfb']
-        if not d.has_key("type"):
-            d['type'] = 'sdl'
+        #handle the legacy case
+        if d.has_key("type"):
+            d[d['type']] = '1'
+            del d['type']
         for (k,v) in d.iteritems():
             if not k in [ 'vnclisten', 'vncunused', 'vncdisplay', 'display',
-                          'videoram', 'xauthority', 'type', 'vncpasswd',
+                          'videoram', 'xauthority', 'sdl', 'vnc', 'vncpasswd',
                           'opengl', 'keymap' ]:
                 err("configuration option %s unknown to vfbs" % k)
             config.append([k,v])
@@ -798,6 +855,10 @@ def configure_security(config, vals):
     elif num > 1:
         err("VM config error: Multiple access_control definitions!")
 
+def configure_mem_prot(config_image, vals):
+    """Create the config for S3 memory integrity verification under tboot.
+    """
+    config_image.append(['s3_integrity', vals.s3_integrity])
 
 def configure_vtpm(config_devs, vals):
     """Create the config for virtual TPM interfaces.
@@ -869,7 +930,8 @@ def configure_hvm(config_image, vals):
              'sdl', 'display', 'xauthority', 'rtc_timeoffset', 'monitor',
              'acpi', 'apic', 'usb', 'usbdevice', 'keymap', 'pci', 'hpet',
              'guest_os_type', 'hap', 'opengl', 'cpuid', 'cpuid_check',
-             'viridian', 'xen_extended_power_mgmt' ]
+             'viridian', 'xen_extended_power_mgmt', 'pci_msitranslate',
+             'vpt_align', 'pci_power_mgmt', 'xen_platform_pci' ]
 
     for a in args:
         if a in vals.__dict__ and vals.__dict__[a] is not None:
@@ -929,6 +991,7 @@ def make_config(vals):
             else:
                 config.append(['bootloader_args', '-q'])
     config.append(['image', config_image])
+    configure_mem_prot(config, vals);
 
     config_devs = []
     configure_disks(config_devs, vals)
@@ -993,14 +1056,23 @@ def preprocess_pci(vals):
         pci_match = re.match(r"((?P<domain>[0-9a-fA-F]{1,4})[:,])?" + \
                 r"(?P<bus>[0-9a-fA-F]{1,2})[:,]" + \
                 r"(?P<slot>[0-9a-fA-F]{1,2})[.,]" + \
-                r"(?P<func>[0-7])$", pci_dev_str)
+                r"(?P<func>[0-7])" + \
+                r"(@(?P<vslot>[0-9a-fA-F]))?" + \
+                r"(,(?P<opts>.*))?$", \
+                pci_dev_str)
         if pci_match!=None:
-            pci_dev_info = pci_match.groupdict('0')
+            pci_dev_info = pci_match.groupdict('')
+            if pci_dev_info['domain']=='':
+                pci_dev_info['domain']='0'
+            if pci_dev_info['vslot']=='':
+                pci_dev_info['vslot']="%02x" % AUTO_PHP_SLOT
             try:
                 pci.append( ('0x'+pci_dev_info['domain'], \
                         '0x'+pci_dev_info['bus'], \
                         '0x'+pci_dev_info['slot'], \
-                        '0x'+pci_dev_info['func']))
+                        '0x'+pci_dev_info['func'], \
+                        '0x'+pci_dev_info['vslot'], \
+                        pci_dev_info['opts']))
             except IndexError:
                 err('Error in PCI slot syntax "%s"'%(pci_dev_str))
     vals.pci = pci
@@ -1013,7 +1085,7 @@ def preprocess_vscsi(vals):
         n = len(d)
         if n == 2:
             tmp = d[1].split(':')
-            if len(tmp) != 4:
+            if d[1] != 'host' and len(tmp) != 4:
                 err('vscsi syntax error "%s"' % d[1])
             else:
                 d.append(None)
@@ -1038,6 +1110,14 @@ def preprocess_ioports(vals):
         ioports.append(hexd)
     vals.ioports = ioports
         
+def preprocess_irq(vals):
+    if not vals.irq: return
+    irq = []
+    for v in vals.irq:
+        d = repr(v)
+        irq.append(d)
+    vals.irq = irq
+
 def preprocess_vtpm(vals):
     if not vals.vtpm: return
     vtpms = []
@@ -1136,6 +1216,7 @@ def preprocess(vals):
     preprocess_vscsi(vals)
     preprocess_ioports(vals)
     preprocess_ip(vals)
+    preprocess_irq(vals)
     preprocess_nfs(vals)
     preprocess_vtpm(vals)
     preprocess_access_control(vals)
@@ -1195,8 +1276,9 @@ def make_domain(opts, config):
         except:
             server.xend.domain.destroy(dom)
             err("Failed to unpause domain %s" % dom)
-    opts.info("Started domain %s" % (dom))
-    return int(sxp.child_value(dominfo, 'domid'))
+    domid = int(sxp.child_value(dominfo, 'domid'))
+    opts.info("Started domain %s (id=%d)" % (dom, domid))
+    return domid
 
 
 def get_xauthority():
@@ -1301,7 +1383,7 @@ def main(argv):
     elif not opts.is_xml:
         dom = make_domain(opts, config)
         
-    if opts.vals.vncviewer:
+    if opts.vals.vncconsole:
         domid = domain_name_to_domid(sxp.child_value(config, 'name', -1))
         vncviewer_autopass = getattr(opts.vals,'vncviewer-autopass', False)
         console.runVncViewer(domid, vncviewer_autopass, True)
index 8ed8b1b17d7bd2f0860d0800e795bddf901be17d..46cf8e4052d21c1646e80a9c5e2ed51e7d962425 100644 (file)
@@ -47,6 +47,7 @@ from xen.xend import PrettyPrint
 from xen.xend import sxp
 from xen.xend import XendClient
 from xen.xend.XendConstants import *
+from xen.xend.server.DevConstants import xenbusState
 
 from xen.xm.opts import OptionError, Opts, wrap, set_true
 from xen.xm import console
@@ -58,7 +59,11 @@ from xen.util.acmpolicy import ACM_LABEL_UNLABELED_DISPLAY
 import XenAPI
 
 import xen.lowlevel.xc
-xc = xen.lowlevel.xc.xc()
+try:
+    xc = xen.lowlevel.xc.xc()
+except Exception, ex:
+    print >>sys.stderr, ("Is xen kernel running?")
+    sys.exit(1)
 
 import inspect
 from xen.xend import XendOptions
@@ -186,7 +191,7 @@ SUBCOMMAND_HELP = {
     'vnet-delete'   :  ('<VnetId>', 'Delete a Vnet.'),
     'vnet-list'     :  ('[-l|--long]', 'List Vnets.'),
     'vtpm-list'     :  ('<Domain> [--long]', 'List virtual TPM devices.'),
-    'pci-attach'    :  ('<Domain> <domain:bus:slot.func> [virtual slot]',
+    'pci-attach'    :  ('[-o|--options=<opt>] <Domain> <domain:bus:slot.func> [virtual slot]',
                         'Insert a new pass-through pci device.'),
     'pci-detach'    :  ('<Domain> <domain:bus:slot.func>',
                         'Remove a domain\'s pass-through pci device.'),
@@ -734,7 +739,7 @@ def xm_save(args):
         (options, params) = getopt.gnu_getopt(args, 'c', ['checkpoint'])
     except getopt.GetoptError, opterr:
         err(opterr)
-        sys.exit(1)
+        usage('save')
 
     checkpoint = False
     for (k, v) in options:
@@ -1346,22 +1351,8 @@ def xm_dump_core(args):
     else:
         filename = None
 
-    if not live:
-        ds = server.xend.domain.pause(dom, True)
-
-    try:
-        print "Dumping core of domain: %s ..." % str(dom)
-        server.xend.domain.dump(dom, filename, live, crash)
-
-        if crash:
-            print "Destroying domain: %s ..." % str(dom)
-            server.xend.domain.destroy(dom)
-        elif reset:
-            print "Resetting domain: %s ..." % str(dom)
-            server.xend.domain.reset(dom)
-    finally:
-        if not live and not crash and not reset and ds == DOM_STATE_RUNNING:
-            server.xend.domain.unpause(dom)
+    print "Dumping core of domain: %s ..." % str(dom)
+    server.xend.domain.dump(dom, filename, live, crash, reset)
 
 def xm_rename(args):
     arg_check(args, "rename", 2)
@@ -2027,6 +2018,8 @@ def parse_dev_info(info):
         'mac'        : get_info('mac',          str,   '??'),
         #block-device specific
         'ring-ref'   : get_info('ring-ref',     int,   -1),
+        #vscsi specific
+        'feature-host'   : get_info('feature-host',     int,   -1),
         }
 
 def arg_check_for_resource_list(args, name):
@@ -2224,6 +2217,33 @@ def xm_pci_list_assignable_devices(args):
             print d.name,
         print
 
+def vscsi_sort(devs):
+    def sort_hctl(ds, l):
+        s = []
+        for d1 in ds:
+            for d2 in d1:
+                v_dev = sxp.child_value(d2, 'v-dev')
+                n = int(v_dev.split(':')[l])
+                try:
+                    j = s[n]
+                except IndexError:
+                    j = []
+                    s.extend([ [] for _ in range(len(s), n+1) ])
+                j.append(d2)
+                s[n] = j
+        return s
+
+    for i in range(len(devs)):
+        ds1 = [ devs[i][1][0][1] ]
+        ds1 = sort_hctl(ds1, 3)
+        ds1 = sort_hctl(ds1, 2)
+        ds1 = sort_hctl(ds1, 1)
+        ds2 = []
+        for d in ds1:
+            ds2.extend(d)
+        devs[i][1][0][1] = ds2
+    return devs
+
 def vscsi_convert_sxp_to_dict(dev_sxp):
     dev_dict = {}
     for opt_val in dev_sxp[1:]:
@@ -2264,20 +2284,23 @@ def xm_scsi_list(args):
     else:
         devs = server.xend.domain.getDeviceSxprs(dom, 'vscsi')
 
+    # Sort devs by virtual HCTL.
+    devs = vscsi_sort(devs)
+
     if use_long:
         map(PrettyPrint.prettyprint, devs)
     else:
         hdr = 0
         for x in devs:
             if hdr == 0:
-                print "%-3s %-3s %-5s  %-10s %-5s %-10s %-4s" \
-                        % ('Idx', 'BE', 'state', 'phy-hctl', 'phy', 'vir-hctl', 'devstate')
+                print "%-3s %-3s %-5s %-4s  %-10s %-5s %-10s %-4s" \
+                        % ('Idx', 'BE', 'state', 'host', 'phy-hctl', 'phy', 'vir-hctl', 'devstate')
                 hdr = 1
             ni = parse_dev_info(x[1])
             ni['idx'] = int(x[0])
             for dev in x[1][0][1]:
                 mi = vscsi_convert_sxp_to_dict(dev)
-                print "%(idx)-3d %(backend-id)-3d %(state)-5d " % ni,
+                print "%(idx)-3d %(backend-id)-3d %(state)-5d %(feature-host)-4d " % ni,
                 print "%(p-dev)-10s %(p-devname)-5s %(v-dev)-10s %(frontstate)-4s" % mi
 
 def parse_block_configuration(args):
@@ -2427,13 +2450,13 @@ def xm_network_attach(args):
             vif.append(vif_param)
         server.xend.domain.device_create(dom, vif)
 
-def parse_pci_configuration(args, state):
+def parse_pci_configuration(args, state, opts = ''):
     dom = args[0]
     pci_dev_str = args[1]
     if len(args) == 3:
         vslt = args[2]
     else:
-        vslt = '0x0' #chose a free virtual PCI slot
+        vslt = AUTO_PHP_SLOT_STR
     pci=['pci']
     pci_match = re.match(r"((?P<domain>[0-9a-fA-F]{1,4})[:,])?" + \
             r"(?P<bus>[0-9a-fA-F]{1,2})[:,]" + \
@@ -2442,12 +2465,17 @@ def parse_pci_configuration(args, state):
     if pci_match == None:
         raise OptionError("Invalid argument: %s %s" % (pci_dev_str,vslt))
     pci_dev_info = pci_match.groupdict('0')
+
     try:
-        pci.append(['dev', ['domain', '0x'+ pci_dev_info['domain']], \
+        pci_bdf =['dev', ['domain', '0x'+ pci_dev_info['domain']], \
                 ['bus', '0x'+ pci_dev_info['bus']],
                 ['slot', '0x'+ pci_dev_info['slot']],
                 ['func', '0x'+ pci_dev_info['func']],
-                ['vslt', '0x%x' % int(vslt, 16)]])
+                ['vslt', '0x%x' % int(vslt, 16)]]
+        if len(opts) > 0:
+            pci_bdf.append(['opts', opts])
+        pci.append(pci_bdf)
+
     except:
         raise OptionError("Invalid argument: %s %s" % (pci_dev_str,vslt))
     pci.append(['state', state])
@@ -2455,8 +2483,22 @@ def parse_pci_configuration(args, state):
     return (dom, pci)
 
 def xm_pci_attach(args):
-    arg_check(args, 'pci-attach', 2, 3)
-    (dom, pci) = parse_pci_configuration(args, 'Initialising')
+    config_pci_opts = []
+    (options, params) = getopt.gnu_getopt(args, 'o:', ['options='])
+    for (k, v) in options:
+        if k in ('-o', '--options'):
+            if len(v.split('=')) != 2:
+                err("Invalid pci attach option: %s" % v)
+                usage('pci-attach')
+            config_pci_opts.append(v.split('='))
+
+    n = len([i for i in params if i != '--'])
+    if n < 2 or n > 3:
+        err("Invalid argument for 'xm pci-attach'")
+        usage('pci-attach')
+
+    (dom, pci) = parse_pci_configuration(params, 'Initialising',
+                     config_pci_opts)
 
     if serverType == SERVER_XEN_API:
 
@@ -2479,7 +2521,8 @@ def xm_pci_attach(args):
         dpci_record = {
             "VM":           get_single_vm(dom),
             "PPCI":         target_ref,
-            "hotplug_slot": vslt
+            "hotplug_slot": vslt,
+            "options":      dict(config_pci_opts)
         }
         server.xenapi.DPCI.create(dpci_record)
 
@@ -2487,27 +2530,49 @@ def xm_pci_attach(args):
         server.xend.domain.device_configure(dom, pci)
 
 def parse_scsi_configuration(p_scsi, v_hctl, state):
-    v = v_hctl.split(':')
-    if len(v) != 4:
-        raise OptionError("Invalid argument: %s" % v_hctl)
+    def get_devid(hctl):
+        return int(hctl.split(':')[0])
+
+    host_mode = 0
+    scsi_devices = None
 
     if p_scsi is not None:
-        (p_hctl, block) = vscsi_util.vscsi_search_hctl_and_block(p_scsi)
-        if p_hctl == None:
+        # xm scsi-attach
+        if v_hctl == "host":
+            if serverType == SERVER_XEN_API:
+                # TODO
+                raise OptionError("SCSI devices assignment by HBA is not implemeted")
+            host_mode = 1
+            scsi_devices = vscsi_util.vscsi_get_scsidevices()
+        elif len(v_hctl.split(':')) != 4:
+            raise OptionError("Invalid argument: %s" % v_hctl)
+        (p_hctl, devname) = \
+            vscsi_util.vscsi_get_hctl_and_devname_by(p_scsi, scsi_devices)
+        if p_hctl is None:
             raise OptionError("Cannot find device '%s'" % p_scsi)
+        if host_mode:
+            scsi_info = []
+            devid = get_devid(p_hctl)
+            for pHCTL, devname, _, _ in scsi_devices:
+                if get_devid(pHCTL) == devid:
+                    scsi_info.append([devid, pHCTL, devname, pHCTL])
+        else:
+            scsi_info = [[get_devid(v_hctl), p_hctl, devname, v_hctl]] 
     else:
-        p_hctl = ''
-        block = ''
-
-    scsi = ['vscsi']
-    scsi.append(['dev', \
-                 ['state', state], \
-                 ['devid', int(v[0])], \
-                 ['p-dev', p_hctl], \
-                 ['p-devname', block], \
-                 ['v-dev', v_hctl] \
-               ])
-
+        # xm scsi-detach
+        if len(v_hctl.split(':')) != 4:
+            raise OptionError("Invalid argument: %s" % v_hctl)
+        scsi_info = [[get_devid(v_hctl), None, None, v_hctl]]
+
+    scsi = ['vscsi', ['feature-host', host_mode]]
+    for devid, pHCTL, devname, vHCTL in scsi_info:
+        scsi.append(['dev', \
+                     ['state', state], \
+                     ['devid', devid], \
+                     ['p-dev', pHCTL], \
+                     ['p-devname', devname], \
+                     ['v-dev', vHCTL] \
+                   ])
     return scsi
 
 def xm_scsi_attach(args):
@@ -2515,7 +2580,7 @@ def xm_scsi_attach(args):
     dom = args[0]
     p_scsi = args[1]
     v_hctl = args[2]
-    scsi = parse_scsi_configuration(p_scsi, v_hctl, 'Initialising')
+    scsi = parse_scsi_configuration(p_scsi, v_hctl, xenbusState['Initialising'])
 
     if serverType == SERVER_XEN_API:
 
@@ -2635,7 +2700,7 @@ def xm_scsi_detach(args):
     arg_check(args, 'scsi-detach', 2)
     dom = args[0]
     v_hctl = args[1]
-    scsi = parse_scsi_configuration(None, v_hctl, 'Closing')
+    scsi = parse_scsi_configuration(None, v_hctl, xenbusState['Closing'])
 
     if serverType == SERVER_XEN_API:
 
index b49913c1901826a8e5d49c8ef961466d44fd03a0..d3cf5e233597742ad559c011eba37d0b7ec7eb5d 100644 (file)
@@ -269,6 +269,8 @@ class xenapi_create:
                 vm.attributes["is_a_template"].value == 'true',
             "auto_power_on":
                 vm.attributes["auto_power_on"].value == 'true',
+            "s3_integrity":
+                vm.attributes["s3_integrity"].value,
             "memory_static_max":
                 get_child_node_attribute(vm, "memory", "static_max"),
             "memory_static_min":
@@ -533,7 +535,10 @@ class xenapi_create:
             "PPCI":
                 target_ref,
             "hotplug_slot":
-                int(pci.attributes["func"].value, 16)
+                int(pci.attributes["func"].value, 16),
+            "options":
+                get_child_nodes_as_dict(pci,
+                  "pci_opt", "key", "value")
         }
 
         return server.xenapi.DPCI.create(dpci_record)
@@ -647,6 +652,8 @@ class sxp2xml:
             = str(get_child_by_name(config, "vcpus", 1))
         vm.attributes["vcpus_at_startup"] \
             = str(get_child_by_name(config, "vcpus", 1))
+        vm.attributes["s3_integrity"] \
+            = str(get_child_by_name(config, "s3_integrity", 0))
 
         sec_data = get_child_by_name(config, "security")
         if sec_data:
@@ -931,6 +938,12 @@ class sxp2xml:
                     = get_child_by_name(dev_sxp, "func", "0")
                 pci.attributes["vslt"] \
                     = get_child_by_name(dev_sxp, "vslt", "0")
+                for opt in get_child_by_name(dev_sxp, "opts", ""):
+                    if len(opt) > 0:
+                        pci_opt = document.createElement("pci_opt")
+                        pci_opt.attributes["key"] = opt[0]
+                        pci_opt.attributes["value"] = opt[1]
+                        pci.appendChild(pci_opt)
 
                 pcis.append(pci)
 
@@ -1028,10 +1041,14 @@ class sxp2xml:
             'usbdevice',
             'hpet',
             'timer_mode',
+            'vpt_align',
             'viridian',
             'vhpt',
             'guest_os_type',
             'hap',
+            'pci_msitranslate',
+            'pci_power_mgmt',
+            'xen_platform_pci',
         ]
 
         platform_configs = []
index dfd7bd2be8f46b6c677228f076ffefe465ffd0d6..ae7888267ddc3a098e1b3ea79ab41dfea0151ad9 100644 (file)
@@ -1,13 +1,13 @@
 
 override XEN_TARGET_ARCH = x86_32
 XEN_ROOT = ../..
-CFLAGS :=
+CFLAGS =
 include $(XEN_ROOT)/tools/Rules.mk
 
 # Disable PIE/SSP if GCC supports them. They can break us.
-CFLAGS += $(call cc-option,$(CC),-nopie,)
-CFLAGS += $(call cc-option,$(CC),-fno-stack-protector,)
-CFLAGS += $(call cc-option,$(CC),-fno-stack-protector-all,)
+$(call cc-option-add,CFLAGS,CC,-nopie)
+$(call cc-option-add,CFLAGS,CC,-fno-stack-protector)
+$(call cc-option-add,CFLAGS,CC,-fno-stack-protector-all)
 
 CFLAGS += -fno-builtin -msoft-float
 
index 33827c6a90cf4da6647be9dcc134274c3755d85e..af69d39bf0b7c7e64edee2b64241064d0f83a7cd 100644 (file)
@@ -17,7 +17,8 @@ SUBDIRS+= vnet-module
 all: compile
 
 gc.tar.gz:
-       wget http://www.hpl.hp.com/personal/Hans_Boehm/gc/gc_source/$@
+       #wget http://www.hpl.hp.com/personal/Hans_Boehm/gc/gc_source/$@
+       wget $(XEN_EXTFILES_URL)/$@
 
 .PHONY: gc
 gc: gc.tar.gz
index 0c8ef6bb73ef09015b7c6c0af62c115f3929d408..4d589b1b7f2fc95a18c917e81bee1217589ed8e9 100644 (file)
@@ -24,14 +24,11 @@ LIB_SRCS += util.c
 LIB_OBJS := $(LIB_SRCS:.c=.o)
 PIC_OBJS := $(LIB_SRCS:.c=.opic)
 
-CFLAGS   += -Werror -fno-strict-aliasing $(call cc-option,$(CC),-fgnu89-inline,)
+$(call cc-option-add,CFLAGS,CC,-fgnu89-inline)
+CFLAGS   += -Werror -fno-strict-aliasing
 CFLAGS   += -O3
 #CFLAGS   += -g
 
-# Get gcc to generate the dependencies for us.
-CFLAGS   += -Wp,-MD,.$(@F).d
-DEPS     = .*.d
-
 MAJOR    := 3.0
 MINOR    := 0
 LIB      := libxutil.so 
index aeab129cbb15969f95df23bd2acd4643025d7dd5..d1cf6caf931025cee273dee430f0b866adf73d1f 100644 (file)
@@ -89,6 +89,6 @@ build_sub:
                        $(MAKE) -C $(TPM_EMULATOR_DIR); \
                fi \
        else \
-               echo "*** Unable to build VTPMs. libgmp could not be found."; \
+               echo "=== Unable to build VTPMs. libgmp could not be found."; \
        fi
 
index 8eb1a3c343402b28bbcb0c20fdd981815b3b8aac..c6a7487fbf9ad9634b599a5ab212675c769cca7a 100644 (file)
@@ -11,11 +11,6 @@ TOOLS_INSTALL_DIR = $(DESTDIR)/usr/bin
 # General compiler flags
 CFLAGS   = -Werror -g3 -I.
 
-# For generating dependencies
-CFLAGS += -Wp,-MD,.$(@F).d
-
-DEP_FILES      = .*.d
-
 # Generic project files
 HDRS   = $(wildcard *.h)
 SRCS   = $(wildcard *.c)
@@ -26,7 +21,7 @@ $(SRCS): Makefile $(XEN_ROOT)/tools/Rules.mk $(XEN_ROOT)/tools/vtpm/Rules.mk
 
 $(OBJS): $(SRCS)
 
--include $(DEP_FILES)
+-include $(DEPS)
 
 BUILD_EMULATOR = y
 
index 3d48870288661cc0f387ed40edb81f7c7dda67a9..461e13bf2849623d821b7e73f72c8c6d5930077d 100644 (file)
@@ -11,11 +11,6 @@ TOOLS_INSTALL_DIR = $(DESTDIR)/usr/bin
 # General compiler flags
 CFLAGS = -Werror -g3 -I.
 
-# For generating dependencies
-CFLAGS += -Wp,-MD,.$(@F).d
-
-DEP_FILES      = .*.d
-
 # Generic project files
 HDRS   = $(wildcard *.h)
 SRCS   = $(wildcard *.c)
@@ -26,7 +21,7 @@ $(SRCS): Makefile $(XEN_ROOT)/tools/Rules.mk $(XEN_ROOT)/tools/vtpm_manager/Rule
 
 $(OBJS): $(SRCS)
 
--include $(DEP_FILES)
+-include $(FILES)
 
 # Make sure these are just rules
 .PHONY : all build install clean
index 15c0c9758ef698a160edef05257fd4647a4951e3..a8029086b28a61d53b98fbbfe793af74d9ac703c 100644 (file)
@@ -14,10 +14,6 @@ include $(XEN_ROOT)/tools/Rules.mk
 CFLAGS += -Werror
 CFLAGS += $(CFLAGS_libxenctrl) $(CFLAGS_libxenguest) $(CFLAGS_libxenstore)
 
-# Make gcc generate dependencies.
-CFLAGS += -Wp,-MD,.$(@F).d
-PROG_DEP = .*.d
-
 PROGRAMS = xc_restore xc_save readnotes lsevtchn
 
 LDLIBS   = $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenguest) $(LDFLAGS_libxenstore)
@@ -40,6 +36,6 @@ install: build
 .PHONY: clean
 clean:
        $(RM) *.o $(PROGRAMS)
-       $(RM) $(PROG_DEP)
+       $(RM) $(DEPS)
 
--include $(PROG_DEP)
+-include $(DEPS)
index 4ba53819c55c77272a7b3fe54b10878c17bf752b..b770f56fe9f6bc748494cbf43f1e68ce5885ffa8 100644 (file)
@@ -13,7 +13,7 @@
 #include <xg_private.h>
 #include <xc_dom.h> /* gunzip bits */
 
-#include <xen/libelf.h>
+#include <xen/libelf/libelf.h>
 
 static void print_string_note(const char *prefix, struct elf_binary *elf,
                              const elf_note *note)
index adbcb9214122c1655a0129b6b6e70dd95de422b9..d38bea409e726feb0928716ea7a1aef0131c38f4 100644 (file)
 #include <xenguest.h>
 
 static struct suspendinfo {
+    int xc_fd; /* libxc handle */
     int xce; /* event channel handle */
     int suspend_evtchn;
+    int domid;
+    unsigned int flags;
 } si;
 
 /**
@@ -43,97 +46,6 @@ static int compat_suspend(void)
             !strncmp(ans, "done\n", 5));
 }
 
-static int suspend_evtchn_release(void)
-{
-    if (si.suspend_evtchn >= 0) {
-        xc_evtchn_unbind(si.xce, si.suspend_evtchn);
-        si.suspend_evtchn = -1;
-    }
-    if (si.xce >= 0) {
-        xc_evtchn_close(si.xce);
-        si.xce = -1;
-    }
-
-    return 0;
-}
-
-static int await_suspend(void)
-{
-    int rc;
-
-    do {
-        rc = xc_evtchn_pending(si.xce);
-        if (rc < 0) {
-            warnx("error polling suspend notification channel: %d", rc);
-            return -1;
-        }
-    } while (rc != si.suspend_evtchn);
-
-    /* harmless for one-off suspend */
-    if (xc_evtchn_unmask(si.xce, si.suspend_evtchn) < 0)
-        warnx("failed to unmask suspend notification channel: %d", rc);
-
-    return 0;
-}
-
-static int suspend_evtchn_init(int xc, int domid)
-{
-    struct xs_handle *xs;
-    char path[128];
-    char *portstr;
-    unsigned int plen;
-    int port;
-    int rc;
-
-    si.xce = -1;
-    si.suspend_evtchn = -1;
-
-    xs = xs_daemon_open();
-    if (!xs) {
-        warnx("failed to get xenstore handle");
-        return -1;
-    }
-    sprintf(path, "/local/domain/%d/device/suspend/event-channel", domid);
-    portstr = xs_read(xs, XBT_NULL, path, &plen);
-    xs_daemon_close(xs);
-
-    if (!portstr || !plen) {
-        warnx("could not read suspend event channel");
-        return -1;
-    }
-
-    port = atoi(portstr);
-    free(portstr);
-
-    si.xce = xc_evtchn_open();
-    if (si.xce < 0) {
-        warnx("failed to open event channel handle");
-        goto cleanup;
-    }
-
-    si.suspend_evtchn = xc_evtchn_bind_interdomain(si.xce, domid, port);
-    if (si.suspend_evtchn < 0) {
-        warnx("failed to bind suspend event channel: %d", si.suspend_evtchn);
-        goto cleanup;
-    }
-
-    rc = xc_domain_subscribe_for_suspend(xc, domid, port);
-    if (rc < 0) {
-        warnx("failed to subscribe to domain: %d", rc);
-        goto cleanup;
-    }
-
-    /* event channel is pending immediately after binding */
-    await_suspend();
-
-    return 0;
-
-  cleanup:
-    suspend_evtchn_release();
-
-    return -1;
-}
-
 /**
  * Issue a suspend request to a dedicated event channel in the guest, and
  * receive the acknowledgement from the subscribe event channel. */
@@ -147,7 +59,7 @@ static int evtchn_suspend(void)
         return 0;
     }
 
-    if (await_suspend() < 0) {
+    if (xc_await_suspend(si.xce, si.suspend_evtchn) < 0) {
         warnx("suspend failed");
         return 0;
     }
@@ -161,7 +73,14 @@ static int evtchn_suspend(void)
 
 static int suspend(void)
 {
-    if (si.suspend_evtchn >= 0)
+    unsigned long sx_state = 0;
+
+    /* Cannot notify guest to shut itself down if it's in ACPI sleep state. */
+    if (si.flags & XCFLAGS_HVM)
+        xc_get_hvm_param(si.xc_fd, si.domid,
+                         HVM_PARAM_ACPI_S_STATE, &sx_state);
+
+    if ((sx_state == 0) && (si.suspend_evtchn >= 0))
         return evtchn_suspend();
 
     return compat_suspend();
@@ -293,36 +212,58 @@ static void *init_qemu_maps(int domid, unsigned int bitmap_size)
     return seg;
 }
 
-
 int
 main(int argc, char **argv)
 {
-    unsigned int domid, maxit, max_f, flags; 
-    int xc_fd, io_fd, ret;
+    unsigned int maxit, max_f;
+    int io_fd, ret, port;
 
     if (argc != 6)
         errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]);
 
-    xc_fd = xc_interface_open();
-    if (xc_fd < 0)
+    si.xc_fd = xc_interface_open();
+    if (si.xc_fd < 0)
         errx(1, "failed to open control interface");
 
     io_fd = atoi(argv[1]);
-    domid = atoi(argv[2]);
+    si.domid = atoi(argv[2]);
     maxit = atoi(argv[3]);
     max_f = atoi(argv[4]);
-    flags = atoi(argv[5]);
+    si.flags = atoi(argv[5]);
+
+    si.suspend_evtchn = si.xce = -1;
+
+    si.xce = xc_evtchn_open();
+    if (si.xce < 0)
+        warnx("failed to open event channel handle");
+
+    if (si.xce > 0)
+    {
+        port = xs_suspend_evtchn_port(si.domid);
 
-    if (suspend_evtchn_init(xc_fd, domid) < 0)
-        warnx("suspend event channel initialization failed, using slow path");
+        if (port < 0)
+            warnx("faield to get the suspend evtchn port\n");
+        else
+        {
+            si.suspend_evtchn =
+              xc_suspend_evtchn_init(si.xc_fd, si.xce, si.domid, port);
 
-    ret = xc_domain_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
-                         &suspend, !!(flags & XCFLAGS_HVM),
+            if (si.suspend_evtchn < 0)
+                warnx("suspend event channel initialization failed"
+                       "using slow path");
+        }
+    }
+    ret = xc_domain_save(si.xc_fd, io_fd, si.domid, maxit, max_f, si.flags, 
+                         &suspend, !!(si.flags & XCFLAGS_HVM),
                          &init_qemu_maps, &qemu_flip_buffer);
 
-    suspend_evtchn_release();
+    if (si.suspend_evtchn > 0)
+        xc_suspend_evtchn_release(si.xce, si.suspend_evtchn);
+
+    if (si.xce > 0)
+        xc_evtchn_close(si.xce);
 
-    xc_interface_close(xc_fd);
+    xc_interface_close(si.xc_fd);
 
     return ret;
 }
index 8ad42665673653d43754f8e758b21f7ac94eb03a..276a8e9886c4d475b7d6c96e5353f47bc2cd04d3 100644 (file)
@@ -38,10 +38,12 @@ install: build
 
 .PHONY: clean
 clean:
-       rm -f $(BIN)
+       rm -f $(BIN) $(DEPS)
 
 
 %: %.c Makefile
        $(CC) $(CFLAGS) $< $(LDFLAGS) -o $@
 xentrace_%: %.c Makefile
        $(CC) $(CFLAGS) $< $(LDFLAGS) -o $@
+
+-include $(DEPS)
diff --git a/tools/xenpmd/Makefile b/tools/xenpmd/Makefile
new file mode 100644 (file)
index 0000000..7e9353b
--- /dev/null
@@ -0,0 +1,22 @@
+XEN_ROOT=../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+CFLAGS  += -Werror
+CFLAGS  += $(CFLAGS_libxenstore)
+LDFLAGS += $(LDFLAGS_libxenstore)
+
+BIN      = xenpmd
+
+.PHONY: all
+all: $(BIN)
+
+.PHONY: install
+install: all
+       $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
+       $(INSTALL_PROG) $(BIN) $(DESTDIR)$(SBINDIR)
+
+.PHONY: clean
+clean:
+       $(RM) -f $(BIN) $(DEPS)
+
+-include $(DEPS)
diff --git a/tools/xenpmd/xenpmd.c b/tools/xenpmd/xenpmd.c
new file mode 100644 (file)
index 0000000..28de744
--- /dev/null
@@ -0,0 +1,516 @@
+/*
+ * xenpmd.c
+ *
+ * xen power management daemon - Facilitates power management 
+ * functionality within xen guests.
+ *
+ * Copyright (c) 2008  Kamala Narasimhan 
+ * Copyright (c) 2008  Citrix Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* Xen extended power management support provides HVM guest power management
+ * features beyond S3, S4, S5.  For example, it helps expose system level 
+ * battery status and battery meter information and in future will be extended
+ * to include more power management support.  This extended power management 
+ * support is enabled by setting xen_extended_power_mgmt to 1 or 2 in the HVM
+ * config file.  When set to 2, non-pass through mode is enabled which heavily
+ * relies on this power management daemon to glean battery information from 
+ * dom0 and store it xenstore which would then be queries and used by qemu and 
+ * passed to the guest when appropriate battery ports are read/written to.
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdlib.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <xs.h>
+
+/* #define RUN_STANDALONE */
+#define RUN_IN_SIMULATE_MODE
+
+enum BATTERY_INFO_TYPE {
+    BIF, 
+    BST 
+};
+
+enum BATTERY_PRESENT {
+    NO, 
+    YES 
+};
+
+enum BATTERY_TECHNOLOGY {
+    NON_RECHARGEABLE, 
+    RECHARGEABLE 
+};
+
+struct battery_info {
+    enum BATTERY_PRESENT    present;
+    unsigned long           design_capacity;
+    unsigned long           last_full_capacity;
+    enum BATTERY_TECHNOLOGY battery_technology;
+    unsigned long           design_voltage;
+    unsigned long           design_capacity_warning;
+    unsigned long           design_capacity_low;
+    unsigned long           capacity_granularity_1;
+    unsigned long           capacity_granularity_2;
+    char                    model_number[32];
+    char                    serial_number[32];
+    char                    battery_type[32];
+    char                    oem_info[32];
+};
+
+struct battery_status {
+    enum BATTERY_PRESENT    present;
+    unsigned long           state;
+    unsigned long           present_rate;
+    unsigned long           remaining_capacity;
+    unsigned long           present_voltage;
+};
+
+static struct xs_handle *xs;
+
+#ifdef RUN_IN_SIMULATE_MODE
+    #define BATTERY_DIR_PATH "/tmp/battery"
+    #define BATTERY_INFO_FILE_PATH "/tmp/battery/%s/info" 
+    #define BATTERY_STATE_FILE_PATH "/tmp/battery/%s/state"
+#else
+    #define BATTERY_DIR_PATH "/proc/acpi/battery"
+    #define BATTERY_INFO_FILE_PATH "/proc/acpi/battery/%s/info"
+    #define BATTERY_STATE_FILE_PATH "/proc/acpi/battery/%s/state"
+#endif
+
+FILE *get_next_battery_file(DIR *battery_dir, 
+                            enum BATTERY_INFO_TYPE battery_info_type)
+{
+    FILE *file = 0;
+    struct dirent *dir_entries;
+    char file_name[32];
+    
+    do 
+    {
+        dir_entries = readdir(battery_dir);
+        if ( !dir_entries ) 
+            return 0;
+        if ( strlen(dir_entries->d_name) < 4 )
+            continue;
+        if ( battery_info_type == BIF ) 
+            snprintf(file_name, 32, BATTERY_INFO_FILE_PATH,
+                     dir_entries->d_name);
+        else 
+            snprintf(file_name, 32, BATTERY_STATE_FILE_PATH,
+                     dir_entries->d_name);
+        file = fopen(file_name, "r");
+    } while ( !file );
+
+    return file;
+}
+
+void set_attribute_battery_info(char *attrib_name,
+                                char *attrib_value,
+                                struct battery_info *info)
+{
+    if ( strstr(attrib_name, "present") ) 
+    {
+        if ( strstr(attrib_value, "yes") ) 
+            info->present = YES;
+        return;
+    }
+
+    if ( strstr(attrib_name, "design capacity warning") ) 
+    {
+        info->design_capacity_warning = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "design capacity low") ) 
+    {
+        info->design_capacity_low = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "design capacity") ) 
+    { 
+        info->design_capacity = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "last full capacity") ) 
+    {
+        info->last_full_capacity = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "design voltage") ) 
+    {
+        info->design_voltage = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "capacity granularity 1") ) 
+    {
+        info->capacity_granularity_1 = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "capacity granularity 2") ) 
+    {
+        info->capacity_granularity_2 = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "battery technology") ) 
+    {
+        if ( strncmp(attrib_value, "rechargeable",
+                     strlen("rechargeable")) == 0 ) 
+            info->battery_technology = RECHARGEABLE;
+        else 
+            info->battery_technology = NON_RECHARGEABLE;
+        return;
+    }
+
+    if ( strstr(attrib_name, "model number") ) 
+    {
+        strncpy(info->model_number, attrib_value, 32);
+        return;
+    }
+
+    if ( strstr(attrib_name, "serial number") ) 
+    {
+        strncpy(info->serial_number, attrib_value, 32);
+        return;
+    }
+
+    if ( strstr(attrib_name, "battery type") ) 
+    {
+        strncpy(info->battery_type, attrib_value, 32);
+        return;
+    }
+
+    if ( strstr(attrib_name, "OEM info") ) 
+    {
+        strncpy(info->oem_info, attrib_value, 32);
+        return;
+    }
+
+    return;
+}
+
+void set_attribute_battery_status(char *attrib_name, 
+                                  char *attrib_value,
+                                  struct battery_status *status)
+{
+    if ( strstr(attrib_name, "charging state") ) 
+    {
+        /* Check this, below is half baked */
+        if ( strstr(attrib_value, "charged") ) 
+            status->state = 0;
+        else 
+            status->state = 1;
+        return;
+    }
+
+    if ( strstr(attrib_name, "present rate") ) 
+    {
+        status->present_rate = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "remaining capacity") ) 
+    {
+        status->remaining_capacity = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "present voltage") ) 
+    {
+        status->present_voltage = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "present") ) 
+    {
+        if ( strstr(attrib_value, "yes") ) 
+            status->present = YES;
+        return;
+    }
+}
+
+void parse_battery_info_or_status(char *line_info,
+                                  enum BATTERY_INFO_TYPE type,
+                                  void *info_or_status)
+{
+    char attrib_name[128];
+    char attrib_value[64];
+    char *delimiter;
+    unsigned long length;
+
+    length = strlen(line_info);
+    delimiter = (char *) strchr( line_info, ':');
+    if ( (!delimiter) || (delimiter == line_info) ||
+         (delimiter == line_info + length) ) 
+        return;
+
+    strncpy(attrib_name, line_info, delimiter-line_info);
+    while ( *(delimiter+1) == ' ' ) 
+    {
+        delimiter++;
+        if ( delimiter+1 == line_info + length)
+            return;
+    }
+    strncpy(attrib_value, delimiter+1, 
+            (unsigned long)line_info + length -(unsigned long)delimiter); 
+    
+    if ( type == BIF ) 
+        set_attribute_battery_info(attrib_name, attrib_value,
+                                   (struct battery_info *)info_or_status);
+    else 
+        set_attribute_battery_status(attrib_name, attrib_value,
+                                     (struct battery_status *)info_or_status);
+
+    return;
+}
+
+int get_next_battery_info_or_status(DIR *battery_dir,
+                                    enum BATTERY_INFO_TYPE type,
+                                    void *info_or_status)
+{
+    FILE *file;
+    char line_info[256];
+
+    if  ( !info_or_status )
+        return 0;
+
+    if (type == BIF) 
+        memset(info_or_status, 0, sizeof(struct battery_info));
+    else 
+        memset(info_or_status, 0, sizeof(struct battery_status));
+
+    file = get_next_battery_file(battery_dir, type);
+    if ( !file )
+        return 0;
+
+    while ( fgets(line_info, sizeof(line_info), file) != NULL ) 
+        parse_battery_info_or_status(line_info, type, info_or_status);
+
+    fclose(file);
+    return 1;
+}
+
+#ifdef RUN_STANDALONE
+void print_battery_info(struct battery_info *info)
+{
+    printf("present:                %d\n", info->present);
+    printf("design capacity:        %d\n", info->design_capacity);
+    printf("last full capacity:     %d\n", info->last_full_capacity);
+    printf("battery technology:     %d\n", info->battery_technology);
+    printf("design voltage:         %d\n", info->design_voltage);
+    printf("design capacity warning:%d\n", info->design_capacity_warning);
+    printf("design capacity low:    %d\n", info->design_capacity_low);
+    printf("capacity granularity 1: %d\n", info->capacity_granularity_1);
+    printf("capacity granularity 2: %d\n", info->capacity_granularity_2);
+    printf("model number:           %s\n", info->model_number);
+    printf("serial number:          %s\n", info->serial_number);
+    printf("battery type:           %s\n", info->battery_type);
+    printf("OEM info:               %s\n", info->oem_info);
+}
+#endif /*RUN_STANDALONE*/
+
+void write_ulong_lsb_first(char *temp_val, unsigned long val)
+{
+    snprintf(temp_val, 9, "%02x%02x%02x%02x", (unsigned int)val & 0xff, 
+    (unsigned int)(val & 0xff00) >> 8, (unsigned int)(val & 0xff0000) >> 16, 
+    (unsigned int)(val & 0xff000000) >> 24);
+}
+
+void write_battery_info_to_xenstore(struct battery_info *info)
+{
+    char val[1024], string_info[256];
+
+    xs_mkdir(xs, XBT_NULL, "/pm");
+   
+    memset(val, 0, 1024);
+    memset(string_info, 0, 256);
+    /* write 9 dwords (so 9*4) + length of 4 strings + 4 null terminators */
+    snprintf(val, 3, "%02x", 
+             (unsigned int)(9*4 +
+                            strlen(info->model_number) +
+                            strlen(info->serial_number) +
+                            strlen(info->battery_type) +
+                            strlen(info->oem_info) + 4));
+    write_ulong_lsb_first(val+2, info->present);
+    write_ulong_lsb_first(val+10, info->design_capacity);
+    write_ulong_lsb_first(val+18, info->last_full_capacity);
+    write_ulong_lsb_first(val+26, info->battery_technology);
+    write_ulong_lsb_first(val+34, info->design_voltage);
+    write_ulong_lsb_first(val+42, info->design_capacity_warning);
+    write_ulong_lsb_first(val+50, info->design_capacity_low);
+    write_ulong_lsb_first(val+58, info->capacity_granularity_1);
+    write_ulong_lsb_first(val+66, info->capacity_granularity_2);
+
+    snprintf(string_info, 256, "%02x%s%02x%s%02x%s%02x%s", 
+             (unsigned int)strlen(info->model_number), info->model_number,
+             (unsigned int)strlen(info->serial_number), info->serial_number,
+             (unsigned int)strlen(info->battery_type), info->battery_type,
+             (unsigned int)strlen(info->oem_info), info->oem_info);
+    strncat(val+73, string_info, 1024-73-1);
+    xs_write(xs, XBT_NULL, "/pm/bif", 
+             val, 73+8+strlen(info->model_number)+strlen(info->serial_number)+
+             strlen(info->battery_type)+strlen(info->oem_info)+1);
+}
+
+int write_one_time_battery_info(void)
+{
+    DIR *dir;
+    int ret = 0;
+    struct battery_info info;
+    
+    dir = opendir(BATTERY_DIR_PATH);
+    if ( !dir )
+        return 0;
+
+    while ( get_next_battery_info_or_status(dir, BIF, (void *)&info) ) 
+    {
+#ifdef RUN_STANDALONE
+        print_battery_info(&info);
+#endif
+        if ( info.present == YES ) 
+        {
+            write_battery_info_to_xenstore(&info);
+            ret = 1;
+            break; /* rethink this... */
+        }
+    }
+
+    closedir(dir);
+    return ret;
+}
+
+#ifdef RUN_STANDALONE
+void print_battery_status(struct battery_status *status)
+{
+    printf("present:                     %d\n", status->present);
+    printf("Battery state                %d\n", status->state);
+    printf("Battery present rate         %d\n", status->present_rate);
+    printf("Battery remining capacity    %d\n", status->remaining_capacity);
+    printf("Battery present voltage      %d\n", status->present_voltage);
+}
+#endif /*RUN_STANDALONE*/
+
+void write_battery_status_to_xenstore(struct battery_status *status)
+{
+    char val[35];
+
+    xs_mkdir(xs, XBT_NULL, "/pm");
+
+    memset(val, 0, 35);
+    snprintf(val, 3, "%02x", 16);
+    write_ulong_lsb_first(val+2, status->state);
+    write_ulong_lsb_first(val+10, status->present_rate);
+    write_ulong_lsb_first(val+18, status->remaining_capacity);
+    write_ulong_lsb_first(val+26, status->present_voltage);
+
+    xs_write(xs, XBT_NULL, "/pm/bst", val, 35);
+}
+
+int wait_for_and_update_battery_status_request(void)
+{
+    DIR *dir;
+    int ret = 0;
+    unsigned int count;
+    struct battery_status status;
+
+    while ( true )
+    {
+        /* KN:@TODO - It is rather inefficient to not cache the file handle.
+         *  Switch to caching file handle. 
+         */
+        dir = opendir(BATTERY_DIR_PATH);
+        if ( !dir )
+            return 0;
+
+        while ( get_next_battery_info_or_status(dir, BST, (void *)&status) ) 
+        {
+#ifdef RUN_STANDALONE
+            print_battery_status(&status);
+#endif
+            if ( status.present == YES ) 
+            {
+                write_battery_status_to_xenstore(&status);
+                ret = 1;
+                /* rethink this; though I have never seen, there might be
+                 * systems out there with more than one battery device 
+                 * present
+                 */
+                break;
+            }
+        }
+        closedir(dir);
+        xs_watch(xs, "/pm/events", "refreshbatterystatus");
+        xs_read_watch(xs, &count); 
+    }
+
+    return ret;
+}
+
+/* Borrowed daemonize from xenstored - Initially written by Stevens. */
+static void daemonize(void)
+{
+    pid_t pid;
+
+    if ( (pid = fork()) < 0 )
+        exit(1);
+
+    if ( pid != 0 )
+        exit(0);
+
+    setsid();
+
+    if ( (pid = fork()) < 0 )
+        exit(1);
+
+    if ( pid != 0 )
+        exit(0);
+
+    if ( chdir("/") == -1 )
+        exit(1);
+
+    umask(0);
+}
+
+int main(int argc, char *argv[])
+{
+#ifndef RUN_STANDALONE
+    daemonize();
+#endif
+    xs = (struct xs_handle *)xs_daemon_open();
+    if ( xs == NULL ) 
+        return -1;
+
+    if ( write_one_time_battery_info() == 0 ) 
+    {
+        xs_daemon_close(xs);
+        return -1;
+    }
+
+    wait_for_and_update_battery_status_request();
+    xs_daemon_close(xs);
+    return 0;
+}
+
index 1177b55ac06c9b184eec701b7b6aaaf5a4821dfb..b40992959fe8c315c4692f2c926b353aafcf479f 100644 (file)
@@ -155,4 +155,6 @@ endif
 .PHONY: clean
 clean:
        rm -f $(LIB) $(SHLIB) $(SHLIB_LINKS) $(OBJECTS-y) \
-             $(BINDINGS) $(BINDINGSRC)
+             $(BINDINGS) $(BINDINGSRC) $(DEPS)
+
+-include $(DEPS)
index 69f483e8edc415aeadf0edc7efb26639f505fe11..2ac3ccfa01b03bd16ca43a09ba6b77f24abf1a1b 100644 (file)
@@ -182,12 +182,6 @@ int xenstat_collect_vbds(xenstat_node * node)
        struct dirent *dp;
        struct priv_data *priv = get_priv_data(node->handle);
 
-       char *sys_prefix = "statistics/";
-
-       /* 23 = "statistics/" + "xxxx_xx_req" */
-       char ooreq[23], rdreq[23], wrreq[23]; 
-       char *stat_prefix = NULL;
-
        if (priv == NULL) {
                perror("Allocation error");
                return 0;
@@ -215,16 +209,12 @@ int xenstat_collect_vbds(xenstat_node * node)
                if (ret != 3)
                        continue;
 
-
-               if (strcmp(buf,"vbd") == 0){
-                       stat_prefix = "";
+               if (strcmp(buf,"vbd") == 0)
                        vbd.back_type = 1;
-               } else if (strcmp(buf,"tap") == 0){
-                       stat_prefix = "tap_";
+               else if (strcmp(buf,"tap") == 0)
                        vbd.back_type = 2;
-               } else {
+               else
                        continue;
-               }
 
                domain = xenstat_node_domain(node, domid);
                if (domain == NULL) {
@@ -235,22 +225,19 @@ int xenstat_collect_vbds(xenstat_node * node)
                        continue;
                }
 
-               snprintf(ooreq, sizeof(ooreq), "%s%soo_req", sys_prefix, stat_prefix);
-               if((read_attributes_vbd(dp->d_name, ooreq, buf, 256)<=0)
+               if((read_attributes_vbd(dp->d_name, "statistics/oo_req", buf, 256)<=0)
                   || ((ret = sscanf(buf, "%llu", &vbd.oo_reqs)) != 1))
                {
                        continue;
                }
 
-               snprintf(rdreq,  sizeof(rdreq),"%s%srd_req", sys_prefix, stat_prefix);
-               if((read_attributes_vbd(dp->d_name, rdreq, buf, 256)<=0)
+               if((read_attributes_vbd(dp->d_name, "statistics/rd_req", buf, 256)<=0)
                   || ((ret = sscanf(buf, "%llu", &vbd.rd_reqs)) != 1))
                {
                        continue;
                }
 
-               snprintf(wrreq,  sizeof(wrreq),"%s%swr_req", sys_prefix, stat_prefix);
-               if((read_attributes_vbd(dp->d_name, wrreq, buf, 256)<=0)
+               if((read_attributes_vbd(dp->d_name, "statistics/wr_req", buf, 256)<=0)
                   || ((ret = sscanf(buf, "%llu", &vbd.wr_reqs)) != 1))
                {
                        continue;
index 4b862a4ec55e793af6b054c816c91398803744a3..15daa39c598d48a94471e5c082f4e321bd8041a0 100644 (file)
@@ -37,4 +37,6 @@ endif
 
 .PHONY: clean
 clean:
-       rm -f xentop xentop.o
+       rm -f xentop xentop.o $(DEPS)
+
+-include $(DEPS)
index da2df45bb05c7b2d9a5e61b5d2d7b934898a4d6c..0bbedfef771cbbf9b2790b6113bf3ca3b8352e1f 100644 (file)
@@ -254,7 +254,7 @@ static void fail(const char *str)
 {
        if(cwin != NULL && !isendwin())
                endwin();
-       fprintf(stderr, str);
+       fprintf(stderr, "%s", str);
        exit(1);
 }
 
index 9b7e9e89c627e69acc13ddf8f7c916c08563d0bd..91b854da9078255fd5d5d1693ff517d56fbeb3f6 100644 (file)
@@ -8,10 +8,6 @@ CFLAGS += -Werror
 CFLAGS += -I.
 CFLAGS += $(CFLAGS_libxenctrl)
 
-# Make gcc generate dependencies.
-CFLAGS += -Wp,-MD,.$(@F).d
-DEP    = .*.d
-
 CLIENTS := xenstore-exists xenstore-list xenstore-read xenstore-rm xenstore-chmod
 CLIENTS += xenstore-write xenstore-ls
 
@@ -82,7 +78,7 @@ clean:
        rm -f xenstored xs_random xs_stress xs_crashme
        rm -f xs_tdb_dump xenstore-control
        rm -f xenstore $(CLIENTS)
-       $(RM) $(DEP)
+       $(RM) $(DEPS)
 
 .PHONY: TAGS
 TAGS:
@@ -101,9 +97,9 @@ install: all
        $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR)
        $(INSTALL_PROG) xenstored $(DESTDIR)$(SBINDIR)
        $(INSTALL_PROG) xenstore-control $(DESTDIR)$(BINDIR)
-       $(INSTALL_PROG) xenstore $(DESTDIR)/usr/bin
+       $(INSTALL_PROG) xenstore $(DESTDIR)$(BINDIR)
        set -e ; for c in $(CLIENTS) ; do \
-               ln -f $(DESTDIR)/usr/bin/xenstore $(DESTDIR)/usr/bin/$${c} ; \
+               ln -f $(DESTDIR)$(BINDIR)/xenstore $(DESTDIR)$(BINDIR)/$${c} ; \
        done
        $(INSTALL_DIR) $(DESTDIR)$(LIBDIR)
        $(INSTALL_PROG) libxenstore.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)
@@ -113,7 +109,7 @@ install: all
        $(INSTALL_DATA) xs.h $(DESTDIR)$(INCLUDEDIR)
        $(INSTALL_DATA) xs_lib.h $(DESTDIR)$(INCLUDEDIR)
 
--include $(DEP)
+-include $(DEPS)
 
 # never delete any intermediate files.
 .SECONDARY:
index 90bfd05b917b2f11ede7bb9fccad98aa74018921..b043ac45f849b926d8c55927bc964a6fe63bd991 100644 (file)
@@ -1937,14 +1937,17 @@ int main(int argc, char *argv[])
                        handle_event();
 
                next = list_entry(connections.next, typeof(*conn), list);
+               if (&next->list != &connections)
+                       talloc_increase_ref_count(next);
                while (&next->list != &connections) {
                        conn = next;
 
                        next = list_entry(conn->list.next,
                                          typeof(*conn), list);
+                       if (&next->list != &connections)
+                               talloc_increase_ref_count(next);
 
                        if (conn->domain) {
-                               talloc_increase_ref_count(conn);
                                if (domain_can_read(conn))
                                        handle_input(conn);
                                if (talloc_free(conn) == 0)
@@ -1957,7 +1960,6 @@ int main(int argc, char *argv[])
                                if (talloc_free(conn) == 0)
                                        continue;
                        } else {
-                               talloc_increase_ref_count(conn);
                                if (FD_ISSET(conn->fd, &inset))
                                        handle_input(conn);
                                if (talloc_free(conn) == 0)
index 7fef574fbdcb9f653b19bba97909349ba00ef6fd..9707d19ca28d244c72a2d22aeca0085af46669f4 100644 (file)
@@ -802,6 +802,31 @@ bool xs_is_domain_introduced(struct xs_handle *h, unsigned int domid)
        return rc;
 }
 
+int xs_suspend_evtchn_port(int domid)
+{
+    char path[128];
+    char *portstr;
+    int port;
+    unsigned int plen;
+    struct xs_handle *xs;
+
+    xs = xs_daemon_open();
+    if (!xs)
+        return -1;
+
+    sprintf(path, "/local/domain/%d/device/suspend/event-channel", domid);
+    portstr = xs_read(xs, XBT_NULL, path, &plen);
+    xs_daemon_close(xs);
+
+    if (!portstr || !plen)
+        return -1;
+
+    port = atoi(portstr);
+    free(portstr);
+
+    return port;
+}
+
 /* Only useful for DEBUG versions */
 char *xs_debug_command(struct xs_handle *h, const char *cmd,
                       void *data, unsigned int len)
index dd8cbf8c956f7de957507c8d9b74f3027be5a83b..629395291ac7d7b871257b15a87e2aba1fb9d21f 100644 (file)
@@ -163,6 +163,7 @@ bool xs_is_domain_introduced(struct xs_handle *h, unsigned int domid);
 char *xs_debug_command(struct xs_handle *h, const char *cmd,
                       void *data, unsigned int len);
 
+int xs_suspend_evtchn_port(int domid);
 #endif /* _XS_H */
 
 /*
index 336ee007e79288cb7d26f47eb635c0b9728bfc2e..f540b420530837f3ffca5b8d54f2a0c01a81b167 100644 (file)
@@ -46,9 +46,12 @@ install: build
 
 .PHONY: clean
 clean:
-       $(RM) *.a *.so *.o *.rpm $(BIN) $(LIBBIN)
+       $(RM) *.a *.so *.o *.rpm $(BIN) $(LIBBIN) $(DEPS)
 
 %: %.c $(HDRS) Makefile
        $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
 xentrace_%: %.c $(HDRS) Makefile
        $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS)
+
+-include $(DEPS)
+
index 5d84eec6b3cafa33d080403d9d56b557c557754f..42744a7352ac2b950ad91acef54e8232e2a928d9 100644 (file)
 0x0040f10f  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_resync_only        [ gfn = 0x%(1)16x ]
 
 0x00801001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  cpu_freq_change [ %(1)dMHz -> %(2)dMHz ]
-0x00802001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  cpu_idle_entry  [ C0 -> C%(1)d ]
-0x00802002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  cpu_idle_exit   [ C%(1)d -> C0 ]
+0x00802001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  cpu_idle_entry  [ C0 -> C%(1)d, acpi_pm_tick = %(2)d ]
+0x00802002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  cpu_idle_exit   [ C%(1)d -> C0, acpi_pm_tick = %(2)d ]
index 94f6cd835f8fe52735ba2c3a250d1d1fbab59d85..d5ec038d9f98d4ca88fe7d294a2372aaf2e2d5d2 100644 (file)
@@ -24,6 +24,9 @@
 #include <getopt.h>
 
 #include "xenctrl.h"
+#include <xen/foreign/x86_32.h>
+#include <xen/foreign/x86_64.h>
+#include <xen/hvm/save.h>
 
 int xc_handle = 0;
 int domid = 0;
@@ -31,28 +34,18 @@ int frame_ptrs = 0;
 int stack_trace = 0;
 int disp_all = 0;
 
-#if defined (__i386__)
-#if defined (__OpenBSD__)
-#define FMT_SIZE_T             "%08lx"
-#define INSTR_POINTER(regs)    (unsigned long)(regs->eip)
-#else
-#define FMT_SIZE_T             "%08x"
-#define INSTR_POINTER(regs)    (regs->eip)
-#endif
-#define STACK_POINTER(regs)    (regs->esp)
-#define FRAME_POINTER(regs)    (regs->ebp)
-#define STACK_ROWS             4
-#define STACK_COLS             8
-#elif defined (__x86_64__)
-#define FMT_SIZE_T             "%016lx"
-#define STACK_POINTER(regs)    (regs->rsp)
-#define FRAME_POINTER(regs)    (regs->rbp)
-#define INSTR_POINTER(regs)    (regs->rip)
-#define STACK_ROWS             4
-#define STACK_COLS             4
+#if defined (__i386__) || defined (__x86_64__)
+typedef unsigned long long guest_word_t;
+#define FMT_32B_WORD "%08llx"
+#define FMT_64B_WORD "%016llx"
+/* Word-length of the guest's own data structures */
+int guest_word_size = sizeof (unsigned long);
+/* Word-length of the context record we get from xen */
+int ctxt_word_size = sizeof (unsigned long);
 #elif defined (__ia64__)
 /* On ia64, we can't translate virtual address to physical address.  */
 #define NO_TRANSLATION
+typedef size_t guest_word_t;
 
 /* Which registers should be displayed.  */
 int disp_cr_regs;
@@ -63,22 +56,19 @@ int disp_tlb;
 #endif
 
 struct symbol {
-    size_t address;
+    guest_word_t address;
     char type;
     char *name;
     struct symbol *next;
 } *symbol_table = NULL;
 
-size_t kernel_stext, kernel_etext, kernel_sinittext, kernel_einittext, kernel_hypercallpage;
+guest_word_t kernel_stext, kernel_etext, kernel_sinittext, kernel_einittext, kernel_hypercallpage;
 
-static int is_kernel_text(size_t addr)
+static int is_kernel_text(guest_word_t addr)
 {
-#if defined (__i386__)
+#if defined (__i386__) || defined (__x86_64__)
     if (symbol_table == NULL)
-        return (addr > 0xc000000);
-#elif defined (__x86_64__)
-    if (symbol_table == NULL)
-        return (addr > 0xffffffff80000000UL);
+        return (addr > ((guest_word_size == 4) ? 0xc000000 : 0xffffffff80000000ULL));
 #elif defined (__ia64__)
     if (symbol_table == NULL)
         return (addr > 0xa000000000000000UL);
@@ -134,7 +124,7 @@ static void insert_symbol(struct symbol *symbol)
     prev = symbol;
 }
 
-static struct symbol *lookup_symbol(size_t address)
+static struct symbol *lookup_symbol(guest_word_t address)
 {
     struct symbol *s = symbol_table;
 
@@ -147,7 +137,7 @@ static struct symbol *lookup_symbol(size_t address)
     return NULL;
 }
 
-static void print_symbol(size_t addr)
+static void print_symbol(guest_word_t addr)
 {
     struct symbol *s;
 
@@ -255,21 +245,23 @@ static void print_flags(uint64_t flags)
     printf("\n");
 }
 
-static void print_special(unsigned long *regs, const char *name, unsigned int mask)
+static void print_special(void *regs, const char *name, unsigned int mask, int width)
 {
     unsigned int i;
 
     printf("\n");
     for (i = 0; mask; mask >>= 1, ++i)
-        if (mask & 1)
-            printf("%s%u: " FMT_SIZE_T "\n", name, i, (size_t)regs[i]);
+        if (mask & 1) {
+            if (width == 4)
+                printf("%s%u: %08"PRIx32"\n", name, i, ((uint32_t *) regs)[i]);
+            else
+                printf("%s%u: %08"PRIx64"\n", name, i, ((uint64_t *) regs)[i]);
+        }
 }
-#endif
 
-#ifdef __i386__
-static void print_ctx(vcpu_guest_context_t *ctx1)
+static void print_ctx_32(vcpu_guest_context_x86_32_t *ctx)
 {
-    struct cpu_user_regs *regs = &ctx1->user_regs;
+    struct cpu_user_regs_x86_32 *regs = &ctx->user_regs;
 
     printf("cs:eip: %04x:%08x ", regs->cs, regs->eip);
     print_symbol(regs->eip);
@@ -291,54 +283,118 @@ static void print_ctx(vcpu_guest_context_t *ctx1)
     printf(" gs:     %04x\n", regs->gs);
 
     if (disp_all) {
-        print_special(ctx1->ctrlreg, "cr", 0x1d);
-        print_special(ctx1->debugreg, "dr", 0xcf);
+        print_special(ctx->ctrlreg, "cr", 0x1d, 4);
+        print_special(ctx->debugreg, "dr", 0xcf, 4);
+    }
+}
+
+static void print_ctx_32on64(vcpu_guest_context_x86_64_t *ctx)
+{
+    struct cpu_user_regs_x86_64 *regs = &ctx->user_regs;
+
+    printf("cs:eip: %04x:%08x ", regs->cs, (uint32_t)regs->eip);
+    print_symbol((uint32_t)regs->eip);
+    print_flags((uint32_t)regs->eflags);
+    printf("ss:esp: %04x:%08x\n", regs->ss, (uint32_t)regs->esp);
+
+    printf("eax: %08x\t", (uint32_t)regs->eax);
+    printf("ebx: %08x\t", (uint32_t)regs->ebx);
+    printf("ecx: %08x\t", (uint32_t)regs->ecx);
+    printf("edx: %08x\n", (uint32_t)regs->edx);
+
+    printf("esi: %08x\t", (uint32_t)regs->esi);
+    printf("edi: %08x\t", (uint32_t)regs->edi);
+    printf("ebp: %08x\n", (uint32_t)regs->ebp);
+
+    printf(" ds:     %04x\t", regs->ds);
+    printf(" es:     %04x\t", regs->es);
+    printf(" fs:     %04x\t", regs->fs);
+    printf(" gs:     %04x\n", regs->gs);
+
+    if (disp_all) {
+        print_special(ctx->ctrlreg, "cr", 0x1d, 4);
+        print_special(ctx->debugreg, "dr", 0xcf, 4);
     }
 }
-#elif defined(__x86_64__)
-static void print_ctx(vcpu_guest_context_t *ctx1)
+
+static void print_ctx_64(vcpu_guest_context_x86_64_t *ctx)
 {
-    struct cpu_user_regs *regs = &ctx1->user_regs;
+    struct cpu_user_regs_x86_64 *regs = &ctx->user_regs;
 
-    printf("rip: %016lx ", regs->rip);
+    printf("rip: %016"PRIx64" ", regs->rip);
     print_symbol(regs->rip);
     print_flags(regs->rflags);
-    printf("rsp: %016lx\n", regs->rsp);
+    printf("rsp: %016"PRIx64"\n", regs->rsp);
 
-    printf("rax: %016lx\t", regs->rax);
-    printf("rcx: %016lx\t", regs->rcx);
-    printf("rdx: %016lx\n", regs->rdx);
+    printf("rax: %016"PRIx64"\t", regs->rax);
+    printf("rcx: %016"PRIx64"\t", regs->rcx);
+    printf("rdx: %016"PRIx64"\n", regs->rdx);
 
-    printf("rbx: %016lx\t", regs->rbx);
-    printf("rsi: %016lx\t", regs->rsi);
-    printf("rdi: %016lx\n", regs->rdi);
+    printf("rbx: %016"PRIx64"\t", regs->rbx);
+    printf("rsi: %016"PRIx64"\t", regs->rsi);
+    printf("rdi: %016"PRIx64"\n", regs->rdi);
 
-    printf("rbp: %016lx\t", regs->rbp);
-    printf(" r8: %016lx\t", regs->r8);
-    printf(" r9: %016lx\n", regs->r9);
+    printf("rbp: %016"PRIx64"\t", regs->rbp);
+    printf(" r8: %016"PRIx64"\t", regs->r8);
+    printf(" r9: %016"PRIx64"\n", regs->r9);
 
-    printf("r10: %016lx\t", regs->r10);
-    printf("r11: %016lx\t", regs->r11);
-    printf("r12: %016lx\n", regs->r12);
+    printf("r10: %016"PRIx64"\t", regs->r10);
+    printf("r11: %016"PRIx64"\t", regs->r11);
+    printf("r12: %016"PRIx64"\n", regs->r12);
 
-    printf("r13: %016lx\t", regs->r13);
-    printf("r14: %016lx\t", regs->r14);
-    printf("r15: %016lx\n", regs->r15);
+    printf("r13: %016"PRIx64"\t", regs->r13);
+    printf("r14: %016"PRIx64"\t", regs->r14);
+    printf("r15: %016"PRIx64"\n", regs->r15);
 
     printf(" cs: %04x\t", regs->cs);
     printf(" ss: %04x\t", regs->ss);
     printf(" ds: %04x\t", regs->ds);
     printf(" es: %04x\n", regs->es);
 
-    printf(" fs: %04x @ %016lx\n", regs->fs, ctx1->fs_base);
-    printf(" gs: %04x @ %016lx/%016lx\n", regs->gs,
-           ctx1->gs_base_kernel, ctx1->gs_base_user);
+    printf(" fs: %04x @ %016"PRIx64"\n", regs->fs, ctx->fs_base);
+    printf(" gs: %04x @ %016"PRIx64"/%016"PRIx64"\n", regs->gs,
+           ctx->gs_base_kernel, ctx->gs_base_user);
 
     if (disp_all) {
-        print_special(ctx1->ctrlreg, "cr", 0x1d);
-        print_special(ctx1->debugreg, "dr", 0xcf);
+        print_special(ctx->ctrlreg, "cr", 0x1d, 8);
+        print_special(ctx->debugreg, "dr", 0xcf, 8);
     }
 }
+
+static void print_ctx(vcpu_guest_context_any_t *ctx)
+{
+    if (ctxt_word_size == 4) 
+        print_ctx_32(&ctx->x32);
+    else if (guest_word_size == 4)
+        print_ctx_32on64(&ctx->x64);
+    else 
+        print_ctx_64(&ctx->x64);
+}
+
+static guest_word_t instr_pointer(vcpu_guest_context_any_t *ctx)
+{
+    if (ctxt_word_size == 4) 
+        return ctx->x32.user_regs.eip;
+    else 
+        return ctx->x64.user_regs.rip;
+}
+
+static guest_word_t stack_pointer(vcpu_guest_context_any_t *ctx)
+{
+    if (ctxt_word_size == 4) 
+        return ctx->x32.user_regs.esp;
+    else 
+        return ctx->x64.user_regs.rsp;
+}
+
+static guest_word_t frame_pointer(vcpu_guest_context_any_t *ctx)
+{
+    if (ctxt_word_size == 4) 
+        return ctx->x32.user_regs.ebp;
+    else 
+        return ctx->x64.user_regs.rbp;
+}
+
 #elif defined(__ia64__)
 
 #define PTE_ED_SHIFT              52
@@ -401,10 +457,10 @@ static void print_tr(int i, const struct ia64_tr_entry *tr)
            tr->itir >> ITIR_KEY_SHIFT & ITIR_KEY_MASK);
 }
 
-void print_ctx(vcpu_guest_context_t *ctx)
+void print_ctx(vcpu_guest_context_any_t *ctx)
 {
-    struct vcpu_guest_context_regs *regs = &ctx->regs;
-    struct vcpu_tr_regs *tr = &ctx->regs.tr;
+    struct vcpu_guest_context_regs *regs = &ctx->c.regs;
+    struct vcpu_tr_regs *tr = &ctx->c.regs.tr;
     int i;
     unsigned int rbs_size, cfm_sof;
 
@@ -457,7 +513,7 @@ void print_ctx(vcpu_guest_context_t *ctx)
         printf(" cmcv: %016lx\n", regs->cr.cmcv);
         printf(" lrr0: %016lx  ", regs->cr.lrr0);
         printf(" lrr1: %016lx  ", regs->cr.lrr1);
-        printf(" ev_cb:%016lx\n", ctx->event_callback_ip);
+        printf(" ev_cb:%016lx\n", ctx->c.event_callback_ip);
 
     }
     if (disp_ar_regs) {
@@ -584,7 +640,7 @@ void print_ctx(vcpu_guest_context_t *ctx)
 #endif
 
 #ifndef NO_TRANSLATION
-static void *map_page(vcpu_guest_context_t *ctx, int vcpu, size_t virt)
+static void *map_page(vcpu_guest_context_any_t *ctx, int vcpu, guest_word_t virt)
 {
     static unsigned long previous_mfn = 0;
     static void *mapped = NULL;
@@ -611,33 +667,53 @@ static void *map_page(vcpu_guest_context_t *ctx, int vcpu, size_t virt)
     return (void *)(mapped + offset);
 }
 
-static void print_stack(vcpu_guest_context_t *ctx, int vcpu)
+static guest_word_t read_stack_word(guest_word_t *src, int width)
 {
-    struct cpu_user_regs *regs = &ctx->user_regs;
-    size_t stack = STACK_POINTER(regs);
-    size_t stack_limit = (STACK_POINTER(regs) & XC_PAGE_MASK) + XC_PAGE_SIZE;
-    size_t frame;
-    size_t instr;
-    size_t *p;
+    guest_word_t word = 0;
+    /* Little-endian only */
+    memcpy(&word, src, width);
+    return word;
+}
+
+static void print_stack_word(guest_word_t word, int width)
+{
+    if (width == 4)
+        printf(FMT_32B_WORD, word);
+    else
+        printf(FMT_64B_WORD, word);
+}
+
+static void print_stack(vcpu_guest_context_any_t *ctx, int vcpu, int width)
+{
+    guest_word_t stack = stack_pointer(ctx);
+    guest_word_t stack_limit;
+    guest_word_t frame;
+    guest_word_t instr;
+    guest_word_t word;
+    guest_word_t *p;
     int i;
 
+    stack_limit = ((stack_pointer(ctx) + XC_PAGE_SIZE)
+                   & ~((guest_word_t) XC_PAGE_SIZE - 1)); 
     printf("\n");
     printf("Stack:\n");
-    for (i=1; i<STACK_ROWS+1 && stack < stack_limit; i++) {
-        while(stack < stack_limit && stack < STACK_POINTER(regs) + i*STACK_COLS*sizeof(stack)) {
+    for (i=1; i<5 && stack < stack_limit; i++) {
+        while(stack < stack_limit && stack < stack_pointer(ctx) + i*32) {
             p = map_page(ctx, vcpu, stack);
-            printf(" " FMT_SIZE_T, *p);
-            stack += sizeof(stack);
+            word = read_stack_word(p, width);
+            printf(" ");
+            print_stack_word(word, width);
+            stack += width;
         }
         printf("\n");
     }
     printf("\n");
 
     printf("Code:\n");
-    instr = INSTR_POINTER(regs) - 21;
+    instr = instr_pointer(ctx) - 21;
     for(i=0; i<32; i++) {
         unsigned char *c = map_page(ctx, vcpu, instr+i);
-        if (instr+i == INSTR_POINTER(regs))
+        if (instr+i == instr_pointer(ctx))
             printf("<%02x> ", *c);
         else
             printf("%02x ", *c);
@@ -650,52 +726,65 @@ static void print_stack(vcpu_guest_context_t *ctx, int vcpu)
         printf("Stack Trace:\n");
     else
         printf("Call Trace:\n");
-    printf("%c [<" FMT_SIZE_T ">] ",
-        stack_trace ? '*' : ' ', INSTR_POINTER(regs));
+    printf("%c [<", stack_trace ? '*' : ' ');
+    print_stack_word(instr_pointer(ctx), width);
+    printf(">] ");
 
-    print_symbol(INSTR_POINTER(regs));
+    print_symbol(instr_pointer(ctx));
     printf(" <--\n");
     if (frame_ptrs) {
-        stack = STACK_POINTER(regs);
-        frame = FRAME_POINTER(regs);
+        stack = stack_pointer(ctx);
+        frame = frame_pointer(ctx);
         while(frame && stack < stack_limit) {
             if (stack_trace) {
                 while (stack < frame) {
                     p = map_page(ctx, vcpu, stack);
-                    printf("|   " FMT_SIZE_T "   ", *p);
-                    printf("\n");
-                    stack += sizeof(*p);
+                    printf("|   ");
+                    print_stack_word(read_stack_word(p, width), width);
+                    printf("   \n");
+                    stack += width;
                 }
             } else {
                 stack = frame;
             }
 
             p = map_page(ctx, vcpu, stack);
-            frame = *p;
-            if (stack_trace)
-                printf("|-- " FMT_SIZE_T "\n", *p);
-            stack += sizeof(*p);
+            frame = read_stack_word(p, width);
+            if (stack_trace) {
+                printf("|-- ");
+                print_stack_word(read_stack_word(p, width), width);
+                printf("\n");
+            }
+            stack += width;
 
             if (frame) {
                 p = map_page(ctx, vcpu, stack);
-                printf("%c [<" FMT_SIZE_T ">] ", stack_trace ? '|' : ' ', *p);
-                print_symbol(*p);
+                word = read_stack_word(p, width);
+                printf("%c [<", stack_trace ? '|' : ' ');
+                print_stack_word(word, width);
+                printf(">] ");
+                print_symbol(word);
                 printf("\n");
-                stack += sizeof(*p);
+                stack += width;
             }
         }
     } else {
-        stack = STACK_POINTER(regs);
+        stack = stack_pointer(ctx);
         while(stack < stack_limit) {
             p = map_page(ctx, vcpu, stack);
-            if (is_kernel_text(*p)) {
-                printf("  [<" FMT_SIZE_T ">] ", *p);
-                print_symbol(*p);
+            word = read_stack_word(p, width);
+            if (is_kernel_text(word)) {
+                printf("  [<");
+                print_stack_word(word, width);
+                printf(">] ");
+                print_symbol(word);
                 printf("\n");
             } else if (stack_trace) {
-                printf("    " FMT_SIZE_T "\n", *p);
+                printf("    ");
+                print_stack_word(word, width);
+                printf("\n");
             }
-            stack += sizeof(*p);
+            stack += width;
         }
     }
 }
@@ -729,10 +818,39 @@ static void dump_ctx(int vcpu)
         exit(-1);
     }
 
-    print_ctx(&ctx.c);
+#if defined(__i386__) || defined(__x86_64__)
+    {
+        if (dominfo.hvm) {
+            struct hvm_hw_cpu cpuctx;
+            xen_capabilities_info_t xen_caps = "";
+            if (xc_domain_hvm_getcontext_partial(
+                    xc_handle, domid, HVM_SAVE_CODE(CPU), 
+                    vcpu, &cpuctx, sizeof cpuctx) != 0) {
+                perror("xc_domain_hvm_getcontext_partial");
+                exit(-1);
+            }
+            guest_word_size = (cpuctx.msr_efer & 0x400) ? 8 : 4;
+            /* HVM guest context records are always host-sized */
+            if (xc_version(xc_handle, XENVER_capabilities, &xen_caps) != 0) {
+                perror("xc_version");
+                exit(-1);
+            }
+            ctxt_word_size = (strstr(xen_caps, "xen-3.0-x86_64")) ? 8 : 4;
+        } else {
+            struct xen_domctl domctl;
+            memset(&domctl, 0, sizeof domctl);
+            domctl.domain = domid;
+            domctl.cmd = XEN_DOMCTL_get_address_size;
+            if (xc_domctl(xc_handle, &domctl) == 0)
+                ctxt_word_size = guest_word_size = domctl.u.address_size.size / 8;
+        }
+    }
+#endif
+
+    print_ctx(&ctx);
 #ifndef NO_TRANSLATION
-    if (is_kernel_text(INSTR_POINTER((&ctx.c.user_regs))))
-        print_stack(&ctx.c, vcpu);
+    if (is_kernel_text(instr_pointer(&ctx)))
+        print_stack(&ctx, vcpu, guest_word_size);
 #endif
 
     if (!dominfo.paused) {
@@ -774,9 +892,9 @@ static void usage(void)
 int main(int argc, char **argv)
 {
     int ch;
-    static const char *sopts = "fs:h"
+    static const char *sopts = "fs:ha"
 #ifdef __ia64__
-        "ar:"
+        "r:"
 #endif
         ;
     static const struct option lopts[] = {
index d66cb74578f9a8fd4aec9d5228158ec7fdc40e18..f0127309abb9fb8650b36243c270450a886b21a2 100644 (file)
@@ -81,7 +81,11 @@ signal.signal(signal.SIGINT,  sighand)
 
 interrupted = 0
 
-defs = read_defs(arg[0])
+try:
+    defs = read_defs(arg[0])
+except IOError, exn:
+    print exn
+    sys.exit(1)
 
 # structure of trace record (as output by xentrace):
 # HDR(I) {TSC(Q)} D1(I) D2(I) D3(I) D4(I) D5(I) D6(I) D7(I)
index 0c399698066ef33b0291ae215ed83b6c707e3dc1..857a9a5cca7a6f4c64cda18da7dc74e4705f8bba 100644 (file)
@@ -57,11 +57,13 @@ def getXendNetConfig():
     while val[0] != 'network-script':
         val = pin.get_val()
 
-    if val[1] == "network-bridge":
+    # split network command into script name and its parameters
+    sub_val = val[1].split()
+    if sub_val[0] == "network-bridge":
         netenv = "bridge"
-    elif val[1] == "network-route":
+    elif sub_val[0] == "network-route":
         netenv = "route"
-    elif val[1] == "network-nat":
+    elif sub_val[0] == "network-nat":
         netenv = "nat"
     else:
         raise NetworkError("Failed to get network env from xend config")
index c548aa9f65d70b35a72644329d22930fe634ea3d..159c13ed0e5abbc67d15d6a84e0086b06a792c69 100644 (file)
@@ -4,3 +4,4 @@ obj-m += platform-pci/
 obj-m += balloon/
 obj-m += blkfront/
 obj-m += netfront/
+obj-m += scsifront/
index bcc8b052076c8444445d4208ab4ec20fe0a1c16a..316592d83a75df80b948c6c9d3723ec84142e1bc 100644 (file)
@@ -4,6 +4,5 @@ obj-m  = xen-balloon.o
 
 EXTRA_CFLAGS += -I$(M)/platform-pci
 
-xen-balloon-objs =
-xen-balloon-objs += balloon.o
-xen-balloon-objs += sysfs.o
+xen-balloon-y := balloon.o sysfs.o
+xen-balloon-$(CONFIG_XEN_SCRUB_PAGES) += scrub.o
diff --git a/unmodified_drivers/linux-2.6/compat-include/linux/scatterlist.h b/unmodified_drivers/linux-2.6/compat-include/linux/scatterlist.h
new file mode 100644 (file)
index 0000000..e26a6ac
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef _LINUX_SCATTERLIST_H
+#define _LINUX_SCATTERLIST_H
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12)
+#error "This version of Linux should not need compat linux/scatterlist.h"
+#endif
+
+#include <asm/scatterlist.h>
+
+#endif /* _LINUX_SCATTERLIST_H */
index fe3dc6de3904498febe3f61deece6cd04620bf18..eceac2a4ff50f2ffe236725d50bd60619f297ce5 100644 (file)
@@ -147,9 +147,11 @@ extern char *kasprintf(gfp_t gfp, const char *fmt, ...)
  *   RHEL_VERSION
  */
 #if !defined(RHEL_VERSION) || (RHEL_VERSION == 4 && RHEL_UPDATE < 5)
+#if !defined(RHEL_MAJOR) || (RHEL_MAJOR == 4 && RHEL_MINOR < 5)
 typedef irqreturn_t (*irq_handler_t)(int, void *, struct pt_regs *);
 #endif
 #endif
+#endif
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
 #define setup_xen_features xen_setup_features
index 9d0f04907e450d3f2266e406177bf90aaf484d8d..3c1c799c857678436ce3ab651408362f90cf4a4f 100755 (executable)
@@ -53,6 +53,7 @@ i[34567]86|x86_64)
         ln -sf ${XL}/include/asm-x86/mach-xen/asm/synch_bitops*.h include/asm
         ln -sf ${XL}/include/asm-x86/mach-xen/asm/maddr*.h include/asm
         ln -sf ${XL}/include/asm-x86/mach-xen/asm/gnttab_dma.h include/asm
+        ln -sf ${XL}/arch/x86/lib/scrub.c balloon
     else
         if [ $uname = x86_64 ]; then
             mkdir -p include/asm-i386
index 7d121ffe2261ee4fe051058fb512b73b749126b2..f30426eb198e6d215beb8306c616bbfd42a807fa 100644 (file)
@@ -15,3 +15,4 @@ _XEN_CPPFLAGS += -include $(objtree)/include/linux/autoconf.h
 
 EXTRA_CFLAGS += $(_XEN_CPPFLAGS)
 EXTRA_AFLAGS += $(_XEN_CPPFLAGS)
+CPPFLAGS := -I$(M)/include $(CPPFLAGS)
index 92b64e25895933690a9d546cb76b9a43abf966b4..ad667128a250932486f40b462ed5360aef2058ec 100644 (file)
@@ -11,12 +11,6 @@ struct ap_suspend_info {
        atomic_t nr_spinning;
 };
 
-/*
- * Use a rwlock to protect the hypercall page from being executed in AP context
- * while the BSP is re-initializing it after restore.
- */
-static DEFINE_RWLOCK(suspend_lock);
-
 #ifdef CONFIG_SMP
 
 /*
@@ -33,18 +27,18 @@ static void ap_suspend(void *_info)
        atomic_inc(&info->nr_spinning);
        mb();
 
-       while (info->do_spin) {
+       while (info->do_spin)
                cpu_relax();
-               read_lock(&suspend_lock);
-               HYPERVISOR_yield();
-               read_unlock(&suspend_lock);
-       }
 
        mb();
        atomic_dec(&info->nr_spinning);
 }
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
 #define initiate_ap_suspend(i) smp_call_function(ap_suspend, i, 0, 0)
+#else
+#define initiate_ap_suspend(i) smp_call_function(ap_suspend, i, 0)
+#endif
 
 #else /* !defined(CONFIG_SMP) */
 
@@ -61,9 +55,7 @@ static int bp_suspend(void)
        suspend_cancelled = HYPERVISOR_suspend(0);
 
        if (!suspend_cancelled) {
-               write_lock(&suspend_lock);
                platform_pci_resume();
-               write_unlock(&suspend_lock);
                gnttab_resume();
                irq_resume();
        }
index 2b35c5c757bf3facbbf3551904afda23b9a9d781..e4a766a909bf4b588f8d90fc834813b8fc02a224 100644 (file)
@@ -14,7 +14,11 @@ EXPORT_SYMBOL(system_state);
 
 void ctrl_alt_del(void)
 {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
        kill_proc(1, SIGINT, 1); /* interrupt init */
+#else
+       kill_cad_pid(SIGINT, 1);
+#endif
 }
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8)
diff --git a/unmodified_drivers/linux-2.6/scsifront/Kbuild b/unmodified_drivers/linux-2.6/scsifront/Kbuild
new file mode 100644 (file)
index 0000000..14875b3
--- /dev/null
@@ -0,0 +1,6 @@
+include $(M)/overrides.mk
+
+obj-m += xen-scsi.o
+
+xen-scsi-objs := scsifront.o xenbus.o
+
diff --git a/unmodified_drivers/linux-2.6/scsifront/Makefile b/unmodified_drivers/linux-2.6/scsifront/Makefile
new file mode 100644 (file)
index 0000000..64e7acd
--- /dev/null
@@ -0,0 +1,3 @@
+ifneq ($(KERNELRELEASE),)
+include $(src)/Kbuild
+endif
index 645897c3b353d54ea95438ceb5e7cec3db41fb5f..d16477a6a53f7a4372081f069d640381ae0467dd 100644 (file)
@@ -44,6 +44,7 @@ _clean: delete-unfresh-files
        $(MAKE) -f $(BASEDIR)/Rules.mk -C common clean
        $(MAKE) -f $(BASEDIR)/Rules.mk -C drivers clean
        $(MAKE) -f $(BASEDIR)/Rules.mk -C xsm clean
+       $(MAKE) -f $(BASEDIR)/Rules.mk -C crypto clean
        $(MAKE) -f $(BASEDIR)/Rules.mk -C arch/$(TARGET_ARCH) clean
        rm -f include/asm *.o $(TARGET)* *~ core
        rm -f include/asm-*/asm-offsets.h
index 36292c92e54e69afa09556d76b8ea1f88ba4f16e..5601e451918a22d1bea7f4298f6c198e3cfb8c56 100644 (file)
@@ -23,9 +23,6 @@ endif
 ifeq ($(perfc_arrays),y)
 perfc := y
 endif
-ifeq ($(frame_pointer),y)
-CFLAGS := $(shell echo $(CFLAGS) | sed -e 's/-f[^ ]*omit-frame-pointer//g')
-endif
 
 # Set ARCH/SUBARCH appropriately.
 override TARGET_SUBARCH  := $(XEN_TARGET_ARCH)
@@ -34,26 +31,14 @@ override TARGET_ARCH     := $(shell echo $(XEN_TARGET_ARCH) | \
 
 TARGET := $(BASEDIR)/xen
 
-HDRS := $(wildcard *.h)
-HDRS += $(wildcard $(BASEDIR)/include/xen/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/xen/hvm/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/public/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/public/*/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/compat/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-$(TARGET_ARCH)/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-$(TARGET_ARCH)/$(TARGET_SUBARCH)/*.h)
-
 include $(BASEDIR)/arch/$(TARGET_ARCH)/Rules.mk
 
-# Do not depend on auto-generated header files.
-AHDRS := $(filter-out %/include/xen/compile.h,$(HDRS))
-HDRS  := $(filter-out %/asm-offsets.h,$(AHDRS))
-
 # Note that link order matters!
 ALL_OBJS-y               += $(BASEDIR)/common/built_in.o
 ALL_OBJS-y               += $(BASEDIR)/drivers/built_in.o
 ALL_OBJS-y               += $(BASEDIR)/xsm/built_in.o
 ALL_OBJS-y               += $(BASEDIR)/arch/$(TARGET_ARCH)/built_in.o
+ALL_OBJS-$(x86)          += $(BASEDIR)/crypto/built_in.o
 
 CFLAGS-y                += -g -D__XEN__
 CFLAGS-$(XSM_ENABLE)    += -DXSM_ENABLE
@@ -69,20 +54,26 @@ CFLAGS-$(frame_pointer) += -fno-omit-frame-pointer -DCONFIG_FRAME_POINTER
 ifneq ($(max_phys_cpus),)
 CFLAGS-y                += -DMAX_PHYS_CPUS=$(max_phys_cpus)
 endif
+ifneq ($(max_phys_irqs),)
+CFLAGS-y                += -DMAX_PHYS_IRQS=$(max_phys_irqs)
+endif
 
 AFLAGS-y                += -D__ASSEMBLY__
 
 ALL_OBJS := $(ALL_OBJS-y)
 
-CFLAGS   := $(strip $(CFLAGS) $(CFLAGS-y))
+# Get gcc to generate the dependencies for us.
+CFLAGS-y += -MMD -MF .$(@F).d
+DEPS = .*.d
+
+CFLAGS += $(CFLAGS-y)
 
 # Most CFLAGS are safe for assembly files:
 #  -std=gnu{89,99} gets confused by #-prefixed end-of-line comments
-AFLAGS   := $(strip $(AFLAGS) $(AFLAGS-y))
-AFLAGS   += $(patsubst -std=gnu%,,$(CFLAGS))
+AFLAGS += $(AFLAGS-y) $(filter-out -std=gnu%,$(CFLAGS))
 
 # LDFLAGS are only passed directly to $(LD)
-LDFLAGS  := $(strip $(LDFLAGS) $(LDFLAGS_DIRECT))
+LDFLAGS += $(LDFLAGS_DIRECT)
 
 include Makefile
 
@@ -112,19 +103,21 @@ FORCE:
 
 .PHONY: clean
 clean:: $(addprefix _clean_, $(subdir-all))
-       rm -f *.o *~ core
+       rm -f *.o *~ core $(DEPS)
 _clean_%/: FORCE
        $(MAKE) -f $(BASEDIR)/Rules.mk -C $* clean
 
-%.o: %.c $(HDRS) Makefile
+%.o: %.c Makefile
        $(CC) $(CFLAGS) -c $< -o $@
 
-%.o: %.S $(AHDRS) Makefile
+%.o: %.S Makefile
        $(CC) $(AFLAGS) -c $< -o $@
 
-%.i: %.c $(HDRS) Makefile
+%.i: %.c Makefile
        $(CPP) $(CFLAGS) $< -o $@
 
 # -std=gnu{89,99} gets confused by # as an end-of-line comment marker
-%.s: %.S $(AHDRS) Makefile
+%.s: %.S Makefile
        $(CPP) $(AFLAGS) $< -o $@
+
+-include $(DEPS)
index 664bdfacbe890810a3110cd48d7008aecfe7d588..a5e57ba8a21e464ee7bd1fde47e1634f706afb98 100644 (file)
@@ -29,11 +29,11 @@ $(TARGET): $(TARGET)-syms
 # Headers do not depend on auto-generated header, but object files do.
 $(ALL_OBJS): $(BASEDIR)/include/asm-ia64/asm-xsi-offsets.h
 
-asm-offsets.s: asm-offsets.c $(HDRS) \
+asm-offsets.s: asm-offsets.c \
     $(BASEDIR)/include/asm-ia64/.offsets.h.stamp 
        $(CC) $(CFLAGS) -DGENERATE_ASM_OFFSETS -DIA64_TASK_SIZE=0 -S -o $@ $<
 
-asm-xsi-offsets.s: asm-xsi-offsets.c $(HDRS)
+asm-xsi-offsets.s: asm-xsi-offsets.c
        $(CC) $(CFLAGS) -S -o $@ $<
 
 $(BASEDIR)/include/asm-ia64/asm-xsi-offsets.h: asm-xsi-offsets.s
@@ -61,7 +61,7 @@ $(BASEDIR)/include/asm-ia64/.offsets.h.stamp:
        touch $@
 
 # I'm sure a Makefile wizard would know a better way to do this
-xen.lds.s: xen/xen.lds.S $(HDRS)
+xen.lds.s: xen/xen.lds.S
        $(CC) -E $(CPPFLAGS) -P -DXEN $(AFLAGS) \
                -o xen.lds.s xen/xen.lds.S
 
index 2c59eb4a18c0454b5499287c00adbcde5f3541c5..bef11c3196293982a6d1ecb137b388b58bcc12ea 100644 (file)
@@ -16,6 +16,13 @@ xen_ia64_tlb_track_cnt       ?= n
 xen_ia64_tlbflush_clock        ?= y
 xen_ia64_disable_optvfault ?= n
 
+# If they are enabled,
+# shrink struct page_info assuming all mfn can be addressed by 32 bits.
+# However, with 50bit ia64 architected physical address and 16KB page size,
+# mfn isn't always addressable by 32 bits. So they are disabled by default.
+xen_ia64_shrink_page_list ?= n
+xen_ia64_pickle_domain ?= n
+
 # Used only by linux/Makefile.
 AFLAGS_KERNEL  += -mconstant-gp -nostdinc $(CPPFLAGS)
 
@@ -71,20 +78,11 @@ endif
 ifeq ($(xen_ia64_disable_optvfault),y)
 CFLAGS += -DCONFIG_XEN_IA64_DISABLE_OPTVFAULT
 endif
+ifeq ($(xen_ia64_shrink_page_list),y)
+CFLAGS += -DCONFIG_IA64_SHRINK_PAGE_LIST
+endif
+ifeq ($(xen_ia64_pickle_domain),y)
+CFLAGS += -DCONFIG_IA64_PICKLE_DOMAIN
+endif
 
-LDFLAGS := -g
-
-# Additionnal IA64 include dirs.
-HDRS += $(wildcard $(BASEDIR)/include/asm-ia64/linux-null/asm/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-ia64/linux-null/asm/sn/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-ia64/linux-null/linux/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-ia64/linux-xen/asm/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-ia64/linux-xen/asm/sn/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-ia64/linux-xen/linux/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-ia64/linux/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-ia64/linux/asm-generic/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-ia64/linux/asm/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-ia64/linux/byteorder/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-ia64/hvm/*.h)
-
-HDRS := $(filter-out %/include/asm-ia64/asm-xsi-offsets.h,$(HDRS))
+LDFLAGS = -g
index 97ad0bbcc57c3b27cd8c5280a96780abafd53ddd..7c15f07dfa4fc65e72d4f734cf52561c05edbb44 100644 (file)
@@ -76,11 +76,7 @@ unsigned int acpi_cpei_phys_cpuid;
 unsigned long acpi_wakeup_address = 0;
 
 #ifdef CONFIG_IA64_GENERIC
-#ifndef XEN
 static unsigned long __init acpi_find_rsdp(void)
-#else
-unsigned long __init acpi_find_rsdp(void)
-#endif
 {
        unsigned long rsdp_phys = 0;
 
@@ -797,6 +793,10 @@ int __init acpi_boot_init(void)
        if (acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt))
                printk(KERN_ERR PREFIX "Can't find FADT\n");
 
+#ifdef XEN
+       acpi_dmar_init();
+#endif
+
 #ifdef CONFIG_SMP
        if (available_cpus == 0) {
                printk(KERN_INFO "ACPI: Found 0 CPUS; assuming 1\n");
index 0c997f593a1a86ec8ef2ea72e1445fcb3c7d4076..2ef757b46d8414310292e4b92ae809cc9dbae236 100644 (file)
@@ -267,8 +267,13 @@ start_ap:
        /*
         * Switch into virtual mode:
         */
+#ifdef XEN
+               movl r16=(IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH|IA64_PSR_BN \
+                 |IA64_PSR_DI|IA64_PSR_AC)
+#else
        movl r16=(IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH|IA64_PSR_BN \
                  |IA64_PSR_DI)
+#endif
        ;;
        mov cr.ipsr=r16
        movl r17=1f
index b6065f741084d91e09c35261a6b779adeadcda49..6d187f84fb1227140998cfddcd01313e1acad302 100644 (file)
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
+#ifdef XEN
+static inline int iosapic_irq_to_vector (int irq)
+{
+       return irq;
+}
+
+#undef irq_to_vector
+#define irq_to_vector(irq)      iosapic_irq_to_vector(irq)
+#define AUTO_ASSIGN    AUTO_ASSIGN_IRQ
+#endif
 
 #undef DEBUG_INTERRUPT_ROUTING
 
index 754c6ea36719c1f52e00129dfc44ab5425e5abac..c20b021adf4c041dc126ac6ec5a14e0862887c12 100644 (file)
@@ -242,14 +242,11 @@ static struct irqaction ipi_irqaction = {
 };
 #endif
 
-#ifdef XEN
-extern void setup_vector (unsigned int vec, struct irqaction *action);
-#endif
-
 void
 register_percpu_irq (ia64_vector vec, struct irqaction *action)
 {
        irq_desc_t *desc;
+#ifndef XEN
        unsigned int irq;
 
        for (irq = 0; irq < NR_IRQS; ++irq)
@@ -258,28 +255,31 @@ register_percpu_irq (ia64_vector vec, struct irqaction *action)
                        desc->status |= IRQ_PER_CPU;
                        desc->handler = &irq_type_ia64_lsapic;
                        if (action)
-#ifdef XEN
-                               setup_vector(irq, action);
-#else
                                setup_irq(irq, action);
-#endif
                }
+#else
+       desc = irq_descp(vec);
+       desc->status |= IRQ_PER_CPU;
+       desc->handler = &irq_type_ia64_lsapic;
+       if (action)
+               setup_vector(vec, action);
+#endif
 }
 
 #ifdef XEN
-int request_irq(unsigned int irq,
+int request_irq_vector(unsigned int vector,
                void (*handler)(int, void *, struct cpu_user_regs *),
                unsigned long irqflags, const char * devname, void *dev_id)
 {
        struct irqaction * action;
-       int retval=0;
+       int retval;
 
        /*
         * Sanity-check: shared interrupts must pass in a real dev-ID,
         * otherwise we'll have trouble later trying to figure out
         * which interrupt is which (messes up the interrupt freeing logic etc).
         *                          */
-       if (irq >= NR_IRQS)
+       if (vector >= NR_VECTORS)
                return -EINVAL;
        if (!handler)
                return -EINVAL;
@@ -291,7 +291,8 @@ int request_irq(unsigned int irq,
        action->handler = handler;
        action->name = devname;
        action->dev_id = dev_id;
-       setup_vector(irq, action);
+
+       retval = setup_vector(vector, action);
        if (retval)
                xfree(action);
 
index 3abef0bf31cbe8396bbd214ece239505264a6971..8e9d9f74015ebf962463696fec1c46d009aefbd9 100644 (file)
@@ -112,10 +112,6 @@ unsigned long __per_cpu_mca[NR_CPUS];
 /* In mca_asm.S */
 extern void                    ia64_monarch_init_handler (void);
 extern void                    ia64_slave_init_handler (void);
-#ifdef XEN
-extern void setup_vector (unsigned int vec, struct irqaction *action);
-#define setup_irq(irq, action) setup_vector(irq, action)
-#endif
 
 static ia64_mc_info_t          ia64_mc_info;
 
@@ -210,6 +206,7 @@ static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES];
 #define IA64_LOG_COUNT(it)         ia64_state_log[it].isl_count
 
 #ifdef XEN
+sal_queue_entry_t sal_entry[NR_CPUS][IA64_MAX_LOG_TYPES];
 struct list_head *sal_queue, sal_log_queues[IA64_MAX_LOG_TYPES];
 sal_log_record_header_t *sal_record;
 DEFINE_SPINLOCK(sal_queue_lock);
@@ -358,6 +355,7 @@ ia64_log_queue(int sal_info_type, int virq)
 
        if (total_len) {
                int queue_type;
+               int cpuid = smp_processor_id();
 
                spin_lock_irqsave(&sal_queue_lock, flags);
 
@@ -366,15 +364,22 @@ ia64_log_queue(int sal_info_type, int virq)
                else
                        queue_type = sal_info_type;
 
-               e = xmalloc(sal_queue_entry_t);
-               BUG_ON(e == NULL);
-               e->cpuid = smp_processor_id();
+               /* Skip if sal_entry is already listed in sal_queue */
+               list_for_each_entry(e, &sal_queue[queue_type], list) {
+                       if (e == &sal_entry[cpuid][queue_type])
+                               goto found;
+               }
+               e = &sal_entry[cpuid][queue_type];
+               memset(e, 0, sizeof(sal_queue_entry_t));
+               e->cpuid = cpuid;
                e->sal_info_type = sal_info_type;
                e->vector = IA64_CMC_VECTOR;
                e->virq = virq;
                e->length = total_len;
 
                list_add_tail(&e->list, &sal_queue[queue_type]);
+
+       found:
                spin_unlock_irqrestore(&sal_queue_lock, flags);
 
                IA64_LOG_INDEX_INC(sal_info_type);
@@ -1917,17 +1922,25 @@ ia64_mca_late_init(void)
 
        {
                irq_desc_t *desc;
+#ifndef XEN
                unsigned int irq;
+#endif
 
                if (cpe_vector >= 0) {
                        /* If platform supports CPEI, enable the irq. */
                        cpe_poll_enabled = 0;
+#ifndef XEN
                        for (irq = 0; irq < NR_IRQS; ++irq)
                                if (irq_to_vector(irq) == cpe_vector) {
                                        desc = irq_descp(irq);
                                        desc->status |= IRQ_PER_CPU;
-                                       setup_irq(irq, &mca_cpe_irqaction);
+                                       setup_vector(irq, &mca_cpe_irqaction);
                                }
+#else
+                       desc = irq_descp(cpe_vector);
+                       desc->status |= IRQ_PER_CPU;
+                       setup_vector(cpe_vector, &mca_cpe_irqaction);
+#endif
                        ia64_mca_register_cpev(cpe_vector);
                        IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n", __FUNCTION__);
                } else {
index 00d90faff9f7823ae6cb3d3f225ec59ae1414928..12462dc711514b43930a198ff828fcdf0b2e2334 100644 (file)
@@ -190,7 +190,7 @@ per_cpu_allocate(void *xen_heap_start, unsigned long end_in_pa)
        unsigned long end = start + size;
 
        if (__pa(end) < end_in_pa) {
-               init_xenheap_pages(__pa(xen_heap_start), __pa(start));
+               init_boot_pages(__pa(xen_heap_start), __pa(start));
                xen_heap_start = (void*)end;
                percpu_area = (void*)virt_to_xenva(start);
                printk("allocate percpu area 0x%lx@0x%lx 0x%p\n",
index 5ebe45cf3836fe30f430a04a930ba449993db607..43786124f180ee95bb555f9d6455725decc2256d 100644 (file)
@@ -101,6 +101,20 @@ static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned;
 
 extern void cpu_halt (void);
 
+#ifdef XEN
+/* work around for spinlock irq check. */
+void
+lock_ipi_calllock(unsigned long *flags)
+{
+       spin_lock_irqsave(&call_lock, *flags);
+}
+
+void
+unlock_ipi_calllock(unsigned long flags)
+{
+       spin_unlock_irqrestore(&call_lock, flags);
+}
+#else
 void
 lock_ipi_calllock(void)
 {
@@ -112,6 +126,7 @@ unlock_ipi_calllock(void)
 {
        spin_unlock_irq(&call_lock);
 }
+#endif
 
 static void
 stop_this_cpu (void)
index 0cc4de2cd5d8b693069e2e8e3cca8b1bf5ec30ea..ef7f9ea2ff54fc0fe6bb64d5e535fae5d97adf34 100644 (file)
@@ -364,6 +364,10 @@ smp_setup_percpu_timer (void)
 static void __devinit
 smp_callin (void)
 {
+#ifdef XEN
+       /* work around for spinlock irq assert. */
+       unsigned long flags;
+#endif
        int cpuid, phys_id;
        extern void ia64_init_itm(void);
 
@@ -382,9 +386,17 @@ smp_callin (void)
 
        fix_b0_for_bsp();
 
+#ifdef XEN
+       lock_ipi_calllock(&flags);
+#else
        lock_ipi_calllock();
+#endif
        cpu_set(cpuid, cpu_online_map);
+#ifdef XEN
+       unlock_ipi_calllock(flags);
+#else
        unlock_ipi_calllock();
+#endif
        per_cpu(cpu_state, cpuid) = CPU_ONLINE;
 
        smp_setup_percpu_timer();
index acdc996b4ed9f2742a482774c1bb056d045c2461..8cd350f7a7ab41f32daaf2729120db8a7cf6fe73 100644 (file)
@@ -350,7 +350,11 @@ static void sn_irq_info_free(struct rcu_head *head)
 }
 #endif
 
+#ifdef XEN
+void sn_irq_fixup(struct sn_pci_dev *pci_dev, struct sn_irq_info *sn_irq_info)
+#else  
 void sn_irq_fixup(struct pci_dev *pci_dev, struct sn_irq_info *sn_irq_info)
+#endif
 {
        nasid_t nasid = sn_irq_info->irq_nasid;
        int slice = sn_irq_info->irq_slice;
@@ -377,7 +381,11 @@ void sn_irq_fixup(struct pci_dev *pci_dev, struct sn_irq_info *sn_irq_info)
        register_intr_pda(sn_irq_info);
 }
 
+#ifdef XEN
+void sn_irq_unfixup(struct sn_pci_dev *pci_dev)
+#else
 void sn_irq_unfixup(struct pci_dev *pci_dev)
+#endif
 {
 #ifndef XEN
        struct sn_irq_info *sn_irq_info;
index 526d8c087f4c61206713faaf9619c1a4cffcb562..8c3c8b373c8407fd7bc9e5091dbb50a2ddb4fd40 100644 (file)
@@ -148,7 +148,7 @@ sioemu_set_callback (struct vcpu *v, unsigned long cb_ip, unsigned long paddr)
     pte = *lookup_noalloc_domain_pte(v->domain, paddr);
     if (!pte_present(pte) || !pte_mem(pte))
         return -EINVAL;
-    mfn = (pte_val(pte) & _PFN_MASK) >> PAGE_SHIFT;
+    mfn = pte_pfn(pte);
     ASSERT(mfn_valid(mfn));
 
     page = mfn_to_page(mfn);
index a3cf1865fc49a6c9a04597dd27e6c2b2a2da5a0f..8922fbe2dc3b242134b88a3a55773f3248116161 100644 (file)
@@ -46,9 +46,9 @@
 
 static void viosapic_deliver(struct viosapic *viosapic, int irq)
 {
-    uint16_t dest = viosapic->redirtbl[irq].dest_id;
-    uint8_t delivery_mode = viosapic->redirtbl[irq].delivery_mode;
-    uint8_t vector = viosapic->redirtbl[irq].vector;
+    uint16_t dest = viosapic->redirtbl[irq].fields.dest_id;
+    uint8_t delivery_mode = viosapic->redirtbl[irq].fields.delivery_mode;
+    uint8_t vector = viosapic->redirtbl[irq].fields.vector;
 
     ASSERT(spin_is_locked(&viosapic->lock));
 
@@ -78,7 +78,7 @@ static int get_redir_num(struct viosapic *viosapic, int vector)
 
     ASSERT(spin_is_locked(&viosapic->lock));
     for ( i = 0; i < VIOSAPIC_NUM_PINS; i++ )
-        if ( viosapic->redirtbl[i].vector == vector )
+        if ( viosapic->redirtbl[i].fields.vector == vector )
             return i;
 
     return -1;
@@ -91,7 +91,7 @@ static void service_iosapic(struct viosapic *viosapic)
 
     while ( (irq = iosapic_get_highest_irq(viosapic)) != -1 )
     {
-        if ( viosapic->redirtbl[irq].trig_mode == SAPIC_LEVEL )
+        if ( viosapic->redirtbl[irq].fields.trig_mode == SAPIC_LEVEL )
             viosapic->isr |= (1UL << irq);
 
         viosapic_deliver(viosapic, irq);
@@ -116,11 +116,18 @@ static void viosapic_update_EOI(struct viosapic *viosapic, int vector)
     if ( !test_and_clear_bit(redir_num, &viosapic->isr) )
     {
         spin_unlock(&viosapic->lock);
-        if ( viosapic->redirtbl[redir_num].trig_mode == SAPIC_LEVEL )
+        if ( viosapic->redirtbl[redir_num].fields.trig_mode == SAPIC_LEVEL )
             gdprintk(XENLOG_WARNING, "redir %d not set for %d EOI\n",
                      redir_num, vector);
         return;
     }
+    if ( iommu_enabled )
+    {
+        spin_unlock(&viosapic->lock);
+        hvm_dpci_eoi(current->domain, redir_num, &viosapic->redirtbl[redir_num]);
+        spin_lock(&viosapic->lock);
+    }
+
     service_iosapic(viosapic);
     spin_unlock(&viosapic->lock);
 }
@@ -278,7 +285,7 @@ static void viosapic_reset(struct viosapic *viosapic)
 
     for ( i = 0; i < VIOSAPIC_NUM_PINS; i++ )
     {
-        viosapic->redirtbl[i].mask = 0x1;
+        viosapic->redirtbl[i].fields.mask = 0x1;
     }
     spin_lock_init(&viosapic->lock);
 }
@@ -292,11 +299,11 @@ void viosapic_set_irq(struct domain *d, int irq, int level)
     if ( (irq < 0) || (irq >= VIOSAPIC_NUM_PINS) )
         goto out;
 
-    if ( viosapic->redirtbl[irq].mask )
+    if ( viosapic->redirtbl[irq].fields.mask )
         goto out;
 
     bit = 1UL << irq;
-    if ( viosapic->redirtbl[irq].trig_mode == SAPIC_LEVEL )
+    if ( viosapic->redirtbl[irq].fields.trig_mode == SAPIC_LEVEL )
     {
         if ( level )
             viosapic->irr |= bit;
@@ -315,10 +322,6 @@ out:
     spin_unlock(&viosapic->lock);
 }
 
-#define hvm_pci_intx_gsi(dev, intx)  \
-    (((((dev) << 2) + ((dev) >> 3) + (intx)) & 31) + 16)
-        
-
 void viosapic_set_pci_irq(struct domain *d, int device, int intx, int level)
 {
     int irq;
index 4e1497514b0782174bc32696269f15674f6ab3cd..29f73be647cb5f1466e74b884d1547356411be7d 100644 (file)
@@ -312,7 +312,7 @@ IA64FAULT vmx_vcpu_itr_d(VCPU *vcpu, u64 slot, u64 pte, u64 itir, u64 ifa)
     */   
     if (ps != _PAGE_SIZE_16M)
         thash_purge_entries(vcpu, va, ps);
-    gpfn = (pte & _PAGE_PPN_MASK)>> PAGE_SHIFT;
+    gpfn = pte_pfn(__pte(pte));
     vcpu_get_rr(vcpu, va, &rid);
     rid &= RR_RID_MASK;
     p_dtr = (thash_data_t *)&vcpu->arch.dtrs[slot];
@@ -446,7 +446,7 @@ IA64FAULT vmx_vcpu_ptc_ga(VCPU *vcpu, u64 va, u64 ps)
         do {
             cpu = v->processor;
             if (cpu != current->processor) {
-                spin_unlock_wait(&per_cpu(schedule_data, cpu).schedule_lock);
+                spin_barrier(&per_cpu(schedule_data, cpu).schedule_lock);
                 /* Flush VHPT on remote processors. */
                 smp_call_function_single(cpu, &ptc_ga_remote_func,
                                          &args, 0, 1);
index 5a6ed0a163685a72a2a57cb1912ec329f42f39a0..53d0693876659a052f61bda5fab6686068947bda 100644 (file)
@@ -54,6 +54,7 @@
 #include <asm/shadow.h>
 #include <asm/sioemu.h>
 #include <public/arch-ia64/sioemu.h>
+#include <xen/hvm/irq.h>
 
 /* reset all PSR field to 0, except up,mfl,mfh,pk,dt,rt,mc,it */
 #define INITIAL_PSR_VALUE_AT_INTERRUPTION 0x0000001808028034
@@ -73,7 +74,14 @@ static const u16 vec2off[68] = {0x0,0x400,0x800,0xc00,0x1000,0x1400,0x1800,
     0x7f00
 };
 
-
+void vmx_lazy_load_fpu(struct vcpu *vcpu)
+{
+    if (FP_PSR(vcpu) & IA64_PSR_DFH) {
+        FP_PSR(vcpu) = IA64_PSR_MFH;
+        if (__ia64_per_cpu_var(fp_owner) != vcpu)
+            __ia64_load_fpu(vcpu->arch._thread.fph);
+    }
+}
 
 void vmx_reflect_interruption(u64 ifa, u64 isr, u64 iim,
                               u64 vec, REGS *regs)
@@ -97,11 +105,7 @@ void vmx_reflect_interruption(u64 ifa, u64 isr, u64 iim,
     case 25:   // IA64_DISABLED_FPREG_VECTOR
         if (!(vpsr & IA64_PSR_IC))
             goto nested_fault;
-        if (FP_PSR(vcpu) & IA64_PSR_DFH) {
-            FP_PSR(vcpu) = IA64_PSR_MFH;
-            if (__ia64_per_cpu_var(fp_owner) != vcpu)
-                __ia64_load_fpu(vcpu->arch._thread.fph);
-        }
+        vmx_lazy_load_fpu(vcpu);
         if (!(VCPU(vcpu, vpsr) & IA64_PSR_DFH)) {
             regs->cr_ipsr &= ~IA64_PSR_DFH;
             return;
@@ -118,8 +122,7 @@ void vmx_reflect_interruption(u64 ifa, u64 isr, u64 iim,
         if (!status) {
             vcpu_increment_iip(vcpu);
             return;
-        } else if (IA64_RETRY == status)
-            return;
+        }
         break;
 
     case 33:   // IA64_FP_TRAP_VECTOR
@@ -129,10 +132,6 @@ void vmx_reflect_interruption(u64 ifa, u64 isr, u64 iim,
         status = handle_fpu_swa(0, regs, isr);
         if (!status)
             return;
-        else if (IA64_RETRY == status) {
-            vcpu_decrement_iip(vcpu);
-            return;
-        }
         break;
 
     case 29: // IA64_DEBUG_VECTOR
@@ -306,6 +305,7 @@ void leave_hypervisor_tail(void)
                 viosapic_set_irq(d, callback_irq, 0);
             }
         }
+        hvm_dirq_assist(v);
     }
 
     rmb();
@@ -374,7 +374,7 @@ vmx_hpw_miss(u64 vadr, u64 vec, REGS* regs)
                 pte = lookup_domain_mpa(v->domain, pa_clear_uc(vadr), NULL);
                 if (v->domain != dom0 && (pte & _PAGE_IO)) {
                     emulate_io_inst(v, pa_clear_uc(vadr), 4,
-                                    (pte & _PFN_MASK) >> PAGE_SHIFT);
+                                    pte_pfn(__pte(pte)));
                     return IA64_FAULT;
                 }
                 physical_tlb_miss(v, vadr, type);
@@ -411,7 +411,7 @@ try_again:
                                  " pte=0x%lx\n", data->page_flags);
                 if (data->pl >= ((regs->cr_ipsr >> IA64_PSR_CPL0_BIT) & 3))
                     emulate_io_inst(v, gppa, data->ma, 
-                                    (pte & _PFN_MASK) >> PAGE_SHIFT);
+                                    pte_pfn(__pte(pte)));
                 else {
                     vcpu_set_isr(v, misr.val);
                     data_access_rights(v, vadr);
index 00c23ff168100628f2c5462f045e40ae24d0e96e..cc898a446c4e4fb5d36e33598a5c5042a4bcfbab 100644 (file)
@@ -146,7 +146,7 @@ vmx_init_env(void *start, unsigned long end_in_pa)
                        VM_BUFFER_ALIGN_UP((unsigned long)start);
                unsigned long e_vm_buffer = s_vm_buffer + buffer_size;
                if (__pa(e_vm_buffer) < end_in_pa) {
-                       init_xenheap_pages(__pa(start), __pa(s_vm_buffer));
+                       init_boot_pages(__pa(start), __pa(s_vm_buffer));
                        start = (void*)e_vm_buffer;
                        vm_buffer = virt_to_xenva(s_vm_buffer);
                        printk("vm_buffer: 0x%lx\n", vm_buffer);
@@ -457,7 +457,7 @@ int vmx_set_ioreq_page(
        pte = *lookup_noalloc_domain_pte(d, gpfn << PAGE_SHIFT);
        if (!pte_present(pte) || !pte_mem(pte))
                return -EINVAL;
-       mfn = (pte_val(pte) & _PFN_MASK) >> PAGE_SHIFT;
+       mfn = pte_pfn(pte);
        ASSERT(mfn_valid(mfn));
 
        page = mfn_to_page(mfn);
index 50e19f0872957152d6d4211a1722ed0dcdff1a7e..aa3b6296613bfd01595936cdb6778af293e5dd51 100644 (file)
@@ -112,3 +112,56 @@ inject_guest_interruption(VCPU *vcpu, u64 vec)
     debugger_event(vec == IA64_EXTINT_VECTOR ?
                    XEN_IA64_DEBUG_ON_EXTINT : XEN_IA64_DEBUG_ON_EXCEPT);
 }
+
+void hvm_pci_intx_assert(
+        struct domain *d, unsigned int device, unsigned int intx)
+{
+    struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
+    unsigned int gsi;
+
+    ASSERT((device <= 31) && (intx <= 3));
+
+    if ( __test_and_set_bit(device * 4 + intx, &hvm_irq->pci_intx.i) )
+        return;
+    gsi = hvm_pci_intx_gsi(device, intx);
+    if ( ++hvm_irq->gsi_assert_count[gsi] == 1 )
+        viosapic_set_irq(d, gsi, 1);
+}
+
+void hvm_pci_intx_deassert(
+        struct domain *d, unsigned int device, unsigned int intx)
+{
+    struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
+    unsigned int gsi;
+
+    ASSERT((device <= 31) && (intx <= 3));
+
+    if ( !__test_and_clear_bit(device * 4 + intx, &hvm_irq->pci_intx.i) )
+        return;
+
+    gsi = hvm_pci_intx_gsi(device, intx);
+
+    if (--hvm_irq->gsi_assert_count[gsi] == 0)
+        viosapic_set_irq(d, gsi, 0);
+}
+
+void hvm_isa_irq_assert(struct domain *d, unsigned int isa_irq)
+{
+    /* dummy */
+}
+
+void hvm_isa_irq_deassert(struct domain *d, unsigned int isa_irq)
+{
+    /* dummy */
+}
+
+int msixtbl_pt_register(struct domain *d, int pirq, uint64_t gtable)
+{
+    /* dummy */
+    return -ENOSYS;
+}
+
+void msixtbl_pt_unregister(struct domain *d, int pirq)
+{
+    /* dummy */
+}
index 65a6b39802a3417de7c64cb527e3effbb095ab44..20ff4022167c2f4f5b95c2edcc8bd5728597c11f 100644 (file)
@@ -314,7 +314,7 @@ vmx_alt_itlb_miss_vmm:
     movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
     ;;
     and r19=r19,r16     // clear ed, reserved bits, and PTE control bits
-    extr.u r18=r16,XEN_VIRT_UC_BIT, 15    // extract UC bit
+    extr.u r18=r16,XEN_VIRT_UC_BIT, 1     // extract UC bit
     ;;
     or r19=r17,r19      // insert PTE control bits into r19
     mov r20=IA64_GRANULE_SHIFT<<2
@@ -343,7 +343,7 @@ END(vmx_alt_itlb_miss)
 // 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
 ENTRY(vmx_alt_dtlb_miss)
     VMX_DBG_FAULT(4)
-    mov r29=cr.ipsr
+    mov r29=cr.ipsr    // frametable_miss needs ipsr saved in r29.
     mov r31=pr
     adds r22=IA64_VCPU_MMU_MODE_OFFSET, r21
     ;;
@@ -356,7 +356,7 @@ vmx_alt_dtlb_miss_vmm:
     // Test for the address of virtual frame_table
     shr r22=r16,56;;
     cmp.eq p8,p0=((VIRT_FRAME_TABLE_ADDR>>56)&0xff)-0x100,r22
-(p8)br.cond.sptk frametable_miss ;;
+(p8)br.cond.sptk frametable_miss ;; //Make sure ipsr is saved in r29
 #endif
     movl r17=PAGE_KERNEL
     mov r20=cr.isr
index da8f877764c7ad9a070031e3dd0fbcdb08a35a5e..74b0da7f41e09ebc380f7df501de398beaa12bf7 100644 (file)
@@ -522,7 +522,7 @@ static u64 translate_phy_pte(VCPU *v, u64 pte, u64 itir, u64 va)
      * which is required by vga acceleration since qemu maps shared
      * vram buffer with WB.
      */
-    if (phy_pte.ma != VA_MATTR_NATPAGE)
+    if (mfn_valid(pte_pfn(__pte(maddr))) && phy_pte.ma != VA_MATTR_NATPAGE)
         phy_pte.ma = VA_MATTR_WB;
 
     maddr = ((maddr & _PAGE_PPN_MASK) & PAGE_MASK) | (paddr & ~PAGE_MASK);
@@ -678,11 +678,20 @@ thash_data_t *vtlb_lookup(VCPU *v, u64 va,int is_data)
         cch = vtlb_thash(hcb->pta, va, vrr.rrval, &tag);
         do {
             if (cch->etag == tag && cch->ps == ps)
-                return cch;
+                goto found;
             cch = cch->next;
         } while(cch);
     }
     return NULL;
+found:
+    if (unlikely(!cch->ed && is_data == ISIDE_TLB)) {
+        /*The case is very rare, and it may lead to incorrect setting
+          for itlb's ed bit! Purge it from hash vTLB and let guest os
+          determine the ed bit of the itlb entry. */
+        vtlb_purge(v, va, ps);
+        cch = NULL;
+    }
+    return cch;
 }
 
 
index b26f8550cf2f0e6ee705e162c7c4ee85799f555d..fdd688810823a75a189893e7590c8fa1c7251a0c 100644 (file)
@@ -154,6 +154,7 @@ processor_set_freq (struct acpi_cpufreq_data *data,
        cpufreq_statistic_update(cpu, data->acpi_data->state, state);
 
        data->acpi_data->state = state;
+       policy->cur = data->freq_table[state].frequency;
 
        return 0;
 }
@@ -209,21 +210,6 @@ acpi_cpufreq_cpu_init (struct cpufreq_policy *policy)
 
        data->acpi_data = &processor_pminfo[cpu]->perf;
 
-       /* capability check */
-       if (data->acpi_data->state_count <= 1) {
-               printk(KERN_WARNING "P-States\n");
-               result = -ENODEV;
-               goto err_unreg;
-       }
-
-       if ((data->acpi_data->control_register.space_id !=
-                               ACPI_ADR_SPACE_FIXED_HARDWARE) ||
-                       (data->acpi_data->status_register.space_id !=
-                        ACPI_ADR_SPACE_FIXED_HARDWARE)) {
-               result = -ENODEV;
-               goto err_unreg;
-       }
-
        data->freq_table = xmalloc_array(struct cpufreq_frequency_table,
                        (data->acpi_data->state_count + 1));
        if (!data->freq_table) {
@@ -240,7 +226,8 @@ acpi_cpufreq_cpu_init (struct cpufreq_policy *policy)
                                data->acpi_data->states[i].transition_latency * 1000;
                }
        }
-       policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
+
+       policy->governor = cpufreq_opt_governor ? : CPUFREQ_DEFAULT_GOVERNOR;
 
        policy->cur = acpi_cpufreq_get(policy->cpu);
        printk(KERN_INFO "Current freq of CPU %u is %u\n", cpu, policy->cur);
@@ -289,6 +276,7 @@ acpi_cpufreq_cpu_exit (struct cpufreq_policy *policy)
 }
 
 static struct cpufreq_driver acpi_cpufreq_driver = {
+       .name       = "acpi-cpufreq",
        .verify     = acpi_cpufreq_verify,
        .target     = acpi_cpufreq_target,
        .get        = acpi_cpufreq_get,
index b21cb42e70cdbfef9479d122654b5c82466ce382..c2c1ea96e8a0577a5d612833d9c16b8551f0b050 100644 (file)
@@ -18,6 +18,7 @@
 #include <xen/trace.h>
 #include <xen/console.h>
 #include <xen/guest_access.h>
+#include <xen/pci.h>
 #include <asm/vmx.h>
 #include <asm/dom_fw.h>
 #include <asm/vhpt.h>
@@ -203,7 +204,7 @@ long arch_do_domctl(xen_domctl_t *op, XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
             ret = 0;
         else {
             if (op->u.ioport_permission.allow_access)
-                ret = ioports_permit_access(d, fp, lp);
+                ret = ioports_permit_access(d, fp, fp, lp);
             else
                 ret = ioports_deny_access(d, fp, lp);
         }
@@ -256,6 +257,266 @@ long arch_do_domctl(xen_domctl_t *op, XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
     }
     break;
 
+    case XEN_DOMCTL_get_device_group:
+    {
+        struct domain *d;
+        u32 max_sdevs;
+        u8 bus, devfn;
+        XEN_GUEST_HANDLE_64(uint32) sdevs;
+        int num_sdevs;
+
+        ret = -ENOSYS;
+        if ( !iommu_enabled )
+            break;
+
+        ret = -EINVAL;
+        if ( (d = rcu_lock_domain_by_id(op->domain)) == NULL )
+            break;
+
+        bus = (op->u.get_device_group.machine_bdf >> 16) & 0xff;
+        devfn = (op->u.get_device_group.machine_bdf >> 8) & 0xff;
+        max_sdevs = op->u.get_device_group.max_sdevs;
+        sdevs = op->u.get_device_group.sdev_array;
+
+        num_sdevs = iommu_get_device_group(d, bus, devfn, sdevs, max_sdevs);
+        if ( num_sdevs < 0 )
+        {
+            dprintk(XENLOG_ERR, "iommu_get_device_group() failed!\n");
+            ret = -EFAULT;
+            op->u.get_device_group.num_sdevs = 0;
+        }
+        else
+        {
+            ret = 0;
+            op->u.get_device_group.num_sdevs = num_sdevs;
+        }
+        if ( copy_to_guest(u_domctl, op, 1) )
+            ret = -EFAULT;
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_test_assign_device:
+    {
+        u8 bus, devfn;
+
+        ret = -ENOSYS;
+        if ( !iommu_enabled )
+            break;
+
+        ret = -EINVAL;
+        bus = (op->u.assign_device.machine_bdf >> 16) & 0xff;
+        devfn = (op->u.assign_device.machine_bdf >> 8) & 0xff;
+
+        if ( device_assigned(bus, devfn) )
+        {
+            printk( "XEN_DOMCTL_test_assign_device: "
+                     "%x:%x:%x already assigned, or non-existent\n",
+                     bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+            break;
+        }
+        ret = 0;
+    }
+    break;
+
+    case XEN_DOMCTL_assign_device:
+    {
+        struct domain *d;
+        u8 bus, devfn;
+
+        ret = -ENOSYS;
+        if ( !iommu_enabled )
+            break;
+
+        ret = -EINVAL;
+        if ( unlikely((d = get_domain_by_id(op->domain)) == NULL) )
+        {
+            gdprintk(XENLOG_ERR,
+                "XEN_DOMCTL_assign_device: get_domain_by_id() failed\n");
+            break;
+        }
+        bus = (op->u.assign_device.machine_bdf >> 16) & 0xff;
+        devfn = (op->u.assign_device.machine_bdf >> 8) & 0xff;
+
+        if ( !iommu_pv_enabled && !is_hvm_domain(d) )
+        {
+            ret = -ENOSYS;
+            break;
+        }
+
+        if ( device_assigned(bus, devfn) )
+        {
+            gdprintk(XENLOG_ERR, "XEN_DOMCTL_assign_device: "
+                     "%x:%x:%x already assigned, or non-existent\n",
+                     bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+            break;
+        }
+
+        ret = assign_device(d, bus, devfn);
+        gdprintk(XENLOG_INFO, "XEN_DOMCTL_assign_device: bdf = %x:%x:%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        put_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_deassign_device:
+    {
+        struct domain *d;
+        u8 bus, devfn;
+
+        ret = -ENOSYS;
+        if ( !iommu_enabled )
+            break;
+
+        ret = -EINVAL;
+        if ( unlikely((d = get_domain_by_id(op->domain)) == NULL) )
+        {
+            gdprintk(XENLOG_ERR,
+                "XEN_DOMCTL_deassign_device: get_domain_by_id() failed\n");
+            break;
+        }
+        bus = (op->u.assign_device.machine_bdf >> 16) & 0xff;
+        devfn = (op->u.assign_device.machine_bdf >> 8) & 0xff;
+
+        if ( !iommu_pv_enabled && !is_hvm_domain(d) )
+        {
+            ret = -ENOSYS;
+            break;
+        }
+
+        if ( !device_assigned(bus, devfn) )
+            break;
+
+        ret = 0;
+        deassign_device(d, bus, devfn);
+        gdprintk(XENLOG_INFO, "XEN_DOMCTL_deassign_device: bdf = %x:%x:%x\n",
+            bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        put_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_bind_pt_irq:
+    {
+        struct domain * d;
+        xen_domctl_bind_pt_irq_t * bind;
+
+        ret = -ESRCH;
+        if ( (d = rcu_lock_domain_by_id(op->domain)) == NULL )
+            break;
+        bind = &(op->u.bind_pt_irq);
+        if ( iommu_enabled )
+            ret = pt_irq_create_bind_vtd(d, bind);
+        if ( ret < 0 )
+            gdprintk(XENLOG_ERR, "pt_irq_create_bind failed!\n");
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_unbind_pt_irq:
+    {
+        struct domain * d;
+        xen_domctl_bind_pt_irq_t * bind;
+
+        ret = -ESRCH;
+        if ( (d = rcu_lock_domain_by_id(op->domain)) == NULL )
+            break;
+        bind = &(op->u.bind_pt_irq);
+        if ( iommu_enabled )
+            ret = pt_irq_destroy_bind_vtd(d, bind);
+        if ( ret < 0 )
+            gdprintk(XENLOG_ERR, "pt_irq_destroy_bind failed!\n");
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_memory_mapping:
+    {
+        struct domain *d;
+        unsigned long gfn = op->u.memory_mapping.first_gfn;
+        unsigned long mfn = op->u.memory_mapping.first_mfn;
+        unsigned long nr_mfns = op->u.memory_mapping.nr_mfns;
+        int i;
+
+        ret = -EINVAL;
+        if ( (mfn + nr_mfns - 1) < mfn ) /* wrap? */
+            break;
+
+        ret = -ESRCH;
+        if ( unlikely((d = rcu_lock_domain_by_id(op->domain)) == NULL) )
+            break;
+
+        ret=0;
+        if ( op->u.memory_mapping.add_mapping )
+        {
+            gdprintk(XENLOG_INFO,
+                "memory_map:add: gfn=%lx mfn=%lx nr_mfns=%lx\n",
+                gfn, mfn, nr_mfns);
+
+            ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1);
+            for ( i = 0; i < nr_mfns; i++ )
+                assign_domain_mmio_page(d, (gfn+i)<<PAGE_SHIFT,
+                           (mfn+i)<<PAGE_SHIFT, PAGE_SIZE,
+                           ASSIGN_writable | ASSIGN_nocache);
+        }
+        else
+        {
+            gdprintk(XENLOG_INFO,
+                "memory_map:remove: gfn=%lx mfn=%lx nr_mfns=%lx\n",
+                 gfn, mfn, nr_mfns);
+
+            for ( i = 0; i < nr_mfns; i++ )
+                deassign_domain_mmio_page(d, (gfn+i)<<PAGE_SHIFT,
+                        (mfn+i)<<PAGE_SHIFT, PAGE_SIZE);
+            ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1);
+        }
+
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_ioport_mapping:
+    {
+
+#define MAX_IOPORTS    0x10000
+        struct domain *d;
+        unsigned int fgp = op->u.ioport_mapping.first_gport;
+        unsigned int fmp = op->u.ioport_mapping.first_mport;
+        unsigned int np = op->u.ioport_mapping.nr_ports;
+
+        ret = -EINVAL;
+        if ( (np == 0) || (fgp > MAX_IOPORTS) || (fmp > MAX_IOPORTS) ||
+            ((fgp + np) > MAX_IOPORTS) || ((fmp + np) > MAX_IOPORTS) )
+        {
+            gdprintk(XENLOG_ERR,
+                "ioport_map:invalid:gport=%x mport=%x nr_ports=%x\n",
+                fgp, fmp, np);
+            break;
+        }
+
+        ret = -ESRCH;
+        if ( unlikely((d = rcu_lock_domain_by_id(op->domain)) == NULL) )
+            break;
+
+        if ( op->u.ioport_mapping.add_mapping )
+        {
+            gdprintk(XENLOG_INFO,
+                    "ioport_map:add f_gport=%x f_mport=%x np=%x\n",
+                    fgp, fmp, np);
+
+            ret = ioports_permit_access(d, fgp, fmp, fmp + np - 1);
+        }
+        else
+        {
+            gdprintk(XENLOG_INFO,
+                    "ioport_map:remove f_gport=%x f_mport=%x np=%x\n",
+                    fgp, fmp, np);
+
+            ret = ioports_deny_access(d,  fgp, fgp + np - 1);
+        }
+        rcu_unlock_domain(d);
+    }
+    break;
+
     case XEN_DOMCTL_sethvmcontext:
     { 
         struct hvm_domain_context c;
@@ -388,9 +649,38 @@ long arch_do_domctl(xen_domctl_t *op, XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
     }
     break;
 
-    case XEN_DOMCTL_assign_device:
-        ret = -ENOSYS;
-        break;
+    case XEN_DOMCTL_set_address_size:
+    {
+        struct domain *d = rcu_lock_domain_by_id(op->domain);
+
+        ret = -ESRCH;
+        if (d == NULL)
+            break;
+
+        ret = -EINVAL;
+        if (op->u.address_size.size == BITS_PER_LONG)
+            ret = 0;
+
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_get_address_size:
+    {
+        struct domain *d = rcu_lock_domain_by_id(op->domain);
+
+        ret = -ESRCH;
+        if (d  == NULL)
+            break;
+
+        ret = 0;
+        op->u.address_size.size = BITS_PER_LONG;
+        rcu_unlock_domain(d);
+
+        if (copy_to_guest(u_domctl, op, 1))
+            ret = -EFAULT;
+    }
+    break;
 
     default:
         printk("arch_do_domctl: unrecognized domctl: %d!!!\n",op->cmd);
@@ -522,7 +812,7 @@ dom0vp_add_io_space(struct domain *d, unsigned long phys_base,
     fp = space_number << IO_SPACE_BITS;
     lp = fp | 0xffff;
 
-    return ioports_permit_access(d, fp, lp);
+    return ioports_permit_access(d, fp, fp, lp);
 }
 
 unsigned long
@@ -543,7 +833,7 @@ do_dom0vp_op(unsigned long cmd,
             dprintk(XENLOG_INFO, "%s: INVALID_MFN ret: 0x%lx\n",
                      __func__, ret);
         } else {
-            ret = (ret & _PFN_MASK) >> PAGE_SHIFT;//XXX pte_pfn()
+            ret = pte_pfn(__pte(ret));
         }
         perfc_incr(dom0vp_phystomach);
         break;
index 9a89c168b509dfa0e1b8dd8d1d1241f0f7949872..560f94e6216f523794bf6d30480c0717298c449b 100644 (file)
@@ -1,3 +1,5 @@
+#include <asm/dom_fw.h>
+
 // moved from xenasm.S to be shared by xen and libxc
 /*
  * Assembly support routines for Xen/ia64
 xen_ia64_pal_call_stub:
        {
         .mii
-       addl r2=0x1000,r0       //  Hypercall number (Value is patched).
+       addl r2=FW_HYPERCALL_PAL_CALL_ASM,r0    //  Hypercall number (Value is patched).
        mov r9=256
        ;; 
        cmp.gtu p7,p8=r9,r28            /* r32 <= 255? */
        }
        {
         .mbb
-       break 0x1000            //  Hypercall vector (Value is patched).
+       break __IA64_XEN_HYPERCALL_DEFAULT      //  Hypercall vector (Value is patched).
 (p7)   br.cond.sptk.few rp
 (p8)   br.ret.sptk.few rp
        }
index d7929f0dff3b264e5985d5512263a4609375490f..01adf4d110e39f6fcbbcead4b50a780931f1d0b3 100644 (file)
@@ -142,6 +142,117 @@ build_pal_hypercall_bundles(uint64_t *imva, uint64_t brkimm, uint64_t hypnum)
        ia64_fc(imva + 3);
 }
 
+/* xen fpswa call stub. 14 bundles */
+extern const unsigned long xen_ia64_fpswa_call_stub[];
+extern const unsigned long xen_ia64_fpswa_call_stub_end[];
+extern const unsigned long xen_ia64_fpswa_call_stub_patch[];
+asm(
+       ".align 32\n"
+       ".proc xen_ia64_fpswa_call_stub;\n"
+       "xen_ia64_fpswa_call_stub:\n"
+       ".prologue\n"
+       "alloc r3 = ar.pfs, 8, 0, 0, 0\n"
+       ".body\n"
+       "mov r14 = in0\n"
+       "ld8 r15 = [in1], 8\n"
+       ";;\n"
+       "ld8 r16 = [in1]\n"
+       "ld8 r17 = [in2]\n"
+       "ld8 r18 = [in3]\n"
+       "ld8 r19 = [in4]\n"
+       "ld8 r20 = [in5]\n"
+       "ld8 r21 = [in6]\n"
+       "ld8 r22 = [in7], 8\n"
+       ";;\n"
+       "ld8 r23 = [in7], 8\n"
+       ";;\n"
+       "ld8 r24 = [in7], 8\n"
+       ";;\n"
+       "cmp.ne p6, p0 = r24, r0\n"
+       "ld8 r25 = [in7], 8\n"
+       ";;\n"
+       "(p6) tpa r24 = r24\n"
+       "cmp.ne p7, p0 = r25, r0\n"
+       "ld8 r26 = [in7], 8\n"
+       ";;\n"
+       "(p7)tpa r25 = r25\n"
+       "cmp.ne p8, p0 = r26, r0\n"
+       "ld8 r27 = [in7], 8\n"
+       ";;\n"
+       "(p8)tpa r26 = r26\n"
+       "cmp.ne p9, p0 = r27, r0\n"
+       ";;\n"
+       "tpa r27 = r27\n"
+       "xen_ia64_fpswa_call_stub_patch:"
+       "{\n"
+       "mov r2 = " FW_HYPERCALL_FPSWA_STR "\n"
+       "break " __IA64_XEN_HYPERCALL_DEFAULT_STR "\n"
+       "nop.i 0\n"
+       "}\n"
+       "st8 [in2] = r17\n"
+       "st8 [in3] = r18\n"
+       "st8 [in4] = r19\n"
+       "st8 [in5] = r20\n"
+       "st8 [in6] = r21\n"
+       "br.ret.sptk.many rp\n"
+       "xen_ia64_fpswa_call_stub_end:"
+       ".endp xen_ia64_fpswa_call_stub\n"
+);
+
+static void
+build_fpswa_hypercall_bundle(uint64_t *imva, uint64_t brkimm, uint64_t hypnum)
+{
+       INST64_A5 slot0;
+       INST64_I19 slot1;
+       INST64_I18 slot2;
+       IA64_BUNDLE bundle;
+
+       /* slot0: mov r2 = hypnum (low 20 bits) */
+       slot0.inst = 0;
+       slot0.qp = 0;
+       slot0.r1 = 2;
+       slot0.r3 = 0;
+       slot0.major = 0x9;
+
+       slot0.s = 0;
+       slot0.imm9d = hypnum >> 7;
+       slot0.imm5c = hypnum >> 16;
+       slot0.imm7b = hypnum;
+
+       /* slot1: break brkimm */
+       slot1.inst = 0;
+       slot1.qp = 0;
+       slot1.x6 = 0;
+       slot1.x3 = 0;
+       slot1.major = 0x0;
+       slot1.i = brkimm >> 20;
+       slot1.imm20 = brkimm;
+
+       /* slot2: nop.i */
+       slot2.inst = 0;
+       slot2.qp = 0;
+       slot2.imm20 = 0;
+       slot2.y = 0;
+       slot2.x6 = 1;
+       slot2.x3 = 0;
+       slot2.i = 0;
+       slot2.major = 0;
+
+       /* MII bundle */
+       bundle.i64[0] = 0;
+       bundle.i64[1] = 0;
+       bundle.template = 0x0; /* MII */
+       bundle.slot0 = slot0.inst;
+       bundle.slot1a = slot1.inst;
+       bundle.slot1b = slot1.inst >> 18;
+       bundle.slot2 = slot2.inst;
+       
+       imva[0] = bundle.i64[0];
+       imva[1] = bundle.i64[1];
+       ia64_fc(imva);
+       ia64_fc(imva + 1);
+}
+
 // builds a hypercall bundle at domain physical address
 static void
 dom_fpswa_hypercall_patch(uint64_t brkimm, unsigned long imva)
@@ -149,6 +260,10 @@ dom_fpswa_hypercall_patch(uint64_t brkimm, unsigned long imva)
        unsigned long *entry_imva, *patch_imva;
        const unsigned long entry_paddr = FW_HYPERCALL_FPSWA_ENTRY_PADDR;
        const unsigned long patch_paddr = FW_HYPERCALL_FPSWA_PATCH_PADDR;
+       const size_t stub_size =
+               (char*)xen_ia64_fpswa_call_stub_end -
+               (char*)xen_ia64_fpswa_call_stub;
+       size_t i;
 
        entry_imva = (unsigned long *)(imva + entry_paddr -
                                       FW_HYPERCALL_BASE_PADDR);
@@ -159,7 +274,17 @@ dom_fpswa_hypercall_patch(uint64_t brkimm, unsigned long imva)
        *entry_imva++ = patch_paddr;
        *entry_imva   = 0;
 
-       build_hypercall_bundle(patch_imva, brkimm, FW_HYPERCALL_FPSWA, 1);
+       /* see dom_fw.h */
+       BUG_ON((char*)xen_ia64_fpswa_call_stub_end -
+              (char*)xen_ia64_fpswa_call_stub > 0xff - 16 + 1);
+
+       /* call stub */
+       memcpy(patch_imva, xen_ia64_fpswa_call_stub, stub_size);
+       for (i = 0; i < stub_size; i++)
+               ia64_fc(imva + i);
+       patch_imva +=
+               xen_ia64_fpswa_call_stub_patch - xen_ia64_fpswa_call_stub;
+       build_fpswa_hypercall_bundle(patch_imva, brkimm, FW_HYPERCALL_FPSWA);
 }
 
 // builds a hypercall bundle at domain physical address
index ade43c0a4e1fc2d5f762a431fd4c5b5aa4855efb..33789f2b55868c340d17ae168499be74a361877a 100644 (file)
@@ -31,7 +31,7 @@
 #include <xen/event.h>
 #include <xen/console.h>
 #include <xen/version.h>
-#include <public/libelf.h>
+#include <xen/libelf.h>
 #include <asm/pgalloc.h>
 #include <asm/offsets.h>  /* for IA64_THREAD_INFO_SIZE */
 #include <asm/vcpu.h>   /* for function declarations */
@@ -405,6 +405,33 @@ void relinquish_vcpu_resources(struct vcpu *v)
        kill_timer(&v->arch.hlt_timer);
 }
 
+struct domain *alloc_domain_struct(void)
+{
+#ifdef CONFIG_IA64_PICKLE_DOMAIN
+       struct domain *d;
+       /*
+        * We pack the MFN of the domain structure into a 32-bit field within
+        * the page_info structure. Hence the MEMF_bits() restriction.
+        */
+       d = alloc_xenheap_pages(get_order_from_bytes(sizeof(*d)),
+                               MEMF_bits(32 + PAGE_SHIFT));
+       if ( d != NULL )
+               memset(d, 0, sizeof(*d));
+       return d;
+#else
+       return xmalloc(struct domain);
+#endif
+}
+
+void free_domain_struct(struct domain *d)
+{
+#ifdef CONFIG_IA64_PICKLE_DOMAIN
+       free_xenheap_pages(d, get_order_from_bytes(sizeof(*d)));
+#else
+       xfree(d);
+#endif
+}
+
 struct vcpu *alloc_vcpu_struct(void)
 {
        struct page_info *page;
@@ -509,7 +536,7 @@ int vcpu_late_initialise(struct vcpu *v)
 
        /* Create privregs page. */
        order = get_order_from_shift(XMAPPEDREGS_SHIFT);
-       v->arch.privregs = alloc_xenheap_pages(order);
+       v->arch.privregs = alloc_xenheap_pages(order, 0);
        if (v->arch.privregs == NULL)
                return -ENOMEM;
        BUG_ON(v->arch.privregs == NULL);
@@ -561,7 +588,7 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
 
        // the following will eventually need to be negotiated dynamically
        d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
-       d->arch.breakimm = 0x1000;
+       d->arch.breakimm = __IA64_XEN_HYPERCALL_DEFAULT;
        for (i = 0; i < NR_CPUS; i++) {
                d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
        }
@@ -569,6 +596,7 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
        if (is_idle_domain(d))
            return 0;
 
+       INIT_LIST_HEAD(&d->arch.pdev_list);
        foreign_p2m_init(d);
 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
        d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
@@ -577,7 +605,8 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
 #endif
        if (tlb_track_create(d) < 0)
                goto fail_nomem1;
-       d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
+       d->shared_info = alloc_xenheap_pages(
+               get_order_from_shift(XSI_SHIFT), 0);
        if (d->shared_info == NULL)
                goto fail_nomem;
        BUG_ON(d->shared_info == NULL);
@@ -596,11 +625,14 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
        memset(&d->arch.mm, 0, sizeof(d->arch.mm));
        d->arch.relres = RELRES_not_started;
        d->arch.mm_teardown_offset = 0;
-       INIT_LIST_HEAD(&d->arch.relmem_list);
+       INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
 
        if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
            goto fail_nomem;
 
+       if(iommu_domain_init(d) != 0)
+               goto fail_iommu;
+
        /*
         * grant_table_create() can't fully initialize grant table for domain
         * because it is called before arch_domain_create().
@@ -617,6 +649,8 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
        dprintk(XENLOG_DEBUG, "arch_domain_create: domain=%p\n", d);
        return 0;
 
+fail_iommu:
+       iommu_domain_destroy(d);
 fail_nomem:
        tlb_track_destroy(d);
 fail_nomem1:
@@ -636,6 +670,11 @@ void arch_domain_destroy(struct domain *d)
                free_xenheap_pages(d->shared_info,
                                   get_order_from_shift(XSI_SHIFT));
 
+       if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) )     {
+               pci_release_devices(d);
+               iommu_domain_destroy(d);
+       }
+
        tlb_track_destroy(d);
 
        /* Clear vTLB for the next domain.  */
@@ -706,6 +745,48 @@ nats_update(unsigned int* nats, unsigned int reg, char nat)
                *nats &= ~(1UL << reg);
 }
 
+static unsigned long
+__vcpu_get_itc(struct vcpu *v)
+{
+       unsigned long itc_last;
+       unsigned long itc_offset;
+       unsigned long itc;
+
+       if (unlikely(v->arch.privregs == NULL))
+               return ia64_get_itc();
+       
+       itc_last = v->arch.privregs->itc_last;
+       itc_offset = v->arch.privregs->itc_offset;
+       itc = ia64_get_itc();
+       itc += itc_offset;
+       if (itc_last >= itc)
+               itc = itc_last;
+       return itc;
+}
+
+static void
+__vcpu_set_itc(struct vcpu *v, u64 val)
+{
+       unsigned long itc;
+       unsigned long itc_offset;
+       unsigned long itc_last;
+
+       BUG_ON(v->arch.privregs == NULL);
+
+       if (v != current)
+               vcpu_pause(v);
+       
+       itc = ia64_get_itc();
+       itc_offset = val - itc;
+       itc_last = val;
+       
+       v->arch.privregs->itc_offset = itc_offset;
+       v->arch.privregs->itc_last = itc_last;
+
+       if (v != current)
+               vcpu_unpause(v);
+}
+
 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
 {
        int i;
@@ -744,6 +825,10 @@ void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
                unw_get_ar(&info, UNW_AR_LC, &c.nat->regs.ar.lc);
                unw_get_ar(&info, UNW_AR_EC, &c.nat->regs.ar.ec);
        }
+
+       if (!is_hvm)
+               c.nat->regs.ar.itc = __vcpu_get_itc(v);
+
        c.nat->regs.ar.csd = uregs->ar_csd;
        c.nat->regs.ar.ssd = uregs->ar_ssd;
 
@@ -1231,6 +1316,10 @@ int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
                unw_set_ar(&info, UNW_AR_LC, c.nat->regs.ar.lc);
                unw_set_ar(&info, UNW_AR_EC, c.nat->regs.ar.ec);
        }
+
+       if (!is_hvm_domain(d) && (c.nat->flags & VGCF_SET_AR_ITC))
+               __vcpu_set_itc(v, c.nat->regs.ar.itc);
+
        uregs->ar_csd = c.nat->regs.ar.csd;
        uregs->ar_ssd = c.nat->regs.ar.ssd;
        
@@ -1554,9 +1643,8 @@ int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
        return rc;
 }
 
-static int relinquish_memory(struct domain *d, struct list_head *list)
+static int relinquish_memory(struct domain *d, struct page_list_head *list)
 {
-    struct list_head *ent;
     struct page_info *page;
 #ifndef __ia64__
     unsigned long     x, y;
@@ -1565,16 +1653,14 @@ static int relinquish_memory(struct domain *d, struct list_head *list)
 
     /* Use a recursive lock, as we may enter 'free_domheap_page'. */
     spin_lock_recursive(&d->page_alloc_lock);
-    ent = list->next;
-    while ( ent != list )
+
+    while ( (page = page_list_remove_head(list)) )
     {
-        page = list_entry(ent, struct page_info, list);
         /* Grab a reference to the page so it won't disappear from under us. */
         if ( unlikely(!get_page(page, d)) )
         {
             /* Couldn't get a reference -- someone is freeing this page. */
-            ent = ent->next;
-            list_move_tail(&page->list, &d->arch.relmem_list);
+            page_list_add_tail(page, &d->arch.relmem_list);
             continue;
         }
 
@@ -1609,9 +1695,8 @@ static int relinquish_memory(struct domain *d, struct list_head *list)
 #endif
 
         /* Follow the list chain and /then/ potentially free the page. */
-        ent = ent->next;
         BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
-        list_move_tail(&page->list, &d->arch.relmem_list);
+        page_list_add_tail(page, &d->arch.relmem_list);
         put_page(page);
 
         if (hypercall_preempt_check()) {
@@ -1620,7 +1705,7 @@ static int relinquish_memory(struct domain *d, struct list_head *list)
         }
     }
 
-    list_splice_init(&d->arch.relmem_list, list);
+    page_list_splice_init(&d->arch.relmem_list, list);
 
  out:
     spin_unlock_recursive(&d->page_alloc_lock);
@@ -1640,6 +1725,11 @@ int domain_relinquish_resources(struct domain *d)
                /*fallthrough*/
 
        case RELRES_mm_teardown:
+               if (d->arch.pirq_eoi_map != NULL) {
+                       put_page(virt_to_page(d->arch.pirq_eoi_map));
+                       d->arch.pirq_eoi_map = NULL;
+               }
+
                /* Tear down shadow mode stuff. */
                ret = mm_teardown(d);
                if (ret != 0)
@@ -1673,9 +1763,6 @@ int domain_relinquish_resources(struct domain *d)
        if (is_hvm_domain(d) && d->arch.sal_data)
                xfree(d->arch.sal_data);
 
-       /* Free page used by xen oprofile buffer */
-       free_xenoprof_pages(d);
-
        return 0;
 }
 
@@ -1936,6 +2023,7 @@ static void __init calc_dom0_size(void)
        unsigned long p2m_pages;
        unsigned long spare_hv_pages;
        unsigned long max_dom0_size;
+       unsigned long iommu_pg_table_pages = 0;
 
        /* Estimate maximum memory we can safely allocate for dom0
         * by subtracting the p2m table allocation and a chunk of memory
@@ -1946,8 +2034,13 @@ static void __init calc_dom0_size(void)
        domheap_pages = avail_domheap_pages();
        p2m_pages = domheap_pages / PTRS_PER_PTE;
        spare_hv_pages = 8192 + (domheap_pages / 4096);
-       max_dom0_size = (domheap_pages - (p2m_pages + spare_hv_pages))
-                        * PAGE_SIZE;
+
+       if (iommu_enabled)
+               iommu_pg_table_pages = domheap_pages * 4 / 512;
+               /* There are 512 ptes in one 4K vtd page. */
+
+       max_dom0_size = (domheap_pages - (p2m_pages + spare_hv_pages) -
+                       iommu_pg_table_pages) * PAGE_SIZE;
        printk("Maximum permitted dom0 size: %luMB\n",
               max_dom0_size / (1024*1024));
 
@@ -1993,7 +2086,7 @@ static void __init physdev_init_dom0(struct domain *d)
                BUG();
        if (irqs_permit_access(d, 0, NR_IRQS-1))
                BUG();
-       if (ioports_permit_access(d, 0, 0xffff))
+       if (ioports_permit_access(d, 0, 0, 0xffff))
                BUG();
 }
 
index 1798c32e67be0cb645881ce6c30f0b6c904e8f03..75a0cb095d3d825321225eae207e45a35a432668 100644 (file)
@@ -314,10 +314,10 @@ fp_emulate(int fp_fault, void *bundle, unsigned long *ipsr,
 unsigned long
 handle_fpu_swa(int fp_fault, struct pt_regs *regs, unsigned long isr)
 {
-       struct vcpu *v = current;
        IA64_BUNDLE bundle;
        unsigned long fault_ip;
        fpswa_ret_t ret;
+       unsigned long rc;
 
        fault_ip = regs->cr_iip;
        /*
@@ -329,24 +329,29 @@ handle_fpu_swa(int fp_fault, struct pt_regs *regs, unsigned long isr)
                fault_ip -= 16;
 
        if (VMX_DOMAIN(current)) {
-               if (IA64_RETRY == __vmx_get_domain_bundle(fault_ip, &bundle))
-                       return IA64_RETRY;
-       } else
-               bundle = __get_domain_bundle(fault_ip);
-
-       if (!bundle.i64[0] && !bundle.i64[1]) {
-               printk("%s: floating-point bundle at 0x%lx not mapped\n",
-                      __FUNCTION__, fault_ip);
-               return -1;
+               rc = __vmx_get_domain_bundle(fault_ip, &bundle);
+       } else {
+               rc = 0;
+               if (vcpu_get_domain_bundle(current, regs, fault_ip,
+                                          &bundle) == 0)
+                       rc = IA64_RETRY;
+       }
+       if (rc == IA64_RETRY) {
+               PSCBX(current, fpswa_ret) = (fpswa_ret_t){IA64_RETRY, 0, 0, 0};
+               gdprintk(XENLOG_DEBUG,
+                        "%s(%s): floating-point bundle at 0x%lx not mapped\n",
+                        __FUNCTION__, fp_fault ? "fault" : "trap", fault_ip);
+               return IA64_RETRY;
        }
 
        ret = fp_emulate(fp_fault, &bundle, &regs->cr_ipsr, &regs->ar_fpsr,
                         &isr, &regs->pr, &regs->cr_ifs, regs);
 
        if (ret.status) {
-               PSCBX(v, fpswa_ret) = ret;
-               printk("%s(%s): fp_emulate() returned %ld\n",
-                      __FUNCTION__, fp_fault ? "fault" : "trap", ret.status);
+               PSCBX(current, fpswa_ret) = ret;
+               gdprintk(XENLOG_ERR, "%s(%s): fp_emulate() returned %ld\n",
+                        __FUNCTION__, fp_fault ? "fault" : "trap",
+                        ret.status);
        }
 
        return ret.status;
@@ -408,6 +413,13 @@ ia64_fault(unsigned long vector, unsigned long isr, unsigned long ifa,
                printk("Dirty-bit.\n");
                break;
 
+       case 10:
+               /* __domain_get_bundle() may cause fault. */
+               if (ia64_done_with_exception(regs))
+                       return;
+               printk("Data Access-bit.\n");
+               break;
+
        case 20:
                printk("Page Not Found.\n");
                break;
@@ -565,6 +577,17 @@ ia64_handle_privop(unsigned long ifa, struct pt_regs *regs, unsigned long isr,
        }
 }
 
+void
+ia64_lazy_load_fpu(struct vcpu *v)
+{
+       if (PSCB(v, hpsr_dfh)) {
+               PSCB(v, hpsr_dfh) = 0;
+               PSCB(v, hpsr_mfh) = 1;
+               if (__ia64_per_cpu_var(fp_owner) != v)
+                       __ia64_load_fpu(v->arch._thread.fph);
+       }
+}
+
 void
 ia64_handle_reflection(unsigned long ifa, struct pt_regs *regs,
                        unsigned long isr, unsigned long iim,
@@ -613,12 +636,7 @@ ia64_handle_reflection(unsigned long ifa, struct pt_regs *regs,
                vector = IA64_GENEX_VECTOR;
                break;
        case 25:
-               if (PSCB(v, hpsr_dfh)) {
-                       PSCB(v, hpsr_dfh) = 0;
-                       PSCB(v, hpsr_mfh) = 1;
-                       if (__ia64_per_cpu_var(fp_owner) != v)
-                               __ia64_load_fpu(v->arch._thread.fph);
-               }
+               ia64_lazy_load_fpu(v);
                if (!PSCB(v, vpsr_dfh)) {
                        regs->cr_ipsr &= ~IA64_PSR_DFH;
                        return;
@@ -678,20 +696,12 @@ ia64_handle_reflection(unsigned long ifa, struct pt_regs *regs,
                        vcpu_increment_iip(v);
                        return;
                }
-               // fetch code fail
-               if (IA64_RETRY == status)
-                       return;
-               printk("ia64_handle_reflection: handling FP fault\n");
                vector = IA64_FP_FAULT_VECTOR;
                break;
        case 33:
                status = handle_fpu_swa(0, regs, isr);
                if (!status)
                        return;
-               // fetch code fail
-               if (IA64_RETRY == status)
-                       return;
-               printk("ia64_handle_reflection: handling FP trap\n");
                vector = IA64_FP_TRAP_VECTOR;
                break;
        case 34:
index 3b64bd14d4291e519ed7d1a60a4dfe02d86462c6..2c9235a38cb4ad3377e271e68620b94c1a4a56fe 100644 (file)
@@ -95,7 +95,7 @@ void get_state_info_on(void *data) {
                       rec_name[arg->type], smp_processor_id(), arg->ret);
        if (arg->corrected) {
                sal_record->severity = sal_log_severity_corrected;
-               IA64_SAL_DEBUG("%s: IA64_SAL_CLEAR_STATE_INFO(SAL_INFO_TYPE_MCA)"
+               IA64_SAL_DEBUG("%s: IA64_SAL_GET_STATE_INFO(SAL_INFO_TYPE_MCA)"
                               " force\n", __FUNCTION__);
        }
        if (arg->ret > 0) {
@@ -293,9 +293,7 @@ sal_emulator (long index, unsigned long in1, unsigned long in2,
                        }
                        r9 = arg.ret;
                        status = arg.status;
-                       if (r9 == 0) {
-                               xfree(e);
-                       } else {
+                       if (r9 != 0) {
                                /* Re-add the entry to sal_queue */
                                spin_lock_irqsave(&sal_queue_lock, flags);
                                list_add(&e->list, &sal_queue[in1]);
@@ -359,7 +357,12 @@ sal_emulator (long index, unsigned long in1, unsigned long in2,
                        }
                        r9 = arg.ret;
                        status = arg.status;
-                       xfree(e);
+                       if (r9 >= 0) {
+                               IA64_SAL_DEBUG("SAL_CLEAR_STATE_INFO: more errors are available\n");
+                               spin_lock_irqsave(&sal_queue_lock, flags);
+                               list_add(&e->list, &sal_queue[in1]);
+                               spin_unlock_irqrestore(&sal_queue_lock, flags);
+                       }
                }
                break;
            case SAL_MC_RENDEZ:
@@ -1334,6 +1337,10 @@ efi_emulate_set_virtual_address_map(
        efi_desc_size = sizeof(efi_memory_desc_t);
 
        for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
+               struct page_info *efi_runtime_page = NULL;
+               struct page_info *fpswa_inf_page = NULL;
+               struct page_info *fw_table_page = NULL;
+               
                if (copy_from_user(&entry, p, sizeof(efi_memory_desc_t))) {
                        printk ("efi_emulate_set_virtual_address_map: copy_from_user() fault. addr=0x%p\n", p);
                        return EFI_UNSUPPORTED;
@@ -1343,6 +1350,27 @@ efi_emulate_set_virtual_address_map(
                 if (md->type != EFI_PAL_CODE)
                         continue;
 
+               /* get pages to prevent them from being freed 
+                * while touching them.
+                * those entries are in [FW_TABLES_BASE_PADDR, ...]
+                * see dom_fw.h for its layout.
+                */
+               efi_runtime_page = virt_to_page(efi_runtime);
+               fpswa_inf_page = virt_to_page(fpswa_inf);
+               fw_table_page = virt_to_page(
+                       domain_mpa_to_imva(d, FW_TABLES_BASE_PADDR));
+               if (get_page(efi_runtime_page, d) == 0)
+                       return EFI_INVALID_PARAMETER;
+               if (get_page(fpswa_inf_page, d) == 0) {
+                       put_page(efi_runtime_page);
+                       return EFI_INVALID_PARAMETER;
+               }
+               if (get_page(fw_table_page, d) == 0) {
+                       put_page(fpswa_inf_page);
+                       put_page(efi_runtime_page);
+                       return EFI_INVALID_PARAMETER;
+               }
+
 #define EFI_HYPERCALL_PATCH_TO_VIRT(tgt,call) \
        do { \
                vfn = (unsigned long *) domain_mpa_to_imva(d, tgt); \
@@ -1365,6 +1393,10 @@ efi_emulate_set_virtual_address_map(
                *vfn++ = FW_HYPERCALL_FPSWA_PATCH_INDEX * 16UL + md->virt_addr;
                *vfn   = 0;
                fpswa_inf->fpswa = (void *) (FW_HYPERCALL_FPSWA_ENTRY_INDEX * 16UL + md->virt_addr);
+
+               put_page(fw_table_page);
+               put_page(fpswa_inf_page);
+               put_page(efi_runtime_page);
                break;
        }
 
index 743e10386a6baf847e76079b8beeec0c45484ae4..884bf7b4c3344581633e68d1a18cc78908f43510 100644 (file)
@@ -17,6 +17,7 @@
 #include <asm/sal.h>   /* FOR struct ia64_sal_retval */
 #include <asm/fpswa.h> /* FOR struct fpswa_ret_t */
 
+#include <asm/vmx.h>
 #include <asm/vmx_vcpu.h>
 #include <asm/vcpu.h>
 #include <asm/dom_fw.h>
@@ -35,6 +36,7 @@
 #include <public/arch-ia64/debug_op.h>
 #include <asm/sioemu.h>
 #include <public/arch-ia64/sioemu.h>
+#include <xen/pci.h>
 
 static IA64FAULT
 xen_hypercall (struct pt_regs *regs)
@@ -59,11 +61,19 @@ xen_fast_hypercall (struct pt_regs *regs)
        return IA64_NO_FAULT;
 }
 
+static long __do_pirq_guest_eoi(struct domain *d, int pirq)
+{
+       if ( pirq < 0 || pirq >= NR_IRQS )
+               return -EINVAL;
+       if ( d->arch.pirq_eoi_map )
+               evtchn_unmask(d->pirq_to_evtchn[pirq]);
+       return pirq_guest_eoi(d, pirq);
+}
+
 long do_pirq_guest_eoi(int pirq)
 {
-       return pirq_guest_eoi(current->domain, pirq);
+       return __do_pirq_guest_eoi(current->domain, pirq);
 }
-    
 
 static void
 fw_hypercall_ipi (struct pt_regs *regs)
@@ -117,10 +127,135 @@ fw_hypercall_ipi (struct pt_regs *regs)
        return;
 }
 
+static int
+fpswa_get_domain_addr(struct vcpu *v, unsigned long gpaddr, size_t size,
+                     void **virt, struct page_info **page, const char *name)
+{
+       int cross_page_boundary;
+
+       if (gpaddr == 0) {
+               *virt = 0;
+               return 0;
+       }
+
+       cross_page_boundary = (((gpaddr & ~PAGE_MASK) + size) > PAGE_SIZE);
+       if (unlikely(cross_page_boundary)) {
+               /* this case isn't implemented */
+               gdprintk(XENLOG_ERR,
+                        "%s: fpswa hypercall is called with "
+                        "page crossing argument %s 0x%lx\n",
+                        __func__, name, gpaddr);
+               return -ENOSYS;
+       }
+
+again:
+        *virt = domain_mpa_to_imva(v->domain, gpaddr);
+        *page = virt_to_page(*virt);
+        if (get_page(*page, current->domain) == 0) {
+                if (page_get_owner(*page) != current->domain) {
+                       *page = NULL;
+                       return -EFAULT;
+               }
+                goto again;
+        }
+
+       return 0;
+}
+
+static fpswa_ret_t
+fw_hypercall_fpswa (struct vcpu *v, struct pt_regs *regs)
+{
+       fpswa_ret_t ret = {-1, 0, 0, 0};
+       unsigned long bundle[2] = { regs->r15, regs->r16};
+       fp_state_t fp_state;
+       struct page_info *lp_page = NULL;
+       struct page_info *lv_page = NULL;
+       struct page_info *hp_page = NULL;
+       struct page_info *hv_page = NULL;
+       XEN_EFI_RR_DECLARE(rr6, rr7);
+
+       if (unlikely(PSCBX(v, fpswa_ret).status != 0 && 
+                    PSCBX(v, fpswa_ret).status != IA64_RETRY)) {
+               ret = PSCBX(v, fpswa_ret);
+               PSCBX(v, fpswa_ret) = (fpswa_ret_t){0, 0, 0, 0};
+               return ret;
+       }
+
+       if (!fpswa_interface)
+               goto error;
+
+       memset(&fp_state, 0, sizeof(fp_state));
+       fp_state.bitmask_low64 = regs->r22;
+       fp_state.bitmask_high64 = regs->r23;
+
+       /* bit6..bit11 */
+       if ((fp_state.bitmask_low64 & 0xfc0) != 0xfc0) {
+               /* other cases aren't supported yet */
+               gdprintk(XENLOG_ERR, "%s unsupported bitmask_low64 0x%lx\n",
+                        __func__, fp_state.bitmask_low64);
+               goto error;
+       }
+       if (regs->r25 == 0)
+               /* fp_state.fp_state_low_volatile must be supplied */
+               goto error;
+
+       /* eager save/lazy restore fpu: f32...f127 */
+       if ((~fp_state.bitmask_low64 & ((1UL << 31) - 1)) != 0 ||
+           ~fp_state.bitmask_high64 != 0) {
+               if (VMX_DOMAIN(v))
+                       vmx_lazy_load_fpu(v);
+               else
+                       ia64_lazy_load_fpu(v);
+       }
+
+       if (fpswa_get_domain_addr(v, regs->r24,
+                                 sizeof(fp_state.fp_state_low_preserved), 
+                                 (void*)&fp_state.fp_state_low_preserved,
+                                 &lp_page, "fp_state_low_preserved") < 0)
+               goto error;
+       if (fpswa_get_domain_addr(v, regs->r25,
+                                 sizeof(fp_state.fp_state_low_volatile),
+                                 (void*)&fp_state.fp_state_low_volatile,
+                                 &lv_page, "fp_state_low_volatile") < 0)
+               goto error;
+       if (fpswa_get_domain_addr(v, regs->r26,
+                                 sizeof(fp_state.fp_state_high_preserved),
+                                 (void*)&fp_state.fp_state_high_preserved,
+                                 &hp_page, "fp_state_low_preserved") < 0)
+               goto error;
+       if (fpswa_get_domain_addr(v, regs->r27,
+                                 sizeof(fp_state.fp_state_high_volatile),
+                                 (void*)&fp_state.fp_state_high_volatile,
+                                 &hv_page, "fp_state_high_volatile") < 0)
+               goto error;
+
+       XEN_EFI_RR_ENTER(rr6, rr7);
+       ret = (*fpswa_interface->fpswa)(regs->r14,
+                                       bundle,
+                                       &regs->r17,     /* pipsr */
+                                       &regs->r18,     /* pfsr */
+                                       &regs->r19,     /* pisr */
+                                       &regs->r20,     /* ppreds */
+                                       &regs->r21,     /* pifs */
+                                       &fp_state);
+       XEN_EFI_RR_LEAVE(rr6, rr7);
+
+error:
+       if (lp_page != NULL)
+               put_page(lp_page);
+       if (lv_page != NULL)
+               put_page(lv_page);
+       if (hp_page != NULL)
+               put_page(hp_page);
+       if (hv_page != NULL)
+               put_page(hv_page);
+       return ret;
+}
+
 static fpswa_ret_t
-fw_hypercall_fpswa (struct vcpu *v)
+fw_hypercall_fpswa_error(void)
 {
-       return PSCBX(v, fpswa_ret);
+       return (fpswa_ret_t) {-1, 0, 0, 0};
 }
 
 IA64FAULT
@@ -175,7 +310,7 @@ ia64_hypercall(struct pt_regs *regs)
                                stop_timer(&v->arch.hlt_timer);
                                /* do_block() calls
                                 * local_event_delivery_enable(),
-                                * but PALL CALL must be called with
+                                * but PAL CALL must be called with
                                 * psr.i = 0 and psr.i is unchanged.
                                 * SDM vol.2 Part I 11.10.2
                                 * PAL Calling Conventions.
@@ -225,8 +360,24 @@ ia64_hypercall(struct pt_regs *regs)
        case FW_HYPERCALL_SET_SHARED_INFO_VA:
                regs->r8 = domain_set_shared_info_va (regs->r28);
                break;
-       case FW_HYPERCALL_FPSWA:
-               fpswa_ret = fw_hypercall_fpswa (v);
+       case FW_HYPERCALL_FPSWA_BASE:
+               switch (regs->r2) {
+               case FW_HYPERCALL_FPSWA_BROKEN:
+                       gdprintk(XENLOG_WARNING,
+                                "Old fpswa hypercall was called (0x%lx).\n"
+                                "Please update your domain builder. ip 0x%lx\n",
+                                FW_HYPERCALL_FPSWA_BROKEN, regs->cr_iip);
+                       fpswa_ret = fw_hypercall_fpswa_error();
+                       break;
+               case FW_HYPERCALL_FPSWA:
+                       fpswa_ret = fw_hypercall_fpswa(v, regs);
+                       break;
+               default:
+                       gdprintk(XENLOG_ERR, "unknown fpswa hypercall %lx\n",
+                                regs->r2);
+                       fpswa_ret = fw_hypercall_fpswa_error();
+                       break;
+               }
                regs->r8  = fpswa_ret.status;
                regs->r9  = fpswa_ret.err0;
                regs->r10 = fpswa_ret.err1;
@@ -313,6 +464,25 @@ extern int
 iosapic_guest_write(
     unsigned long physbase, unsigned int reg, u32 pval);
 
+
+/*
+ * XXX: We don't support MSI for PCI passthrough at present, so make the
+ * following 2 functions dummy for now. They shouldn't return -ENOSYS
+ * because xend invokes them (the x86 version of them is necessary for
+ * x86 Xen); if they return -ENOSYS, xend would disallow creating an
+ * IPF HVM guest with devices assigned, so here they can return 0.
+ */
+static int physdev_map_pirq(struct physdev_map_pirq *map)
+{
+       return 0;
+}
+
+static int physdev_unmap_pirq(struct physdev_unmap_pirq *unmap)
+{
+       return 0;
+}
+
+
 long do_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
 {
     int irq;
@@ -325,7 +495,34 @@ long do_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
         ret = -EFAULT;
         if ( copy_from_guest(&eoi, arg, 1) != 0 )
             break;
-        ret = pirq_guest_eoi(current->domain, eoi.irq);
+        ret = __do_pirq_guest_eoi(current->domain, eoi.irq);
+        break;
+    }
+
+    case PHYSDEVOP_pirq_eoi_gmfn: {
+        struct physdev_pirq_eoi_gmfn info;
+        unsigned long mfn;
+
+        BUILD_BUG_ON(NR_IRQS > (PAGE_SIZE * 8));
+
+        ret = -EFAULT;
+        if ( copy_from_guest(&info, arg, 1) != 0 )
+            break;
+
+        ret = -EINVAL;
+        mfn = gmfn_to_mfn(current->domain, info.gmfn);
+        if ( !mfn_valid(mfn) || !get_page(mfn_to_page(mfn), current->domain) )
+            break;
+
+        if ( cmpxchg(&current->domain->arch.pirq_eoi_map_mfn, 0, mfn) != 0 )
+        {
+            put_page(mfn_to_page(mfn));
+            ret = -EBUSY;
+            break;
+        }
+
+        current->domain->arch.pirq_eoi_map = mfn_to_virt(mfn);
+        ret = 0;
         break;
     }
 
@@ -346,7 +543,7 @@ long do_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
             break;
         irq_status_query.flags = 0;
         /* Edge-triggered interrupts don't need an explicit unmask downcall. */
-        if ( !strstr(irq_desc[irq_to_vector(irq)].handler->typename, "edge") )
+        if ( !strstr(irq_descp(irq)->handler->typename, "edge") )
             irq_status_query.flags |= XENIRQSTAT_needs_eoi;
         ret = copy_to_guest(arg, &irq_status_query, 1) ? -EFAULT : 0;
         break;
@@ -426,18 +623,82 @@ long do_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
         break;
     }
 
-    /*
-     * XXX We don't support MSI for PCI passthrough, so just return success
-     */
-    case PHYSDEVOP_map_pirq:
-    case PHYSDEVOP_unmap_pirq:
-        ret = 0;
+       case PHYSDEVOP_map_pirq: {
+        struct physdev_map_pirq map;
+
+        ret = -EFAULT;
+        if ( copy_from_guest(&map, arg, 1) != 0 )
+             break;
+
+        ret = physdev_map_pirq(&map);
+
+        if ( copy_to_guest(arg, &map, 1) != 0 )
+             ret = -EFAULT;
         break;
+    }
+
+    case PHYSDEVOP_unmap_pirq: {
+        struct physdev_unmap_pirq unmap;
+
+        ret = -EFAULT;
+        if ( copy_from_guest(&unmap, arg, 1) != 0 )
+            break;
+
+        ret = physdev_unmap_pirq(&unmap);
+            break;
+    }
+
+    case PHYSDEVOP_manage_pci_add: {
+        struct physdev_manage_pci manage_pci;
+        ret = -EPERM;
+        if ( !IS_PRIV(current->domain) )
+            break;
+        ret = -EFAULT;
+        if ( copy_from_guest(&manage_pci, arg, 1) != 0 )
+            break;
+
+        ret = pci_add_device(manage_pci.bus, manage_pci.devfn);
+            break;
+    }
+
+    case PHYSDEVOP_manage_pci_remove: {
+        struct physdev_manage_pci manage_pci;
+        ret = -EPERM;
+        if ( !IS_PRIV(current->domain) )
+            break;
+        ret = -EFAULT;
+        if ( copy_from_guest(&manage_pci, arg, 1) != 0 )
+            break;
+
+        ret = pci_remove_device(manage_pci.bus, manage_pci.devfn);
+            break;
+    }
+
+    case PHYSDEVOP_manage_pci_add_ext: {
+        struct physdev_manage_pci_ext manage_pci_ext;
+        struct pci_dev_info pdev_info;
+
+        ret = -EPERM;
+        if ( !IS_PRIV(current->domain) )
+            break;
+
+        ret = -EFAULT;
+        if ( copy_from_guest(&manage_pci_ext, arg, 1) != 0 )
+            break;
+
+        pdev_info.is_extfn = manage_pci_ext.is_extfn;
+        pdev_info.is_virtfn = manage_pci_ext.is_virtfn;
+        pdev_info.physfn.bus = manage_pci_ext.physfn.bus;
+        pdev_info.physfn.devfn = manage_pci_ext.physfn.devfn;
+        ret = pci_add_device_ext(manage_pci_ext.bus,
+                                 manage_pci_ext.devfn,
+                                 &pdev_info);
+            break;
+    }
 
-    case PHYSDEVOP_manage_pci_add:
-    case PHYSDEVOP_manage_pci_remove:
     default:
         ret = -ENOSYS;
+        printk("not implemented do_physdev_op: %d\n", cmd);
         break;
     }
 
index 2c250148c52b7cedcb94bbba9e14f8d05a257c41..bce8c6c237dad2ae331ddff78e9b8c1a15792ab2 100644 (file)
@@ -74,7 +74,7 @@ unsigned int __ia64_local_vector_to_irq (ia64_vector vec)
 /*
  * Controller mappings for all interrupt sources:
  */
-irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
+irq_desc_t irq_desc[NR_IRQS] = {
        [0 ... NR_IRQS-1] = {
                .status = IRQ_DISABLED,
                .handler = &no_irq_type,
@@ -228,11 +228,11 @@ out:
  * disabled.
  */
 
-int setup_vector(unsigned int irq, struct irqaction * new)
+int setup_vector(unsigned int vector, struct irqaction * new)
 {
        unsigned long flags;
        struct irqaction *old, **p;
-       irq_desc_t *desc = irq_descp(irq);
+       irq_desc_t *desc = irq_descp(vector);
 
        /*
         * The following block of code has to be executed atomically
@@ -248,8 +248,8 @@ int setup_vector(unsigned int irq, struct irqaction * new)
 
        desc->depth = 0;
        desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_GUEST);
-       desc->handler->startup(irq);
-       desc->handler->enable(irq);
+       desc->handler->startup(vector);
+       desc->handler->enable(vector);
        spin_unlock_irqrestore(&desc->lock,flags);
 
        return 0;
@@ -258,13 +258,11 @@ int setup_vector(unsigned int irq, struct irqaction * new)
 /* Vectors reserved by xen (and thus not sharable with domains).  */
 unsigned long ia64_xen_vector[BITS_TO_LONGS(NR_IRQS)];
 
-int setup_irq(unsigned int irq, struct irqaction * new)
+int setup_irq_vector(unsigned int vec, struct irqaction * new)
 {
-       unsigned int vec;
        int res;
 
-       /* Get vector for IRQ.  */
-       if (acpi_gsi_to_irq (irq, &vec) < 0)
+       if ( vec == IA64_INVALID_VECTOR )
                return -ENOSYS;
        /* Reserve the vector (and thus the irq).  */
        if (test_and_set_bit(vec, ia64_xen_vector))
@@ -273,14 +271,12 @@ int setup_irq(unsigned int irq, struct irqaction * new)
        return res;
 }
 
-void free_irq(unsigned int irq)
+void release_irq_vector(unsigned int vec)
 {
-       unsigned int vec;
        unsigned long flags;
        irq_desc_t *desc;
 
-       /* Get vector for IRQ.  */
-       if (acpi_gsi_to_irq(irq, &vec) < 0)
+       if ( vec == IA64_INVALID_VECTOR )
                return;
 
        desc = irq_descp(vec);
@@ -312,12 +308,50 @@ typedef struct {
     struct domain *guest[IRQ_MAX_GUESTS];
 } irq_guest_action_t;
 
+static inline void set_pirq_eoi(struct domain *d, unsigned int irq)
+{
+    if ( d->arch.pirq_eoi_map )
+        set_bit(irq, d->arch.pirq_eoi_map);
+}
+
+static inline void clear_pirq_eoi(struct domain *d, unsigned int irq)
+{
+    if ( d->arch.pirq_eoi_map )
+        clear_bit(irq, d->arch.pirq_eoi_map);
+}
+
+static void _irq_guest_eoi(irq_desc_t *desc)
+{
+    irq_guest_action_t *action = (irq_guest_action_t *)desc->action;
+    unsigned int i, vector = desc - irq_desc;
+
+    if ( !(desc->status & IRQ_GUEST_EOI_PENDING) )
+        return;
+
+    for ( i = 0; i < action->nr_guests; ++i )
+        clear_pirq_eoi(action->guest[i], vector);
+
+    desc->status &= ~(IRQ_INPROGRESS|IRQ_GUEST_EOI_PENDING);
+    desc->handler->enable(vector);
+}
+
+static struct timer irq_guest_eoi_timer[NR_IRQS];
+static void irq_guest_eoi_timer_fn(void *data)
+{
+       irq_desc_t *desc = data;
+       unsigned long flags;
+
+       spin_lock_irqsave(&desc->lock, flags);
+       _irq_guest_eoi(desc);
+       spin_unlock_irqrestore(&desc->lock, flags);
+}
+
 void __do_IRQ_guest(int irq)
 {
     irq_desc_t         *desc = &irq_desc[irq];
     irq_guest_action_t *action = (irq_guest_action_t *)desc->action;
     struct domain      *d;
-    int                 i;
+    int                 i, already_pending = 0;
 
     for ( i = 0; i < action->nr_guests; i++ )
     {
@@ -325,11 +359,46 @@ void __do_IRQ_guest(int irq)
         if ( (action->ack_type != ACKTYPE_NONE) &&
              !test_and_set_bit(irq, &d->pirq_mask) )
             action->in_flight++;
-        send_guest_pirq(d, irq);
-    }
+               if ( hvm_do_IRQ_dpci(d, irq) )
+               {
+                       if ( action->ack_type == ACKTYPE_NONE )
+                       {
+                               already_pending += !!(desc->status & IRQ_INPROGRESS);
+                               desc->status |= IRQ_INPROGRESS; /* cleared during hvm eoi */
+                       }
+               }
+               else if ( send_guest_pirq(d, irq) &&
+                               (action->ack_type == ACKTYPE_NONE) )
+               {
+                       already_pending++;
+               }
+       }
+
+       if ( already_pending == action->nr_guests )
+       {
+               stop_timer(&irq_guest_eoi_timer[irq]);
+               desc->handler->disable(irq);
+        desc->status |= IRQ_GUEST_EOI_PENDING;
+        for ( i = 0; i < already_pending; ++i )
+        {
+            d = action->guest[i];
+            set_pirq_eoi(d, irq);
+            /*
+             * Could check here whether the guest unmasked the event by now
+             * (or perhaps just re-issue the send_guest_pirq()), and if it
+             * can now accept the event,
+             * - clear all the pirq_eoi bits we already set,
+             * - re-enable the vector, and
+             * - skip the timer setup below.
+             */
+        }
+               init_timer(&irq_guest_eoi_timer[irq],
+                               irq_guest_eoi_timer_fn, desc, smp_processor_id());
+               set_timer(&irq_guest_eoi_timer[irq], NOW() + MILLISECS(1));
+       }
 }
 
-int pirq_acktype(int irq)
+static int pirq_acktype(int irq)
 {
     irq_desc_t *desc = &irq_desc[irq];
 
@@ -345,16 +414,25 @@ int pirq_acktype(int irq)
 int pirq_guest_eoi(struct domain *d, int irq)
 {
     irq_desc_t *desc;
+    irq_guest_action_t *action;
 
     if ( (irq < 0) || (irq >= NR_IRQS) )
         return -EINVAL;
 
     desc = &irq_desc[irq];
     spin_lock_irq(&desc->lock);
-    if ( test_and_clear_bit(irq, &d->pirq_mask) &&
-         (--((irq_guest_action_t *)desc->action)->in_flight == 0) )
+    action = (irq_guest_action_t *)desc->action;
+
+    if ( action->ack_type == ACKTYPE_NONE )
     {
-        ASSERT(((irq_guest_action_t*)desc->action)->ack_type == ACKTYPE_UNMASK);
+        ASSERT(!test_bit(irq, d->pirq_mask));
+        stop_timer(&irq_guest_eoi_timer[irq]);
+        _irq_guest_eoi(desc);
+    }
+
+    if ( test_and_clear_bit(irq, &d->pirq_mask) && (--action->in_flight == 0) )
+    {
+        ASSERT(action->ack_type == ACKTYPE_UNMASK);
         desc->handler->end(irq);
     }
     spin_unlock_irq(&desc->lock);
@@ -454,6 +532,11 @@ int pirq_guest_bind(struct vcpu *v, int irq, int will_share)
 
     action->guest[action->nr_guests++] = v->domain;
 
+    if ( action->ack_type != ACKTYPE_NONE )
+        set_pirq_eoi(v->domain, irq);
+    else
+        clear_pirq_eoi(v->domain, irq);
+
  out:
     spin_unlock_irqrestore(&desc->lock, flags);
     return rc;
index 7ad9b6c35f28c07c69146dbf3349e64d17f3ba5f..7c5ac5f04c811183ec52f6325016322ee68209a9 100644 (file)
@@ -184,10 +184,12 @@ ENTRY(alt_dtlb_miss)
 late_alt_dtlb_miss:
        mov r20=cr.isr
        movl r17=PAGE_KERNEL
-       mov r21=cr.ipsr
+       mov r29=cr.ipsr // frametable_miss is shared by paravirtual and HVM sides
+                       // and it assumes ipsr is saved in r29. If change the
+                       // registers usage here, please check both sides!   
        movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
        ;;
-       extr.u r23=r21,IA64_PSR_CPL0_BIT,2      // extract psr.cpl
+       extr.u r23=r29,IA64_PSR_CPL0_BIT,2      // extract psr.cpl
        and r22=IA64_ISR_CODE_MASK,r20          // get the isr.code field
        tbit.nz p6,p7=r20,IA64_ISR_SP_BIT       // is speculation bit on?
        extr.u r18=r16,XEN_VIRT_UC_BIT,1        // extract UC bit
@@ -234,7 +236,7 @@ late_alt_dtlb_miss:
        br.cond.spnt page_fault
        ;;
 alt_dtlb_miss_identity_map:
-       dep r21=-1,r21,IA64_PSR_ED_BIT,1
+       dep r29=-1,r29,IA64_PSR_ED_BIT,1
        or r19=r19,r17          // insert PTE control bits into r19
        mov cr.itir=r20         // set itir with cleared key
        ;;
@@ -243,7 +245,7 @@ alt_dtlb_miss_identity_map:
        cmp.eq.or p8,p0=0x18,r22        // Region 6 is UC for EFI
        ;;
 (p8)   dep r19=-1,r19,4,1      // set bit 4 (uncached) if access to UC area
-(p6)   mov cr.ipsr=r21
+(p6)   mov cr.ipsr=r29
        ;;
 (p7)   itc.d r19               // insert the TLB entry
        mov pr=r31,-1
@@ -288,17 +290,17 @@ GLOBAL_ENTRY(frametable_miss)
        rfi
 END(frametable_miss)
 
-ENTRY(frametable_fault)
+ENTRY(frametable_fault)                //ipsr saved in r29 before coming here!
        ssm psr.dt              // switch to using virtual data addressing
        mov r18=cr.iip
        movl r19=ia64_frametable_probe
        ;;
        cmp.eq p6,p7=r18,r19    // is faulting addrress ia64_frametable_probe?
        mov r8=0                // assumes that 'probe.r' uses r8
-       dep r21=-1,r21,IA64_PSR_RI_BIT+1,1 // return to next instruction in
+       dep r29=-1,r29,IA64_PSR_RI_BIT+1,1 // return to next instruction in
                                           //   bundle 2
        ;;
-(p6)   mov cr.ipsr=r21
+(p6)   mov cr.ipsr=r29
        mov r19=4               // FAULT(4)
 (p7)   br.spnt.few dispatch_to_fault_handler
        ;;
@@ -469,10 +471,17 @@ ENTRY(daccess_bit)
        DBG_FAULT(10)
        mov r16=cr.isr
        mov r17=cr.ifa
+       mov r18=cr.ipsr
        mov r31=pr
        mov r19=10
+       ;;
        mov r20=0x2800
-       br.sptk.many fast_access_reflect
+       extr.u r18=r18,IA64_PSR_CPL0_BIT,2
+       ;;
+       cmp.ne p6,p0=r0,r18     /* cpl != 0? */
+(p6)   br.sptk.many fast_access_reflect
+       /* __domain_get_bundle() may cause this fault. */
+       br.sptk.few dispatch_to_fault_handler
        ;;
 END(daccess_bit)
 
index 9fcd189d66da7e353045d7e33de500399ab2d40d..9558735433eaf505cc572f028ac89c35ff7ff8fc 100644 (file)
@@ -159,8 +159,10 @@ static int machine_kexec_get_xen(xen_kexec_range_t *range)
 static int machine_kexec_get_xenheap(xen_kexec_range_t *range)
 {
        range->start = (ia64_tpa(_end) + (ELF_PAGE_SIZE - 1)) & ELF_PAGE_MASK;
-       range->size = (unsigned long)xenheap_phys_end -
-                     (unsigned long)range->start;
+       range->size =
+               (((unsigned long)range->start + KERNEL_TR_PAGE_SIZE) &
+         ~(KERNEL_TR_PAGE_SIZE - 1))
+               - (unsigned long)range->start;
        return 0;
 }
 
@@ -199,7 +201,6 @@ void arch_crash_save_vmcoreinfo(void)
        VMCOREINFO_SYMBOL(dom_io);
        VMCOREINFO_SYMBOL(xen_pstart);
        VMCOREINFO_SYMBOL(frametable_pg_dir);
-       VMCOREINFO_SYMBOL_ALIAS(xen_heap_start, xen_pickle_offset);
 }
 
 /*
index 4f18bf7349c2360e80632c9bc5bfbd6857c1ae87..c98272a0b3debbfc139a380065ea26763bd79692 100644 (file)
 #include <asm/event.h>
 #include <asm/debugger.h>
 
+
+#define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING, _f "\n", ## _a)
+
 static void domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
                                       volatile pte_t* ptep, pte_t old_pte, 
                                       struct page_info* page);
@@ -463,7 +466,7 @@ share_xen_page_with_guest(struct page_info *page,
 
     page_set_owner(page, d);
     wmb(); /* install valid domain ptr before updating refcnt. */
-    ASSERT(page->count_info == 0);
+    ASSERT((page->count_info & ~PGC_xen_heap)== 0);
 
     /* Only add to the allocation list if the domain isn't dying. */
     if ( !d->is_dying )
@@ -471,7 +474,7 @@ share_xen_page_with_guest(struct page_info *page,
         page->count_info |= PGC_allocated | 1;
         if ( unlikely(d->xenheap_pages++ == 0) )
             get_knownalive_domain(d);
-        list_add_tail(&page->list, &d->xenpage_list);
+        page_list_add_tail(page, &d->xenpage_list);
     }
 
     // grant_table_destroy() releases these pages.
@@ -917,11 +920,20 @@ __assign_domain_page(struct domain *d,
 
     old_pte = __pte(0);
     new_pte = pfn_pte(physaddr >> PAGE_SHIFT, __pgprot(prot));
+ again_hvm_page_io:
     ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
     if (pte_val(ret_pte) == pte_val(old_pte)) {
         smp_mb();
         return 0;
     }
+    /* in HVM guest, when VTD is enabled,
+     * P2M entry may change from _PAGE_IO type to real MMIO page 
+     */
+    if(is_hvm_domain(d) && (pte_val(ret_pte) & _PAGE_IO) &&
+       !mfn_valid(physaddr >> PAGE_SHIFT)) {
+        old_pte = ret_pte;
+        goto again_hvm_page_io;
+    }
 
     // dom0 tries to map real machine's I/O region, but failed.
     // It is very likely that dom0 doesn't boot correctly because
@@ -975,7 +987,7 @@ assign_domain_page(struct domain *d,
     struct page_info* page = mfn_to_page(physaddr >> PAGE_SHIFT);
 
     BUG_ON((physaddr & _PAGE_PPN_MASK) != physaddr);
-    BUG_ON(page->count_info != (PGC_allocated | 1));
+    BUG_ON((page->count_info & ~PGC_xen_heap) != (PGC_allocated | 1));
     set_gpfn_from_mfn(physaddr >> PAGE_SHIFT, mpaddr >> PAGE_SHIFT);
     // because __assign_domain_page() uses set_pte_rel() which has
     // release semantics, smp_mb() isn't needed.
@@ -983,15 +995,48 @@ assign_domain_page(struct domain *d,
                                ASSIGN_writable | ASSIGN_pgc_allocated);
 }
 
+static void
+ioports_get_mmio_addr(const struct io_space *space,
+                      unsigned long fp, unsigned long lp,
+                      unsigned long *mmio_start, unsigned long *mmio_end)
+{
+    if (space->sparse) {
+        *mmio_start = IO_SPACE_SPARSE_ENCODING(fp) & PAGE_MASK;
+        *mmio_end = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
+    } else {
+        *mmio_start = fp & PAGE_MASK;
+        *mmio_end = PAGE_ALIGN(lp);
+    }
+}
+
+static unsigned long
+ioports_get_mmio_base(const struct io_space *space, struct domain *d)
+{
+    if (VMX_DOMAIN(d->vcpu[0]))
+        return LEGACY_IO_START;
+
+    if (space == &io_space[0] && d != dom0)
+        return IO_PORTS_PADDR;
+
+    return __pa(space->mmio_base);
+}
+
+/* 
+ * Input
+ * fgp: first guest port
+ * fmp: first machine port
+ * lmp: last machine port
+ */
 int
-ioports_permit_access(struct domain *d, unsigned int fp, unsigned int lp)
+ioports_permit_access(struct domain *d, unsigned int fgp,
+        unsigned int fmp, unsigned int lmp)
 {
     struct io_space *space;
-    unsigned long mmio_start, mmio_end, mach_start;
+    unsigned long mmio_start, mach_start, mach_end;
     int ret;
 
-    if (IO_SPACE_NR(fp) >= num_io_spaces) {
-        dprintk(XENLOG_WARNING, "Unknown I/O Port range 0x%x - 0x%x\n", fp, lp);
+    if (IO_SPACE_NR(fmp) >= num_io_spaces) {
+        dprintk(XENLOG_WARNING, "Unknown I/O Port range 0x%x - 0x%x\n", fmp, lmp);
         return -EFAULT;
     }
 
@@ -1005,42 +1050,32 @@ ioports_permit_access(struct domain *d, unsigned int fp, unsigned int lp)
      * I/O port spaces and thus will number port spaces differently.
      * This is ok, they don't make use of this interface.
      */
-    ret = rangeset_add_range(d->arch.ioport_caps, fp, lp);
+    ret = rangeset_add_range(d->arch.ioport_caps, fmp, lmp);
     if (ret != 0)
         return ret;
 
-    space = &io_space[IO_SPACE_NR(fp)];
+    space = &io_space[IO_SPACE_NR(fmp)];
 
     /* Legacy I/O on dom0 is already setup */
     if (d == dom0 && space == &io_space[0])
         return 0;
 
-    fp = IO_SPACE_PORT(fp);
-    lp = IO_SPACE_PORT(lp);
+    fmp = IO_SPACE_PORT(fmp);
+    lmp = IO_SPACE_PORT(lmp);
 
-    if (space->sparse) {
-        mmio_start = IO_SPACE_SPARSE_ENCODING(fp) & PAGE_MASK;
-        mmio_end = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp));
-    } else {
-        mmio_start = fp & PAGE_MASK;
-        mmio_end = PAGE_ALIGN(lp);
-    }
+    ioports_get_mmio_addr(space, fmp, lmp, &mach_start, &mach_end);
 
     /*
      * The "machine first port" is not necessarily identity mapped
      * to the guest first port.  At least for the legacy range.
      */
-    mach_start = mmio_start | __pa(space->mmio_base);
+    mach_start = mach_start | __pa(space->mmio_base);
+    mach_end = mach_end | __pa(space->mmio_base);
 
-    if (space == &io_space[0]) {
-        mmio_start |= IO_PORTS_PADDR;
-        mmio_end |= IO_PORTS_PADDR;
-    } else {
-        mmio_start |= __pa(space->mmio_base);
-        mmio_end |= __pa(space->mmio_base);
-    }
+    mmio_start = IO_SPACE_SPARSE_ENCODING(fgp) & PAGE_MASK;
+    mmio_start |= ioports_get_mmio_base(space, d);
 
-    while (mmio_start <= mmio_end) {
+    while (mach_start < mach_end) {
         (void)__assign_domain_page(d, mmio_start, mach_start, ASSIGN_nocache); 
         mmio_start += PAGE_SIZE;
         mach_start += PAGE_SIZE;
@@ -1081,18 +1116,9 @@ ioports_deny_access(struct domain *d, unsigned int fp, unsigned int lp)
     fp_base = IO_SPACE_PORT(fp);
     lp_base = IO_SPACE_PORT(lp);
 
-    if (space->sparse) {
-        mmio_start = IO_SPACE_SPARSE_ENCODING(fp_base) & PAGE_MASK;
-        mmio_end = PAGE_ALIGN(IO_SPACE_SPARSE_ENCODING(lp_base));
-    } else {
-        mmio_start = fp_base & PAGE_MASK;
-        mmio_end = PAGE_ALIGN(lp_base);
-    }
+    ioports_get_mmio_addr(space, fp_base, lp_base, &mmio_start, &mmio_end);
 
-    if (space == &io_space[0] && d != dom0)
-        mmio_base = IO_PORTS_PADDR;
-    else
-        mmio_base = __pa(space->mmio_base);
+    mmio_base = ioports_get_mmio_base(space, d);
 
     for (; mmio_start < mmio_end; mmio_start += PAGE_SIZE) {
         unsigned int port, range;
@@ -1235,17 +1261,18 @@ adjust_page_count_info(struct page_info* page)
         int ret = get_page(page, d);
         BUG_ON(ret == 0);
     } else {
-        u64 x, nx, y;
+        unsigned long x, nx, y;
 
-        y = *((u64*)&page->count_info);
+        y = page->count_info;
         do {
             x = y;
             nx = x + 1;
 
             BUG_ON((x >> 32) != 0);
             BUG_ON((nx & PGC_count_mask) != 2);
-            y = cmpxchg((u64*)&page->count_info, x, nx);
+            y = cmpxchg(&page->count_info, x, nx);
         } while (unlikely(y != x));
+        BUG_ON(page_get_owner(page) != NULL);
     }
 }
 
@@ -1413,6 +1440,8 @@ zap_domain_page_one(struct domain *d, unsigned long mpaddr,
     if (mfn == INVALID_MFN) {
         // clear pte
         old_pte = ptep_get_and_clear(mm, mpaddr, pte);
+        if(!pte_mem(old_pte))
+            return;
         mfn = pte_pfn(old_pte);
     } else {
         unsigned long old_arflags;
@@ -1422,7 +1451,8 @@ zap_domain_page_one(struct domain *d, unsigned long mpaddr,
     again:
         // memory_exchange() calls guest_physmap_remove_page() with
         // a stealed page. i.e. page owner = NULL.
-        BUG_ON(page_get_owner(mfn_to_page(mfn)) != d &&
+        BUG_ON(mfn_valid(mfn) &&
+               page_get_owner(mfn_to_page(mfn)) != d &&
                page_get_owner(mfn_to_page(mfn)) != NULL);
         old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
         old_pte = pfn_pte(mfn, __pgprot(old_arflags));
@@ -1445,12 +1475,46 @@ zap_domain_page_one(struct domain *d, unsigned long mpaddr,
         BUG_ON(mfn != pte_pfn(ret_pte));
     }
 
+    perfc_incr(zap_domain_page_one);
+    if(!mfn_valid(mfn))
+        return;
+
+    if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) ){
+        int i, j;
+        j = 1 << (PAGE_SHIFT-PAGE_SHIFT_4K);
+        for(i = 0 ; i < j; i++)
+            iommu_unmap_page(d, (mpaddr>>PAGE_SHIFT)*j + i);
+    }
+
     page = mfn_to_page(mfn);
     BUG_ON((page->count_info & PGC_count_mask) == 0);
 
     BUG_ON(clear_PGC_allocate && (page_get_owner(page) == NULL));
     domain_put_page(d, mpaddr, pte, old_pte, clear_PGC_allocate);
-    perfc_incr(zap_domain_page_one);
+}
+
+int
+deassign_domain_mmio_page(struct domain *d, unsigned long mpaddr,
+                        unsigned long phys_addr, unsigned long size )
+{
+    unsigned long addr = mpaddr & PAGE_MASK;
+    unsigned long end = PAGE_ALIGN(mpaddr + size);
+
+    if (size == 0) {
+        gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
+                __func__, d, mpaddr, size);
+    }
+    if (!efi_mmio(phys_addr, size)) {
+#ifndef NDEBUG
+        gdprintk(XENLOG_INFO, "%s: domain %p mpaddr 0x%lx size = 0x%lx\n",
+                __func__, d, mpaddr, size);
+#endif
+        return -EINVAL;
+    }
+
+    for (; addr < end; addr += PAGE_SIZE )
+        zap_domain_page_one(d, addr, 0, INVALID_MFN);
+    return 0;
 }
 
 unsigned long
@@ -2685,8 +2749,7 @@ steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
 #if 0 /* if big endian */
 # error "implement big endian version of steal_page()"
 #endif
-    u32 _d, _nd;
-    u64 x, nx, y;
+    unsigned long x, y;
 
     if (page_get_owner(page) != d) {
         gdprintk(XENLOG_INFO, "%s d 0x%p owner 0x%p\n",
@@ -2734,68 +2797,77 @@ steal_page(struct domain *d, struct page_info *page, unsigned int memflags)
     }
 
     spin_lock(&d->page_alloc_lock);
+    /* check again */
+    if (is_xen_heap_page(page) || page_get_owner(page) != d) {
+        goto fail;
+    }
 
     /*
-     * The tricky bit: atomically release ownership while there is just one
-     * benign reference to the page (PGC_allocated). If that reference
-     * disappears then the deallocation routine will safely spin.
+     * We require there is just one reference (PGC_allocated). We temporarily
+     * drop this reference now so that we can safely swizzle the owner.
      */
-    _d  = pickle_domptr(d);
-    y = *((u64*)&page->count_info);
+    y = page->count_info;
     do {
         x = y;
-        nx = x & 0xffffffff;
-        // page->count_info: untouched
-        // page->u.inused._domain = 0;
-        _nd = x >> 32;
 
         if (unlikely(((x & (PGC_count_mask | PGC_allocated)) !=
-                      (1 | PGC_allocated))) ||
-            unlikely(_nd != _d)) {
-            struct domain* nd = unpickle_domptr(_nd);
+                      (1 | PGC_allocated)))) {
+            struct domain* nd = page_get_owner(page);
             if (nd == NULL) {
                 gdprintk(XENLOG_INFO, "gnttab_transfer: "
-                        "Bad page %p: ed=%p(%u) 0x%x, "
-                        "sd=%p 0x%x,"
+                        "Bad page %p: ed=%p(%u), "
+                        "sd=%p,"
                         " caf=%016lx, taf=%" PRtype_info
                         " memflags 0x%x\n",
                         (void *) page_to_mfn(page),
-                        d, d->domain_id, _d,
-                        nd, _nd,
+                        d, d->domain_id,
+                        nd,
                         x,
                         page->u.inuse.type_info,
                         memflags);
             } else {
                 gdprintk(XENLOG_WARNING, "gnttab_transfer: "
-                        "Bad page %p: ed=%p(%u) 0x%x, "
-                        "sd=%p(%u) 0x%x,"
+                        "Bad page %p: ed=%p(%u), "
+                        "sd=%p(%u),"
                         " caf=%016lx, taf=%" PRtype_info
                         " memflags 0x%x\n",
                         (void *) page_to_mfn(page),
-                        d, d->domain_id, _d,
-                        nd, nd->domain_id, _nd,
+                        d, d->domain_id,
+                        nd, nd->domain_id,
                         x,
                         page->u.inuse.type_info,
                         memflags);
             }
-            spin_unlock(&d->page_alloc_lock);
-            return -1;
+            goto fail;
         }
 
-        y = cmpxchg((u64*)&page->count_info, x, nx);
+        y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
     } while (unlikely(y != x));
 
-    /*
-     * Unlink from 'd'. At least one reference remains (now anonymous), so
-     * noone else is spinning to try to delete this page from 'd'.
-     */
+    /* Swizzle the owner then reinstate the PGC_allocated reference. */
+    page_set_owner(page, NULL);
+    y = page->count_info;
+    do {
+        x = y;
+        BUG_ON((x & (PGC_count_mask | PGC_allocated)) != PGC_allocated);
+        y = cmpxchg(&page->count_info, x, x | 1);
+    } while (unlikely(y != x));
+
+    /* Unlink from original owner. */
     if ( !(memflags & MEMF_no_refcount) )
         d->tot_pages--;
-    list_del(&page->list);
+    page_list_del(page, &d->page_list);
 
     spin_unlock(&d->page_alloc_lock);
     perfc_incr(steal_page);
     return 0;
+
+ fail:
+    spin_unlock(&d->page_alloc_lock);
+    MEM_LOG("Bad page %p: ed=%p(%u), sd=%p, caf=%016lx, taf=%" PRtype_info,
+            (void *)page_to_mfn(page), d, d->domain_id,
+            page_get_owner(page), page->count_info, page->u.inuse.type_info);
+    return -1;
 }
 
 static void
@@ -2806,6 +2878,12 @@ __guest_physmap_add_page(struct domain *d, unsigned long gpfn,
     smp_mb();
     assign_domain_page_replace(d, gpfn << PAGE_SHIFT, mfn,
                                ASSIGN_writable | ASSIGN_pgc_allocated);
+    if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) ){
+        int i, j;
+        j = 1 << (PAGE_SHIFT-PAGE_SHIFT_4K);
+        for(i = 0 ; i < j; i++)
+            iommu_map_page(d, gpfn*j + i, mfn*j + i);
+    }
 }
 
 int
@@ -2816,7 +2894,8 @@ guest_physmap_add_page(struct domain *d, unsigned long gpfn,
 
     for (i = 0; i < (1UL << page_order); i++) {
         BUG_ON(!mfn_valid(mfn));
-        BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
+        BUG_ON((mfn_to_page(mfn)->count_info & ~PGC_xen_heap) !=
+               (PGC_allocated | 1));
         __guest_physmap_add_page(d, gpfn, mfn);
         mfn++;
         gpfn++;
@@ -2977,48 +3056,27 @@ void domain_cache_flush (struct domain *d, int sync_only)
     //printk ("domain_cache_flush: %d %d pages\n", d->domain_id, nbr_page);
 }
 
-#ifdef VERBOSE
-#define MEM_LOG(_f, _a...)                           \
-  printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
-         current->domain->domain_id , __LINE__ , ## _a )
-#else
-#define MEM_LOG(_f, _a...) ((void)0)
-#endif
-
-static void free_page_type(struct page_info *page, u32 type)
+static void free_page_type(struct page_info *page, unsigned long type)
 {
 }
 
-static int alloc_page_type(struct page_info *page, u32 type)
+static int alloc_page_type(struct page_info *page, unsigned long type)
 {
        return 1;
 }
 
-static int opt_p2m_xenheap;
-boolean_param("p2m_xenheap", opt_p2m_xenheap);
-
 void *pgtable_quicklist_alloc(void)
 {
+    struct page_info *page;
     void *p;
 
     BUG_ON(dom_p2m == NULL);
-    if (!opt_p2m_xenheap) {
-        struct page_info *page = alloc_domheap_page(dom_p2m, 0);
-        if (page == NULL)
-            return NULL;
-        p = page_to_virt(page);
-        clear_page(p);
-        return p;
-    }
-    p = alloc_xenheap_pages(0);
-    if (p) {
-        clear_page(p);
-        /*
-         * This page should be read only.  At this moment, the third
-         * argument doesn't make sense.  It should be 1 when supported.
-         */
-        share_xen_page_with_guest(virt_to_page(p), dom_p2m, 0);
-    }
+    page = alloc_domheap_page(dom_p2m, 0);
+    if (page == NULL)
+        return NULL;
+
+    p = page_to_virt(page);
+    clear_page(p);
     return p;
 }
 
@@ -3030,8 +3088,6 @@ void pgtable_quicklist_free(void *pgtable_entry)
     BUG_ON(page->count_info != (1 | PGC_allocated));
 
     put_page(page);
-    if (opt_p2m_xenheap)
-        free_xenheap_page(pgtable_entry);
 }
 
 void put_page_type(struct page_info *page)
@@ -3095,7 +3151,7 @@ static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
 }
 
 
-int get_page_type(struct page_info *page, u32 type)
+int get_page_type(struct page_info *page, unsigned long type)
 {
     u64 nx, x, y = page->u.inuse.type_info;
 
@@ -3143,7 +3199,7 @@ int get_page_type(struct page_info *page, u32 type)
         {
             if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
                  (type != PGT_l1_page_table) )
-                MEM_LOG("Bad type (saw %08lx != exp %08x) "
+                MEM_LOG("Bad type (saw %08lx != exp %08lx) "
                         "for mfn %016lx (pfn %016lx)",
                         x, type, page_to_mfn(page),
                         get_gpfn_from_mfn(page_to_mfn(page)));
@@ -3164,8 +3220,8 @@ int get_page_type(struct page_info *page, u32 type)
         /* Try to validate page type; drop the new reference on failure. */
         if ( unlikely(!alloc_page_type(page, type)) )
         {
-            MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08x"
-                    ": caf=%08x taf=%" PRtype_info,
+            MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %08lx"
+                    ": caf=%016lx taf=%" PRtype_info,
                     page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
                     type, page->count_info, page->u.inuse.type_info);
             /* Noone else can get a reference. We hold the only ref. */
@@ -3180,9 +3236,56 @@ int get_page_type(struct page_info *page, u32 type)
     return 1;
 }
 
-int memory_is_conventional_ram(paddr_t p)
+int page_is_ram_type(unsigned long mfn, unsigned long type)
 {
-    return (efi_mem_type(p) == EFI_CONVENTIONAL_MEMORY);
+    u32 mem_type = efi_mem_type(pfn_to_paddr(mfn));
+
+    if (type & RAM_TYPE_CONVENTIONAL)
+    {
+        switch (mem_type)
+        {
+        case EFI_BOOT_SERVICES_CODE:
+        case EFI_BOOT_SERVICES_DATA:
+        case EFI_LOADER_CODE:
+        case EFI_LOADER_DATA:
+        case EFI_CONVENTIONAL_MEMORY:
+            return 1;
+        default:
+            break;
+        }       
+    }
+    if (type & RAM_TYPE_RESERVED)
+    {
+        switch (mem_type)
+        {
+        case EFI_RUNTIME_SERVICES_CODE:
+        case EFI_RUNTIME_SERVICES_DATA:
+        case EFI_RESERVED_TYPE:
+        case EFI_MEMORY_MAPPED_IO:
+        case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
+        case EFI_PAL_CODE:
+            return 1;
+        default:
+            break;
+        }
+    }
+    if (type & RAM_TYPE_ACPI)
+    {
+        switch (mem_type)
+        {
+        case EFI_ACPI_RECLAIM_MEMORY:
+        case EFI_ACPI_MEMORY_NVS:
+            return 1;
+        default:
+            break;
+        }
+    }
+    else if (type & RAM_TYPE_UNUSABLE)
+    {
+        return (mem_type == EFI_UNUSABLE_MEMORY);
+    }
+
+    return 0;
 }
 
 
@@ -3229,38 +3332,39 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
 
             spin_unlock(&d->grant_table->lock);
             break;
-        case XENMAPSPACE_mfn:
-        {
-            if ( get_page_from_pagenr(xatp.idx, d) ) {
-                struct xen_ia64_memmap_info memmap_info;
-                efi_memory_desc_t md;
-                int ret;
-
-                mfn = xatp.idx;
-                page = mfn_to_page(mfn);
-
-                memmap_info.efi_memmap_size = sizeof(md);
-                memmap_info.efi_memdesc_size = sizeof(md);
-                memmap_info.efi_memdesc_version =
-                    EFI_MEMORY_DESCRIPTOR_VERSION;
-
-                md.type = EFI_CONVENTIONAL_MEMORY;
-                md.pad = 0;
-                md.phys_addr = xatp.gpfn << PAGE_SHIFT;
-                md.virt_addr = 0;
-                md.num_pages = 1UL << (PAGE_SHIFT - EFI_PAGE_SHIFT);
-                md.attribute = EFI_MEMORY_WB;
-
-                ret = __dom0vp_add_memdesc(d, &memmap_info, (char*)&md);
-                if (ret != 0) {
-                    put_page(page);
-                    rcu_unlock_domain(d);
-                    gdprintk(XENLOG_DEBUG,
-                             "%s:%d td %d gpfn 0x%lx mfn 0x%lx ret %d\n",
-                             __func__, __LINE__,
-                             d->domain_id, xatp.gpfn, xatp.idx, ret);
-                    return ret;
-                }
+        case XENMAPSPACE_gmfn: {
+            struct xen_ia64_memmap_info memmap_info;
+            efi_memory_desc_t md;
+            int ret;
+
+            xatp.idx = gmfn_to_mfn(d, xatp.idx);
+            if ( !get_page_from_pagenr(xatp.idx, d) )
+                break;
+
+            mfn = xatp.idx;
+            page = mfn_to_page(mfn);
+
+            memmap_info.efi_memmap_size = sizeof(md);
+            memmap_info.efi_memdesc_size = sizeof(md);
+            memmap_info.efi_memdesc_version =
+                EFI_MEMORY_DESCRIPTOR_VERSION;
+
+            md.type = EFI_CONVENTIONAL_MEMORY;
+            md.pad = 0;
+            md.phys_addr = xatp.gpfn << PAGE_SHIFT;
+            md.virt_addr = 0;
+            md.num_pages = 1UL << (PAGE_SHIFT - EFI_PAGE_SHIFT);
+            md.attribute = EFI_MEMORY_WB;
+
+            ret = __dom0vp_add_memdesc(d, &memmap_info, (char*)&md);
+            if (ret != 0) {
+                put_page(page);
+                rcu_unlock_domain(d);
+                gdprintk(XENLOG_DEBUG,
+                         "%s:%d td %d gpfn 0x%lx mfn 0x%lx ret %d\n",
+                         __func__, __LINE__,
+                         d->domain_id, xatp.gpfn, xatp.idx, ret);
+                return ret;
             }
             break;
         }
@@ -3300,7 +3404,6 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
         /* Map at new location. */
         /* Here page->count_info = PGC_allocated | N where N >= 1*/
         __guest_physmap_add_page(d, xatp.gpfn, mfn);
-        page = NULL; /* prevent put_page() */
 
     out:
         domain_unlock(d);
@@ -3313,34 +3416,6 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
         break;
     }
 
-    case XENMEM_remove_from_physmap:
-    {
-        struct xen_remove_from_physmap xrfp;
-        unsigned long mfn;
-        struct domain *d;
-
-        if ( copy_from_guest(&xrfp, arg, 1) )
-            return -EFAULT;
-
-        rc = rcu_lock_target_domain_by_id(xrfp.domid, &d);
-        if ( rc != 0 )
-            return rc;
-
-        domain_lock(d);
-
-        mfn = gmfn_to_mfn(d, xrfp.gpfn);
-
-        if ( mfn_valid(mfn) )
-            guest_physmap_remove_page(d, xrfp.gpfn, mfn, 0);
-
-        domain_unlock(d);
-
-        rcu_unlock_domain(d);
-
-        break;
-    }
-
-
     case XENMEM_machine_memory_map:
     {
         struct xen_memory_map memmap;
@@ -3372,6 +3447,45 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
         return 0;
     }
 
+    case XENMEM_get_pod_target:
+    case XENMEM_set_pod_target: {
+        /* XXX: PoD populate on demand isn't supported yet. */
+        xen_pod_target_t target;
+        struct domain *d;
+
+        /* Support DOMID_SELF? */
+        if ( !IS_PRIV(current->domain) )
+            return -EINVAL;
+
+        if ( copy_from_guest(&target, arg, 1) )
+            return -EFAULT;
+
+        rc = rcu_lock_target_domain_by_id(target.domid, &d);
+        if ( rc != 0 )
+            return rc;
+
+        if ( op == XENMEM_set_pod_target )
+        {
+            /* if -ENOSYS is returned,
+               domain builder aborts domain creation. */
+            /* rc = -ENOSYS; */
+        }
+
+        target.tot_pages       = d->tot_pages;
+        target.pod_cache_pages = 0;
+        target.pod_entries     = 0;
+
+        if ( copy_to_guest(arg, &target, 1) )
+        {
+            rc= -EFAULT;
+            goto pod_target_out_unlock;
+        }
+        
+    pod_target_out_unlock:
+        rcu_unlock_domain(d);
+        return rc;
+    }
+
     default:
         return -ENOSYS;
     }
@@ -3403,16 +3517,21 @@ void xencomm_mark_dirty(unsigned long addr, unsigned int len)
     __xencomm_mark_dirty(current->domain, addr, len);
 }
 
-int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn)
+/* stubs for populate on demand */
+int
+guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
+                                      unsigned int order)
 {
-    /* STUB to compile */
+    gdprintk(XENLOG_WARNING, "populate on demand isn't supported yet\n");
     return -ENOSYS;
 }
 
-int iommu_unmap_page(struct domain *d, unsigned long gfn)
+int
+p2m_pod_decrease_reservation(struct domain *d, xen_pfn_t gpfn,
+                             unsigned int order)
 {
-    /* STUB to compile */
-    return -ENOSYS;
+    gdprintk(XENLOG_WARNING, "populate on demand isn't supported yet\n");
+    return 0;
 }
 
 /*
index 69c6bd5af8184b7193552478b0c5c22c6118465a..81acf4f6c3b2e726cecddd5d971e64acd4d4cd3d 100644 (file)
@@ -10,6 +10,7 @@
 #include <xen/types.h>
 #include <xen/lib.h>
 #include <xen/sched.h>
+#include <xen/domain.h>
 #include <xen/guest_access.h>
 #include <xen/acpi.h>
 #include <public/platform.h>
@@ -20,15 +21,6 @@ DEFINE_SPINLOCK(xenpf_lock);
 extern int set_px_pminfo(uint32_t cpu, struct xen_processor_performance *perf);
 extern long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power);
 
-int xenpf_copy_px_states(struct processor_performance *pxpt,
-        struct xen_processor_performance *dom0_px_info)
-{
-    if (!pxpt || !dom0_px_info)
-        return -EINVAL;
-    return  copy_from_guest(pxpt->states, dom0_px_info->states,
-                    dom0_px_info->state_count);
-}
-
 long do_platform_op(XEN_GUEST_HANDLE(xen_platform_op_t) u_xenpf_op)
 {
     long ret = 0;
@@ -50,6 +42,11 @@ long do_platform_op(XEN_GUEST_HANDLE(xen_platform_op_t) u_xenpf_op)
         switch ( op->u.set_pminfo.type )
         {
         case XEN_PM_PX:
+            if ( !(xen_processor_pmbits & XEN_PROCESSOR_PM_PX) )
+            {
+                ret = -ENOSYS;
+                break;
+            }
             ret = set_px_pminfo(op->u.set_pminfo.id,
                     &op->u.set_pminfo.perf);
             break;
index f2ef96c06060f8c73d7c25efec8d7fe653ec8daf..c23cb903e8dac95e8c99282df4fcf6d5eb8a4a1f 100644 (file)
@@ -100,6 +100,7 @@ static unsigned long allocate_metaphysical_rr(struct domain *d, int n)
 
 static int implemented_rid_bits = 0;
 static int mp_rid_shift;
+static DEFINE_SPINLOCK(ridblock_lock);
 static struct domain *ridblock_owner[MAX_RID_BLOCKS] = { 0 };
 
 void __init init_rid_allocator (void)
@@ -169,6 +170,7 @@ int allocate_rid_range(struct domain *d, unsigned long ridbits)
        n_rid_blocks = 1UL << (ridbits - IA64_MIN_IMPL_RID_BITS);
        
        // skip over block 0, reserved for "meta-physical mappings (and Xen)"
+       spin_lock(&ridblock_lock);
        for (i = n_rid_blocks; i < MAX_RID_BLOCKS; i += n_rid_blocks) {
                if (ridblock_owner[i] == NULL) {
                        for (j = i; j < i + n_rid_blocks; ++j) {
@@ -182,16 +184,19 @@ int allocate_rid_range(struct domain *d, unsigned long ridbits)
                                break;
                }
        }
-       
-       if (i >= MAX_RID_BLOCKS)
+
+       if (i >= MAX_RID_BLOCKS) {
+               spin_unlock(&ridblock_lock);
                return 0;
-       
+       }
+
        // found an unused block:
        //   (i << min_rid_bits) <= rid < ((i + n) << min_rid_bits)
        // mark this block as owned
        for (j = i; j < i + n_rid_blocks; ++j)
                ridblock_owner[j] = d;
-       
+       spin_unlock(&ridblock_lock);
+
        // setup domain struct
        d->arch.rid_bits = ridbits;
        d->arch.starting_rid = i << IA64_MIN_IMPL_RID_BITS;
@@ -221,11 +226,12 @@ int deallocate_rid_range(struct domain *d)
        if (d->arch.rid_bits == 0)
                return 1;
 
-       
+       spin_lock(&ridblock_lock);
        for (i = rid_block_start; i < rid_block_end; ++i) {
                ASSERT(ridblock_owner[i] == d);
                ridblock_owner[i] = NULL;
        }
+       spin_unlock(&ridblock_lock);
 
        d->arch.rid_bits = 0;
        d->arch.starting_rid = 0;
index d9e30a2db561b62cedd5b63c5feaaa84c0fbd5c4..d400359233f309754614d18fb9b636a99e8c0db7 100644 (file)
@@ -56,7 +56,7 @@ tlb_track_allocate_entries(struct tlb_track* tlb_track)
         return -ENOMEM;
     }
 
-    list_add(&entry_page->list, &tlb_track->page_list);
+    page_list_add(entry_page, &tlb_track->page_list);
     track_entries = (struct tlb_track_entry*)page_to_virt(entry_page);
     allocated = PAGE_SIZE / sizeof(track_entries[0]);
     tlb_track->num_entries += allocated;
@@ -93,7 +93,7 @@ tlb_track_create(struct domain* d)
     tlb_track->limit = TLB_TRACK_LIMIT_ENTRIES;
     tlb_track->num_entries = 0;
     tlb_track->num_free = 0;
-    INIT_LIST_HEAD(&tlb_track->page_list);
+    INIT_PAGE_LIST_HEAD(&tlb_track->page_list);
     if (tlb_track_allocate_entries(tlb_track) < 0)
         goto out;
 
@@ -136,8 +136,8 @@ tlb_track_destroy(struct domain* d)
     spin_lock(&tlb_track->free_list_lock);
     BUG_ON(tlb_track->num_free != tlb_track->num_entries);
 
-    list_for_each_entry_safe(page, next, &tlb_track->page_list, list) {
-        list_del(&page->list);
+    page_list_for_each_safe(page, next, &tlb_track->page_list) {
+        page_list_del(page, &tlb_track->page_list);
         free_domheap_page(page);
     }
 
index 529717fe7ae3c9763423f705227e889fe69d189f..40e3821e86d0616ab6ce2836d39bfc471ee55268 100644 (file)
@@ -1355,6 +1355,26 @@ vcpu_get_domain_bundle(VCPU * vcpu, REGS * regs, u64 gip,
                // copy its value to the variable, tr, before use.
                TR_ENTRY tr;
 
+               // fast path:
+               // try to access gip with guest virtual address directly.
+               // This may cause tlb miss. see vcpu_translate(). Be careful!
+               swap_rr0 = (!region && PSCB(vcpu, metaphysical_mode));
+               if (swap_rr0) {
+                       set_virtual_rr0();
+               }
+               *bundle = __get_domain_bundle(gip);
+               if (swap_rr0) {
+                       set_metaphysical_rr0();
+               }
+               
+               if (!bundle->i64[0] && !bundle->i64[1]) {
+                       dprintk(XENLOG_INFO, "%s gip 0x%lx\n", __func__, gip);
+               } else {
+                       // Okay, mDTC successed
+                       return 1;
+               }
+               // mDTC failed, so try vTLB.
+
                trp = vcpu_tr_lookup(vcpu, gip, rid, 0);
                if (trp != NULL) {
                        tr = *trp;
@@ -1374,28 +1394,13 @@ vcpu_get_domain_bundle(VCPU * vcpu, REGS * regs, u64 gip,
                        tr = *trp;
                        goto found;
                }
-#if 0
                tr = PSCBX(vcpu, dtlb);
                if (vcpu_match_tr_entry(&tr, gip, rid)) {
                        goto found;
                }
-#endif
 
-               // try to access gip with guest virtual address
-               // This may cause tlb miss. see vcpu_translate(). Be careful!
-               swap_rr0 = (!region && PSCB(vcpu, metaphysical_mode));
-               if (swap_rr0) {
-                       set_virtual_rr0();
-               }
-               *bundle = __get_domain_bundle(gip);
-               if (swap_rr0) {
-                       set_metaphysical_rr0();
-               }
-               if (bundle->i64[0] == 0 && bundle->i64[1] == 0) {
-                       dprintk(XENLOG_INFO, "%s gip 0x%lx\n", __func__, gip);
-                       return 0;
-               }
-               return 1;
+               // mDTC and vTLB failed. so reflect tlb miss into the guest.
+               return 0;
 
        found:
                gpip = ((tr.pte.ppn >> (tr.ps - 12)) << tr.ps) |
index a7a6076073318f4eeabcc2ac271f2bf7c63166fa..f4ea1453c20554fb5a1bb5dc8fb952cfd48214d9 100644 (file)
@@ -49,6 +49,7 @@ paging_init (void)
 {
        unsigned int mpt_order;
        unsigned long mpt_table_size;
+       struct page_info *page;
        unsigned long i;
 
        if (!opt_contig_mem) {
@@ -64,9 +65,11 @@ paging_init (void)
        mpt_table_size = max_page * sizeof(unsigned long);
        mpt_order = get_order(mpt_table_size);
        ASSERT(mpt_order <= MAX_ORDER);
-       if ((mpt_table = alloc_xenheap_pages(mpt_order)) == NULL)
+       page = alloc_domheap_pages(NULL, mpt_order, 0);
+       if (page == NULL)
                panic("Not enough memory to bootstrap Xen.\n");
 
+       mpt_table = page_to_virt(page);
        printk("machine to physical table: 0x%lx mpt_table_size 0x%lx\n"
               "mpt_order %u max_page 0x%lx\n",
               (u64)mpt_table, mpt_table_size, mpt_order, max_page);
index fb242708d35266180f25932284a377706231dc05..c3affb4b0d2dd49d7044edc887bb5692968e3f1f 100644 (file)
@@ -33,7 +33,7 @@
 #include <asm/sn/simulator.h>
 #include <asm/sal.h>
 
-unsigned long xenheap_phys_end, total_pages;
+unsigned long total_pages;
 
 char saved_command_line[COMMAND_LINE_SIZE];
 char __initdata dom0_command_line[COMMAND_LINE_SIZE];
@@ -72,27 +72,10 @@ integer_param("xencons", opt_xencons);
 static int __initdata opt_xencons_poll;
 boolean_param("xencons_poll", opt_xencons_poll);
 
+#define XENHEAP_DEFAULT_SIZE    KERNEL_TR_PAGE_SIZE
+#define XENHEAP_SIZE_MIN        (16 * 1024 * 1024)      /* 16MBytes */
 unsigned long xenheap_size = XENHEAP_DEFAULT_SIZE;
 unsigned long xen_pstart;
-void *xen_pickle_offset __read_mostly;
-
-static void __init parse_xenheap_megabytes(char *s)
-{
-    unsigned long megabytes = simple_strtoll(s, NULL, 0);
-
-#define XENHEAP_MEGABYTES_MIN   16UL
-    if (megabytes < XENHEAP_MEGABYTES_MIN)
-        megabytes = XENHEAP_MEGABYTES_MIN;
-
-#define XENHEAP_MEGABYTES_MAX   4096UL  /* need more? If so,
-                                           __pickle()/__unpickle() must be
-                                           revised. */
-    if (megabytes > XENHEAP_MEGABYTES_MAX)
-        megabytes = XENHEAP_MEGABYTES_MAX;
-
-    xenheap_size =  megabytes * 1024 * 1024;
-}
-custom_param("xenheap_megabytes", parse_xenheap_megabytes);
 
 static int __init
 xen_count_pages(u64 start, u64 end, void *arg)
@@ -318,7 +301,7 @@ init_xenheap_mds(unsigned long start, unsigned long end, void *arg)
             unsigned long s = max(start, max(__pa(desc->xen_heap_start),
                                              md->phys_addr));
             unsigned long e = min(end, min(md_end, desc->xenheap_phys_end));
-            init_xenheap_pages(s, e);
+            init_boot_pages(s, e);
         }
     }
 
@@ -354,6 +337,8 @@ is_platform_hp_ski(void)
 static int __initdata dom0_vhpt_size_log2;
 integer_param("dom0_vhpt_size_log2", dom0_vhpt_size_log2);
 #endif
+unsigned long xen_fixed_mfn_start __read_mostly;
+unsigned long xen_fixed_mfn_end __read_mostly;
 
 void __init start_kernel(void)
 {
@@ -365,6 +350,7 @@ void __init start_kernel(void)
     struct domain *idle_domain;
     struct vcpu *dom0_vcpu0;
     efi_memory_desc_t *kern_md, *last_md, *md;
+    unsigned long xenheap_phys_end;
     void *xen_heap_start;
     struct xen_heap_desc heap_desc;
 #ifdef CONFIG_SMP
@@ -425,11 +411,9 @@ void __init start_kernel(void)
      * for the actual xenheap.
      */
     max_page = efi_get_max_addr() >> PAGE_SHIFT;
-    while ((max_page >> 3) > xenheap_size - (XENHEAP_MEGABYTES_MIN << 20))
+    while ((max_page >> 3) > xenheap_size - XENHEAP_SIZE_MIN)
         xenheap_size <<= 1;
 
-    BUG_ON(xenheap_size > (XENHEAP_MEGABYTES_MAX << 20));
-
     xenheap_phys_end = xen_pstart + xenheap_size;
     printk("xen image pstart: 0x%lx, xenheap pend: 0x%lx\n",
            xen_pstart, xenheap_phys_end);
@@ -530,14 +514,6 @@ skip_move:
     printk("find_memory: efi_memmap_walk returns max_page=%lx\n",max_page);
     efi_print();
     
-    /*
-     * later [__init_begin, __init_end) will be freed up as xen heap
-     * so that struct domain might be allocated from the init area
-     * which is < xen_heap_start. so we can't simply set
-     * xen_pickle_offset = xen_heap_start.
-     */
-    xen_pickle_offset = ia64_imva(__init_begin);
-
     xen_heap_start = memguard_init(ia64_imva(&_end));
     printk("Before xen_heap_start: %p\n", xen_heap_start);
     xen_heap_start = __va(init_boot_allocator(__pa(xen_heap_start)));
@@ -582,6 +558,10 @@ skip_move:
            (xenheap_phys_end-__pa(xen_heap_start)) >> 20,
            (xenheap_phys_end-__pa(xen_heap_start)) >> 10);
 
+    /* for is_xen_fixed_mfn() */
+    xen_fixed_mfn_start = virt_to_mfn(&_start);
+    xen_fixed_mfn_end = virt_to_mfn(xen_heap_start);
+
     end_boot_allocator();
 
     softirq_init();
@@ -740,3 +720,10 @@ void arch_get_xen_caps(xen_capabilities_info_t *info)
     }
 }
 
+int xen_in_range(paddr_t start, paddr_t end)
+{
+    paddr_t xs = __pa(&_start);
+    paddr_t xe = __pa(&_end);
+
+    return (start < xe) && (end > xs);
+}
index eb9dd08f47f4610e582d573ac40167f194b105de..e38e5414ce7475093aa68a8733815477607912b6 100644 (file)
@@ -11,6 +11,7 @@ subdir-$(x86_64) += x86_64
 obj-y += apic.o
 obj-y += bitops.o
 obj-y += clear_page.o
+obj-y += copy_page.o
 obj-y += compat.o
 obj-y += delay.o
 obj-y += dmi_scan.o
@@ -36,7 +37,6 @@ obj-y += nmi.o
 obj-y += numa.o
 obj-y += pci.o
 obj-y += physdev.o
-obj-y += rwlock.o
 obj-y += setup.o
 obj-y += shutdown.o
 obj-y += smp.o
@@ -53,6 +53,7 @@ obj-y += machine_kexec.o
 obj-y += crash.o
 obj-y += tboot.o
 obj-y += hpet.o
+obj-y += bzimage.o
 
 obj-$(crash_debug) += gdbstub.o
 
@@ -78,10 +79,10 @@ $(TARGET)-syms: $(ALL_OBJS) xen.lds
            $(@D)/.$(@F).1.o -o $@
        rm -f $(@D)/.$(@F).[0-9]*
 
-asm-offsets.s: $(TARGET_SUBARCH)/asm-offsets.c $(HDRS)
+asm-offsets.s: $(TARGET_SUBARCH)/asm-offsets.c
        $(CC) $(CFLAGS) -S -o $@ $<
 
-xen.lds: $(TARGET_SUBARCH)/xen.lds.S $(HDRS)
+xen.lds: $(TARGET_SUBARCH)/xen.lds.S
        $(CC) -P -E -Ui386 $(AFLAGS) -o $@ $<
 
 boot/mkelf32: boot/mkelf32.c
@@ -90,4 +91,5 @@ boot/mkelf32: boot/mkelf32.c
 .PHONY: clean
 clean::
        rm -f asm-offsets.s xen.lds boot/*.o boot/*~ boot/core boot/mkelf32
-       rm -f $(BASEDIR)/.xen-syms.[0-9]*
+       rm -f $(BASEDIR)/.xen-syms.[0-9]* boot/.*.d
+       rm -f boot/reloc.S boot/reloc.lnk boot/reloc.bin
index 9380ccd780ca374b5e25b104bf8e66828686df81..e9e1d5b95255a8629daa06bdc2b59b633981929f 100644 (file)
@@ -26,9 +26,9 @@ CFLAGS += -I$(BASEDIR)/include/asm-x86/mach-default
 CFLAGS += -msoft-float
 
 # Disable PIE/SSP if GCC supports them. They can break us.
-CFLAGS += $(call cc-option,$(CC),-nopie,)
-CFLAGS += $(call cc-option,$(CC),-fno-stack-protector,)
-CFLAGS += $(call cc-option,$(CC),-fno-stack-protector-all,)
+$(call cc-option-add,CFLAGS,CC,-nopie)
+$(call cc-option-add,CFLAGS,CC,-fno-stack-protector)
+$(call cc-option-add,CFLAGS,CC,-fno-stack-protector-all)
 
 ifeq ($(supervisor_mode_kernel),y)
 CFLAGS += -DCONFIG_X86_SUPERVISOR_MODE_KERNEL=1
@@ -45,16 +45,12 @@ ifeq ($(TARGET_SUBARCH),x86_64)
 CFLAGS += -mno-red-zone -fpic -fno-reorder-blocks
 CFLAGS += -fno-asynchronous-unwind-tables
 # -fvisibility=hidden reduces -fpic cost, if it's available
-CFLAGS += $(call cc-option,$(CC),-fvisibility=hidden,)
-CFLAGS := $(subst -fvisibility=hidden,-DGCC_HAS_VISIBILITY_ATTRIBUTE,$(CFLAGS))
+ifneq ($(call cc-option,$(CC),-fvisibility=hidden,n),n)
+CFLAGS += -DGCC_HAS_VISIBILITY_ATTRIBUTE
+endif
 x86_32 := n
 x86_64 := y
 endif
 
-HDRS += $(wildcard $(BASEDIR)/include/asm-x86/hvm/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-x86/hvm/svm/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-x86/hvm/vmx/*.h)
-HDRS += $(wildcard $(BASEDIR)/include/asm-x86/mach-*/*.h)
-
 # Require GCC v3.4+ (to avoid issues with alignment constraints in Xen headers)
 $(call cc-ver-check,CC,0x030400,"Xen requires at least gcc-3.4")
index 055e3d7b51c705b947f1fd720f53104af2f3bcd2..95e7fff4674f7f8db59cb0d2023acfbf7eb5d763 100644 (file)
@@ -283,25 +283,6 @@ acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end
 
 #endif /* CONFIG_X86_IO_APIC */
 
-static unsigned long __init
-acpi_scan_rsdp(unsigned long start, unsigned long length)
-{
-       unsigned long offset = 0;
-       unsigned long sig_len = sizeof("RSD PTR ") - 1;
-
-       /*
-        * Scan all 16-byte boundaries of the physical memory region for the
-        * RSDP signature.
-        */
-       for (offset = 0; offset < length; offset += 16) {
-               if (strncmp((char *)(start + offset), "RSD PTR ", sig_len))
-                       continue;
-               return (start + offset);
-       }
-
-       return 0;
-}
-
 static int __init acpi_parse_sbf(struct acpi_table_header *table)
 {
        struct acpi_table_boot *sb;
@@ -371,16 +352,9 @@ extern u32 pmtmr_ioport;
 static void __init
 acpi_fadt_parse_sleep_info(struct acpi_table_fadt *fadt)
 {
-       struct acpi_table_rsdp *rsdp;
-       unsigned long rsdp_phys;
        struct acpi_table_facs *facs = NULL;
        uint64_t facs_pa;
 
-       rsdp_phys = acpi_find_rsdp();
-       if (!rsdp_phys || acpi_disabled)
-               goto bad;
-       rsdp = __va(rsdp_phys);
-
        acpi_fadt_copy_address(pm1a_cnt, pm1a_control, pm1_control);
        acpi_fadt_copy_address(pm1b_cnt, pm1b_control, pm1_control);
        acpi_fadt_copy_address(pm1a_evt, pm1a_event, pm1_event);
@@ -483,29 +457,6 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
        return 0;
 }
 
-unsigned long __init acpi_find_rsdp(void)
-{
-       unsigned long rsdp_phys = 0;
-
-#if 0
-       if (efi_enabled) {
-               if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
-                       return efi.acpi20;
-               else if (efi.acpi != EFI_INVALID_TABLE_ADDR)
-                       return efi.acpi;
-       }
-#endif
-       /*
-        * Scan memory looking for the RSDP signature. First search EBDA (low
-        * memory) paragraphs and then search upper memory (E0000-FFFFF).
-        */
-       rsdp_phys = acpi_scan_rsdp(0, 0x400);
-       if (!rsdp_phys)
-               rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000);
-
-       return rsdp_phys;
-}
-
 #ifdef CONFIG_X86_LOCAL_APIC
 /*
  * Parse LAPIC entries in MADT
@@ -601,7 +552,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 
        count =
            acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr,
-                                 NR_IRQ_VECTORS);
+                                 MAX_IRQ_SOURCES);
        if (count < 0) {
                printk(KERN_ERR PREFIX
                       "Error parsing interrupt source overrides entry\n");
@@ -623,7 +574,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 
        count =
            acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src,
-                                 NR_IRQ_VECTORS);
+                                 MAX_IRQ_SOURCES);
        if (count < 0) {
                printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
                /* TBD: Cleanup to allow fallback to MPS */
index 176129489e598410f8bbb5f2d7e592da3adc5a3d..f8302d1f54627b2bf97306af05b777284915e537 100644 (file)
 
 #define DEBUG_PM_CX
 
-#define US_TO_PM_TIMER_TICKS(t)     ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
-#define PM_TIMER_TICKS_TO_US(t)     ((t * 1000) / (PM_TIMER_FREQUENCY / 1000))
-#define C2_OVERHEAD         4   /* 1us (3.579 ticks per us) */
-#define C3_OVERHEAD         4   /* 1us (3.579 ticks per us) */
-
 static void (*lapic_timer_off)(void);
 static void (*lapic_timer_on)(void);
 
 extern u32 pmtmr_ioport;
 extern void (*pm_idle) (void);
+extern void (*dead_idle) (void);
 
 static void (*pm_idle_save) (void) __read_mostly;
 unsigned int max_cstate __read_mostly = ACPI_PROCESSOR_MAX_POWER - 1;
@@ -71,23 +67,32 @@ static struct acpi_processor_power *__read_mostly processor_powers[NR_CPUS];
 
 static void print_acpi_power(uint32_t cpu, struct acpi_processor_power *power)
 {
-    uint32_t i;
+    uint32_t i, idle_usage = 0;
+    uint64_t res, idle_res = 0;
 
     printk("==cpu%d==\n", cpu);
     printk("active state:\t\tC%d\n",
-           power->last_state ? (int)(power->last_state - power->states) : -1);
+           power->last_state ? power->last_state->idx : -1);
     printk("max_cstate:\t\tC%d\n", max_cstate);
     printk("states:\n");
     
     for ( i = 1; i < power->count; i++ )
     {
-        printk((power->last_state == &power->states[i]) ? "   *" : "    ");
+        res = acpi_pm_tick_to_ns(power->states[i].time);
+        idle_usage += power->states[i].usage;
+        idle_res += res;
+
+        printk((power->last_state && power->last_state->idx == i) ?
+               "   *" : "    ");
         printk("C%d:\t", i);
         printk("type[C%d] ", power->states[i].type);
         printk("latency[%03d] ", power->states[i].latency);
         printk("usage[%08d] ", power->states[i].usage);
-        printk("duration[%"PRId64"]\n", power->states[i].time);
+        printk("duration[%"PRId64"]\n", res);
     }
+    printk("    C0:\tusage[%08d] duration[%"PRId64"]\n",
+           idle_usage, NOW() - idle_res);
+
 }
 
 static void dump_cx(unsigned char key)
@@ -139,39 +144,26 @@ static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
 
 static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 {
-    if ( cx->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE )
+    int unused;
+
+    switch ( cx->entry_method )
     {
+    case ACPI_CSTATE_EM_FFH:
         /* Call into architectural FFH based C-state */
         acpi_processor_ffh_cstate_enter(cx);
-    }
-    else
-    {
-        int unused;
+        return;
+    case ACPI_CSTATE_EM_SYSIO:
         /* IO port based C-state */
         inb(cx->address);
         /* Dummy wait op - must do something useless after P_LVL2 read
            because chipsets cannot guarantee that STPCLK# signal
            gets asserted in time to freeze execution properly. */
         unused = inl(pmtmr_ioport);
-    }
-}
-
-static inline void acpi_idle_update_bm_rld(struct acpi_processor_power *power,
-                                           struct acpi_processor_cx *target)
-{
-    if ( !power->flags.bm_check )
         return;
-
-    if ( power->flags.bm_rld_set && target->type != ACPI_STATE_C3 )
-    {
-        acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
-        power->flags.bm_rld_set = 0;
-    }
-
-    if ( !power->flags.bm_rld_set && target->type == ACPI_STATE_C3 )
-    {
-        acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
-        power->flags.bm_rld_set = 1;
+    case ACPI_CSTATE_EM_HALT:
+        acpi_safe_halt();
+        local_irq_disable();
+        return;
     }
 }
 
@@ -222,21 +214,15 @@ static void acpi_processor_idle(void)
         if ( power->flags.bm_check && acpi_idle_bm_check()
              && cx->type == ACPI_STATE_C3 )
             cx = power->safe_state;
-        if ( cx - &power->states[0] > max_cstate )
+        if ( cx->idx > max_cstate )
             cx = &power->states[max_cstate];
     }
     if ( !cx )
     {
         if ( pm_idle_save )
-        {
-            printk(XENLOG_DEBUG "call pm_idle_save()\n");
             pm_idle_save();
-        }
         else
-        {
-            printk(XENLOG_DEBUG "call acpi_safe_halt()\n");
             acpi_safe_halt();
-        }
         return;
     }
 
@@ -247,48 +233,22 @@ static void acpi_processor_idle(void)
      * ------
      * Invoke the current Cx state to put the processor to sleep.
      */
-    acpi_idle_update_bm_rld(power, cx);
-
     switch ( cx->type )
     {
     case ACPI_STATE_C1:
-        /* Trace cpu idle entry */
-        TRACE_1D(TRC_PM_IDLE_ENTRY, 1);
-
-        /*
-         * Invoke C1.
-         * Use the appropriate idle routine, the one that would
-         * be used without acpi C-states.
-         */
-        if ( pm_idle_save )
-            pm_idle_save();
-        else 
-            acpi_safe_halt();
-
-        /* Trace cpu idle exit */
-        TRACE_1D(TRC_PM_IDLE_EXIT, 1);
-
-        /*
-         * TBD: Can't get time duration while in C1, as resumes
-         *      go to an ISR rather than here.  Need to instrument
-         *      base interrupt handler.
-         */
-        sleep_ticks = 0xFFFFFFFF;
-        break;
-
     case ACPI_STATE_C2:
-        if ( local_apic_timer_c2_ok )
+        if ( cx->type == ACPI_STATE_C1 || local_apic_timer_c2_ok )
         {
-            /* Trace cpu idle entry */
-            TRACE_1D(TRC_PM_IDLE_ENTRY, 2);
             /* Get start time (ticks) */
             t1 = inl(pmtmr_ioport);
+            /* Trace cpu idle entry */
+            TRACE_2D(TRC_PM_IDLE_ENTRY, cx->idx, t1);
             /* Invoke C2 */
             acpi_idle_do_entry(cx);
             /* Get end time (ticks) */
             t2 = inl(pmtmr_ioport);
             /* Trace cpu idle exit */
-            TRACE_1D(TRC_PM_IDLE_EXIT, 2);
+            TRACE_2D(TRC_PM_IDLE_EXIT, cx->idx, t2);
 
             /* Re-enable interrupts */
             local_irq_enable();
@@ -327,20 +287,18 @@ static void acpi_processor_idle(void)
             ACPI_FLUSH_CPU_CACHE();
         }
 
-        /* Trace cpu idle entry */
-        TRACE_1D(TRC_PM_IDLE_ENTRY, cx - &power->states[0]);
         /*
          * Before invoking C3, be aware that TSC/APIC timer may be 
          * stopped by H/W. Without carefully handling of TSC/APIC stop issues,
          * deep C state can't work correctly.
          */
-        /* preparing TSC stop */
-        cstate_save_tsc();
         /* preparing APIC stop */
         lapic_timer_off();
 
         /* Get start time (ticks) */
         t1 = inl(pmtmr_ioport);
+        /* Trace cpu idle entry */
+        TRACE_2D(TRC_PM_IDLE_ENTRY, cx->idx, t1);
         /* Invoke C3 */
         acpi_idle_do_entry(cx);
         /* Get end time (ticks) */
@@ -349,7 +307,7 @@ static void acpi_processor_idle(void)
         /* recovering TSC */
         cstate_restore_tsc();
         /* Trace cpu idle exit */
-        TRACE_1D(TRC_PM_IDLE_EXIT, cx - &power->states[0]);
+        TRACE_2D(TRC_PM_IDLE_EXIT, cx->idx, t2);
 
         if ( power->flags.bm_check && power->flags.bm_control )
         {
@@ -377,7 +335,7 @@ static void acpi_processor_idle(void)
     cx->usage++;
     if ( sleep_ticks > 0 )
     {
-        power->last_residency = PM_TIMER_TICKS_TO_US(sleep_ticks);
+        power->last_residency = acpi_pm_tick_to_ns(sleep_ticks) / 1000UL;
         cx->time += sleep_ticks;
     }
 
@@ -385,11 +343,54 @@ static void acpi_processor_idle(void)
         cpuidle_current_governor->reflect(power);
 }
 
+static void acpi_dead_idle(void)
+{
+    struct acpi_processor_power *power;
+    struct acpi_processor_cx *cx;
+    int unused;
+
+    if ( (power = processor_powers[smp_processor_id()]) == NULL )
+        goto default_halt;
+
+    if ( (cx = &power->states[power->count-1]) == NULL )
+        goto default_halt;
+
+    for ( ; ; )
+    {
+        if ( !power->flags.bm_check && cx->type == ACPI_STATE_C3 )
+            ACPI_FLUSH_CPU_CACHE();
+
+        switch ( cx->entry_method )
+        {
+            case ACPI_CSTATE_EM_FFH:
+                /* Not treat interrupt as break event */
+                mwait_idle_with_hints(cx->address, 0);
+                break;
+            case ACPI_CSTATE_EM_SYSIO:
+                inb(cx->address);
+                unused = inl(pmtmr_ioport);
+                break;
+            default:
+                goto default_halt;
+        }
+    }
+
+default_halt:
+    for ( ; ; )
+        halt();
+}
+
 static int init_cx_pminfo(struct acpi_processor_power *acpi_power)
 {
+    int i;
+
     memset(acpi_power, 0, sizeof(*acpi_power));
 
+    for ( i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++ )
+        acpi_power->states[i].idx = i;
+
     acpi_power->states[ACPI_STATE_C1].type = ACPI_STATE_C1;
+    acpi_power->states[ACPI_STATE_C1].entry_method = ACPI_CSTATE_EM_HALT;
 
     acpi_power->states[ACPI_STATE_C0].valid = 1;
     acpi_power->states[ACPI_STATE_C1].valid = 1;
@@ -463,12 +464,22 @@ static void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flag
     else if ( c->x86_vendor == X86_VENDOR_INTEL )
     {
         /*
-         * Today all CPUs that support C3 share cache.
-         * TBD: This needs to look at cache shared map, once
-         * multi-core detection patch makes to the base.
+         * Today all MP CPUs that support C3 share cache.
+         * And caches should not be flushed by software while
+         * entering C3 type state.
          */
         flags->bm_check = 1;
     }
+
+    /*
+     * On all recent platforms, ARB_DISABLE is a nop.
+     * So, set bm_control to zero to indicate that ARB_DISABLE
+     * is not required while entering C3 type state on
+     * P4, Core and beyond CPUs
+     */
+    if ( c->x86_vendor == X86_VENDOR_INTEL &&
+        (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14)) )
+            flags->bm_control = 0;
 }
 
 #define VENDOR_INTEL                   (1)
@@ -476,7 +487,8 @@ static void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flag
 
 static int check_cx(struct acpi_processor_power *power, xen_processor_cx_t *cx)
 {
-    static int bm_check_flag;
+    static int bm_check_flag = -1;
+    static int bm_control_flag = -1;
 
     switch ( cx->reg.space_id )
     {
@@ -486,16 +498,13 @@ static int check_cx(struct acpi_processor_power *power, xen_processor_cx_t *cx)
         break;
 
     case ACPI_ADR_SPACE_FIXED_HARDWARE:
-        if ( cx->type > ACPI_STATE_C1 )
-        {
-            if ( cx->reg.bit_width != VENDOR_INTEL || 
-                 cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
-                return -EINVAL;
+        if ( cx->reg.bit_width != VENDOR_INTEL || 
+             cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
+            return -EINVAL;
 
-            /* assume all logical cpu has the same support for mwait */
-            if ( acpi_processor_ffh_cstate_probe(cx) )
-                return -EINVAL;
-        }
+        /* assume all logical cpu has the same support for mwait */
+        if ( acpi_processor_ffh_cstate_probe(cx) )
+            return -EINVAL;
         break;
 
     default:
@@ -525,15 +534,17 @@ static int check_cx(struct acpi_processor_power *power, xen_processor_cx_t *cx)
         }
 
         /* All the logic here assumes flags.bm_check is same across all CPUs */
-        if ( !bm_check_flag )
+        if ( bm_check_flag == -1 )
         {
             /* Determine whether bm_check is needed based on CPU  */
             acpi_processor_power_init_bm_check(&(power->flags));
             bm_check_flag = power->flags.bm_check;
+            bm_control_flag = power->flags.bm_control;
         }
         else
         {
             power->flags.bm_check = bm_check_flag;
+            power->flags.bm_control = bm_control_flag;
         }
 
         if ( power->flags.bm_check )
@@ -554,6 +565,15 @@ static int check_cx(struct acpi_processor_power *power, xen_processor_cx_t *cx)
                         "C3 support without BM control\n"));
                 }
             }
+            /*
+             * On older chipsets, BM_RLD needs to be set
+             * in order for Bus Master activity to wake the
+             * system from C3.  Newer chipsets handle DMA
+             * during C3 automatically and BM_RLD is a NOP.
+             * In either case, the proper way to
+             * handle BM_RLD is to set it and leave it set.
+             */
+            acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
         }
         else
         {
@@ -599,11 +619,27 @@ static void set_cx(
     cx->valid    = 1;
     cx->type     = xen_cx->type;
     cx->address  = xen_cx->reg.address;
-    cx->space_id = xen_cx->reg.space_id;
+
+    switch ( xen_cx->reg.space_id )
+    {
+    case ACPI_ADR_SPACE_FIXED_HARDWARE:
+        if ( xen_cx->reg.bit_width == VENDOR_INTEL &&
+             xen_cx->reg.bit_offset == NATIVE_CSTATE_BEYOND_HALT )
+            cx->entry_method = ACPI_CSTATE_EM_FFH;
+        else
+            cx->entry_method = ACPI_CSTATE_EM_HALT;
+        break;
+    case ACPI_ADR_SPACE_SYSTEM_IO:
+        cx->entry_method = ACPI_CSTATE_EM_SYSIO;
+        break;
+    default:
+        cx->entry_method = ACPI_CSTATE_EM_NONE;
+    }
+
     cx->latency  = xen_cx->latency;
     cx->power    = xen_cx->power;
     
-    cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
+    cx->latency_ticks = ns_to_acpi_pm_tick(cx->latency * 1000UL);
     cx->target_residency = cx->latency * latency_factor;
     if ( cx->type == ACPI_STATE_C1 || cx->type == ACPI_STATE_C2 )
         acpi_power->safe_state = cx;
@@ -737,6 +773,11 @@ long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
         pm_idle_save = pm_idle;
         pm_idle = acpi_processor_idle;
     }
+
+    if ( cpu_id == 0 )
+    {
+        dead_idle = acpi_dead_idle;
+    }
         
     return 0;
 }
@@ -749,8 +790,7 @@ uint32_t pmstat_get_cx_nr(uint32_t cpuid)
 int pmstat_get_cx_stat(uint32_t cpuid, struct pm_cx_stat *stat)
 {
     const struct acpi_processor_power *power = processor_powers[cpuid];
-    struct vcpu *v = idle_vcpu[cpuid];
-    uint64_t usage;
+    uint64_t usage, res, idle_usage = 0, idle_res = 0;
     int i;
 
     if ( power == NULL )
@@ -761,23 +801,28 @@ int pmstat_get_cx_stat(uint32_t cpuid, struct pm_cx_stat *stat)
         return 0;
     }
 
-    stat->last = (power->last_state) ?
-        (int)(power->last_state - &power->states[0]) : 0;
+    stat->last = power->last_state ? power->last_state->idx : 0;
     stat->nr = power->count;
-    stat->idle_time = v->runstate.time[RUNSTATE_running];
-    if ( v->is_running )
-        stat->idle_time += NOW() - v->runstate.state_entry_time;
+    stat->idle_time = get_cpu_idle_time(cpuid);
 
-    for ( i = 0; i < power->count; i++ )
+    for ( i = power->count - 1; i >= 0; i-- )
     {
-        usage = power->states[i].usage;
-        if ( copy_to_guest_offset(stat->triggers, i, &usage, 1) )
+        if ( i != 0 )
+        {
+            usage = power->states[i].usage;
+            res = acpi_pm_tick_to_ns(power->states[i].time);
+            idle_usage += usage;
+            idle_res += res;
+        }
+        else
+        {
+            usage = idle_usage;
+            res = NOW() - idle_res;
+        }
+        if ( copy_to_guest_offset(stat->triggers, i, &usage, 1) ||
+             copy_to_guest_offset(stat->residencies, i, &res, 1) )
             return -EFAULT;
     }
-    for ( i = 0; i < power->count; i++ )
-        if ( copy_to_guest_offset(stat->residencies, i, 
-                                  &power->states[i].time, 1) )
-            return -EFAULT;
 
     return 0;
 }
index 45a0a53094764572244a14532ac8d43d22f7bf74..cda7fb40aa28511e9fc92da5f455f9c4dba82ebd 100644 (file)
@@ -58,6 +58,9 @@ static struct acpi_cpufreq_data *drv_data[NR_CPUS];
 
 static struct cpufreq_driver acpi_cpufreq_driver;
 
+static unsigned int __read_mostly acpi_pstate_strict;
+integer_param("acpi_pstate_strict", acpi_pstate_strict);
+
 static int check_est_cpu(unsigned int cpuid)
 {
     struct cpuinfo_x86 *cpu = &cpu_data[cpuid];
@@ -131,10 +134,13 @@ struct drv_cmd {
     u32 val;
 };
 
-static void do_drv_read(struct drv_cmd *cmd)
+static void do_drv_read(void *drvcmd)
 {
+    struct drv_cmd *cmd;
     u32 h;
 
+    cmd = (struct drv_cmd *)drvcmd;
+
     switch (cmd->type) {
     case SYSTEM_INTEL_MSR_CAPABLE:
         rdmsr(cmd->addr.msr.reg, cmd->val, h);
@@ -174,7 +180,13 @@ static void drv_read(struct drv_cmd *cmd)
 {
     cmd->val = 0;
 
-    do_drv_read(cmd);
+    ASSERT(cpus_weight(cmd->mask) == 1);
+
+    /* to reduce IPI for the sake of performance */
+    if (likely(cpu_isset(smp_processor_id(), cmd->mask)))
+        do_drv_read((void *)cmd);
+    else
+        on_selected_cpus( cmd->mask, do_drv_read, (void *)cmd, 0, 1);
 }
 
 static void drv_write(struct drv_cmd *cmd)
@@ -184,20 +196,29 @@ static void drv_write(struct drv_cmd *cmd)
 
 static u32 get_cur_val(cpumask_t mask)
 {
+    struct cpufreq_policy *policy;
     struct processor_performance *perf;
     struct drv_cmd cmd;
+    unsigned int cpu = smp_processor_id();
 
     if (unlikely(cpus_empty(mask)))
         return 0;
 
-    switch (drv_data[first_cpu(mask)]->cpu_feature) {
+    if (!cpu_isset(cpu, mask))
+        cpu = first_cpu(mask);
+    policy = cpufreq_cpu_policy[cpu];
+
+    if (cpu >= NR_CPUS || !policy || !drv_data[policy->cpu])
+        return 0;    
+
+    switch (drv_data[policy->cpu]->cpu_feature) {
     case SYSTEM_INTEL_MSR_CAPABLE:
         cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
         cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
         break;
     case SYSTEM_IO_CAPABLE:
         cmd.type = SYSTEM_IO_CAPABLE;
-        perf = drv_data[first_cpu(mask)]->acpi_data;
+        perf = drv_data[policy->cpu]->acpi_data;
         cmd.addr.io.port = perf->control_register.address;
         cmd.addr.io.bit_width = perf->control_register.bit_width;
         break;
@@ -205,12 +226,32 @@ static u32 get_cur_val(cpumask_t mask)
         return 0;
     }
 
-    cmd.mask = mask;
+    cmd.mask = cpumask_of_cpu(cpu);
 
     drv_read(&cmd);
     return cmd.val;
 }
 
+struct perf_pair {
+    union {
+        struct {
+            uint32_t lo;
+            uint32_t hi;
+        } split;
+        uint64_t whole;
+    } aperf, mperf;
+};
+static DEFINE_PER_CPU(struct perf_pair, gov_perf_pair);
+static DEFINE_PER_CPU(struct perf_pair, usr_perf_pair);
+
+static void read_measured_perf_ctrs(void *_readin)
+{
+    struct perf_pair *readin = _readin;
+
+    rdmsr(MSR_IA32_APERF, readin->aperf.split.lo, readin->aperf.split.hi);
+    rdmsr(MSR_IA32_MPERF, readin->mperf.split.lo, readin->mperf.split.hi);
+}
+
 /*
  * Return the measured active (C0) frequency on this CPU since last call
  * to this function.
@@ -224,59 +265,111 @@ static u32 get_cur_val(cpumask_t mask)
  * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
  * no meaning should be associated with absolute values of these MSRs.
  */
-static void  __get_measured_perf(void *perf_percent)
+static unsigned int get_measured_perf(unsigned int cpu, unsigned int flag)
 {
-    unsigned int *ratio = perf_percent;
-    union {
-        struct {
-            uint32_t lo;
-            uint32_t hi;
-        } split;
-        uint64_t whole;
-    } aperf_cur, mperf_cur;
+    struct cpufreq_policy *policy;    
+    struct perf_pair readin, cur, *saved;
+    unsigned int perf_percent;
+    cpumask_t cpumask;
+    unsigned int retval;
+
+    if (!cpu_online(cpu))
+        return 0;
+
+    policy = cpufreq_cpu_policy[cpu];
+    if (!policy)
+        return 0;
+
+    switch (flag)
+    {
+    case GOV_GETAVG:
+    {
+        saved = &per_cpu(gov_perf_pair, cpu);
+        break;
+    }
+    case USR_GETAVG:
+    {
+        saved = &per_cpu(usr_perf_pair, cpu);
+        break;
+    }
+    default:
+        return 0;
+    }
+
+    if (cpu == smp_processor_id()) {
+        read_measured_perf_ctrs((void *)&readin);
+    } else {
+        cpumask = cpumask_of_cpu(cpu);
+        on_selected_cpus(cpumask, read_measured_perf_ctrs, 
+                        (void *)&readin, 0, 1);
+    }
 
-    rdmsr(MSR_IA32_APERF, aperf_cur.split.lo, aperf_cur.split.hi);
-    rdmsr(MSR_IA32_MPERF, mperf_cur.split.lo, mperf_cur.split.hi);
+    cur.aperf.whole = readin.aperf.whole - saved->aperf.whole;
+    cur.mperf.whole = readin.mperf.whole - saved->mperf.whole;
+    saved->aperf.whole = readin.aperf.whole;
+    saved->mperf.whole = readin.mperf.whole;
 
-    wrmsr(MSR_IA32_APERF, 0,0);
-    wrmsr(MSR_IA32_MPERF, 0,0);
+#ifdef __i386__
+    /*
+     * We dont want to do 64 bit divide with 32 bit kernel
+     * Get an approximate value. Return failure in case we cannot get
+     * an approximate value.
+     */
+    if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) {
+        int shift_count;
+        uint32_t h;
 
-    if (unlikely(((unsigned long)(-1) / 100) < aperf_cur.whole)) {
+        h = max_t(uint32_t, cur.aperf.split.hi, cur.mperf.split.hi);
+        shift_count = fls(h);
+
+        cur.aperf.whole >>= shift_count;
+        cur.mperf.whole >>= shift_count;
+    }
+
+    if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) {
         int shift_count = 7;
-        aperf_cur.whole >>= shift_count;
-        mperf_cur.whole >>= shift_count;
+        cur.aperf.split.lo >>= shift_count;
+        cur.mperf.split.lo >>= shift_count;
     }
 
-    if (aperf_cur.whole && mperf_cur.whole)
-        *ratio = (aperf_cur.whole * 100) / mperf_cur.whole;
+    if (cur.aperf.split.lo && cur.mperf.split.lo)
+        perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo;
     else
-        *ratio = 0;
-}
+        perf_percent = 0;
 
-static unsigned int get_measured_perf(unsigned int cpu)
-{
-    unsigned int retval, perf_percent;
-    cpumask_t cpumask;
+#else
+    if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
+        int shift_count = 7;
+        cur.aperf.whole >>= shift_count;
+        cur.mperf.whole >>= shift_count;
+    }
 
-    if (!cpu_online(cpu))
-        return 0;
+    if (cur.aperf.whole && cur.mperf.whole)
+        perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
+    else
+        perf_percent = 0;
 
-    cpumask = cpumask_of_cpu(cpu);
-    on_selected_cpus(cpumask, __get_measured_perf, (void *)&perf_percent,0,1);
+#endif
+
+    retval = drv_data[policy->cpu]->max_freq * perf_percent / 100;
 
-    retval = drv_data[cpu]->max_freq * perf_percent / 100;
     return retval;
 }
 
 static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 {
-    struct acpi_cpufreq_data *data = drv_data[cpu];
+    struct cpufreq_policy *policy;
+    struct acpi_cpufreq_data *data;
     unsigned int freq;
 
+    policy = cpufreq_cpu_policy[cpu];
+    if (!policy)
+        return 0;
+
+    data = drv_data[policy->cpu];
     if (unlikely(data == NULL ||
-        data->acpi_data == NULL || data->freq_table == NULL)) {
+        data->acpi_data == NULL || data->freq_table == NULL))
         return 0;
-    }
 
     freq = extract_freq(get_cur_val(cpumask_of_cpu(cpu)), data);
     return freq;
@@ -327,16 +420,10 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 
     next_perf_state = data->freq_table[next_state].index;
     if (perf->state == next_perf_state) {
-        if (unlikely(policy->resume)) {
-            printk(KERN_INFO "Called after resume, resetting to P%d\n", 
-                next_perf_state);
+        if (unlikely(policy->resume))
             policy->resume = 0;
-        }
-        else {
-            printk(KERN_DEBUG "Already at target state (P%d)\n", 
-                next_perf_state);
+        else
             return 0;
-        }
     }
 
     switch (data->cpu_feature) {
@@ -367,10 +454,12 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 
     drv_write(&cmd);
 
-    if (!check_freqs(cmd.mask, freqs.new, data))
+    if (acpi_pstate_strict && !check_freqs(cmd.mask, freqs.new, data)) {
+        printk(KERN_WARNING "Fail transfer to new freq %d\n", freqs.new);
         return -EAGAIN;
+    }
 
-    for_each_cpu_mask(j, cmd.mask)
+    for_each_cpu_mask(j, online_policy_cpus)
         cpufreq_statistic_update(j, perf->state, next_perf_state);
 
     perf->state = next_perf_state;
@@ -447,18 +536,6 @@ acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
     perf = data->acpi_data;
     policy->shared_type = perf->shared_type;
 
-    /* capability check */
-    if (perf->state_count <= 1) {
-        printk("No P-States\n");
-        result = -ENODEV;
-        goto err_unreg;
-    }
-
-    if (perf->control_register.space_id != perf->status_register.space_id) {
-        result = -ENODEV;
-        goto err_unreg;
-    }
-
     switch (perf->control_register.space_id) {
     case ACPI_ADR_SPACE_SYSTEM_IO:
         printk("xen_pminfo: @acpi_cpufreq_cpu_init,"
@@ -494,7 +571,8 @@ acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
             policy->cpuinfo.transition_latency =
                 perf->states[i].transition_latency * 1000;
     }
-    policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
+
+    policy->governor = cpufreq_opt_governor ? : CPUFREQ_DEFAULT_GOVERNOR;
 
     data->max_freq = perf->states[0].core_frequency * 1000;
     /* table init */
@@ -567,6 +645,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
 }
 
 static struct cpufreq_driver acpi_cpufreq_driver = {
+    .name   = "acpi-cpufreq",
     .verify = acpi_cpufreq_verify,
     .target = acpi_cpufreq_target,
     .init   = acpi_cpufreq_cpu_init,
index 9d9897be6143d6827c0aed84cac0d099ba63ea14..f69b425a0cea9e437b439136f49e0a7e36520a5e 100644 (file)
@@ -129,6 +129,16 @@ static int powernow_cpufreq_target(struct cpufreq_policy *policy,
     return result;
 }
 
+static int powernow_cpufreq_verify(struct cpufreq_policy *policy)
+{
+    struct powernow_cpufreq_data *data;
+
+    if (!policy || !(data = drv_data[policy->cpu]))
+        return -EINVAL;
+
+    return cpufreq_frequency_table_verify(policy, data->freq_table);
+}
+
 static int powernow_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
     unsigned int i;
@@ -229,9 +239,24 @@ err_unreg:
     return result;
 }
 
+static int powernow_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+    struct powernow_cpufreq_data *data = drv_data[policy->cpu];
+
+    if (data) {
+        drv_data[policy->cpu] = NULL;
+        xfree(data->freq_table);
+        xfree(data);
+    }
+
+    return 0;
+}
+
 static struct cpufreq_driver powernow_cpufreq_driver = {
+    .verify = powernow_cpufreq_verify,
     .target = powernow_cpufreq_target,
     .init   = powernow_cpufreq_cpu_init,
+    .exit   = powernow_cpufreq_cpu_exit
 };
 
 int powernow_cpufreq_init(void)
index 621d1bdae1584d06fa5daaa87792d8f7365fda93..683dc3bb16aa6af40357a09a85612391dccf8fa7 100644 (file)
@@ -59,7 +59,7 @@ static int menu_select(struct acpi_processor_power *power)
     data->expected_us = (u32) get_sleep_length_ns() / 1000;
 
     /* find the deepest idle state that satisfies our constraints */
-    for ( i = 1; i < power->count; i++ )
+    for ( i = 2; i < power->count; i++ )
     {
         struct acpi_processor_cx *s = &power->states[i];
 
@@ -81,17 +81,7 @@ static void menu_reflect(struct acpi_processor_power *power)
     unsigned int last_residency; 
     unsigned int measured_us;
 
-    /*
-     * Ugh, this idle state doesn't support residency measurements, so we
-     * are basically lost in the dark.  As a compromise, assume we slept
-     * for one full standard timer tick.  However, be aware that this
-     * could potentially result in a suboptimal state transition.
-     */
-    if ( target->type == ACPI_STATE_C1 )
-        last_residency = USEC_PER_SEC / HZ;
-    else
-        last_residency = power->last_residency;
-
+    last_residency = power->last_residency;
     measured_us = last_residency + data->elapsed_us;
 
     /* if wrapping, set to max uint (-1) */
index 7e96bfc796926ecc04d710a24402f01dde476528..cf6730a4dc1d59bcc549266ea8a9262e783b81a7 100644 (file)
@@ -30,6 +30,8 @@
 
 #include <acpi/cpufreq/cpufreq.h>
 
+uint32_t system_reset_counter = 1;
+
 static char opt_acpi_sleep[20];
 string_param("acpi_sleep", opt_acpi_sleep);
 
@@ -42,16 +44,16 @@ void do_suspend_lowlevel(void);
 
 static int device_power_down(void)
 {
-    iommu_suspend();
-
     console_suspend();
 
     time_suspend();
 
     i8259A_suspend();
-    
+
     ioapic_suspend();
-    
+
+    iommu_suspend();
+
     lapic_suspend();
 
     return 0;
@@ -60,34 +62,62 @@ static int device_power_down(void)
 static void device_power_up(void)
 {
     lapic_resume();
-    
+
+    iommu_resume();
+
     ioapic_resume();
 
     i8259A_resume();
-    
+
     time_resume();
 
     console_resume();
-
-    iommu_resume();
 }
 
 static void freeze_domains(void)
 {
     struct domain *d;
+    struct vcpu *v;
 
+    rcu_read_lock(&domlist_read_lock);
     for_each_domain ( d )
-        if ( d->domain_id != 0 )
+    {
+        switch ( d->domain_id )
+        {
+        case 0:
+            for_each_vcpu ( d, v )
+                if ( v != current )
+                    vcpu_pause(v);
+            break;
+        default:
             domain_pause(d);
+            break;
+        }
+    }
+    rcu_read_unlock(&domlist_read_lock);
 }
 
 static void thaw_domains(void)
 {
     struct domain *d;
+    struct vcpu *v;
 
+    rcu_read_lock(&domlist_read_lock);
     for_each_domain ( d )
-        if ( d->domain_id != 0 )
+    {
+        switch ( d->domain_id )
+        {
+        case 0:
+            for_each_vcpu ( d, v )
+                if ( v != current )
+                    vcpu_unpause(v);
+            break;
+        default:
             domain_unpause(d);
+            break;
+        }
+    }
+    rcu_read_unlock(&domlist_read_lock);
 }
 
 static void acpi_sleep_prepare(u32 state)
@@ -99,20 +129,15 @@ static void acpi_sleep_prepare(u32 state)
 
     wakeup_vector_va = __acpi_map_table(
         acpi_sinfo.wakeup_vector, sizeof(uint64_t));
+
+    /* TBoot will set resume vector itself (when it is safe to do so). */
+    if ( tboot_in_measured_env() )
+        return;
+
     if ( acpi_sinfo.vector_width == 32 )
-    {
-            *(uint32_t *)wakeup_vector_va =
-                tboot_in_measured_env() ?
-                (uint32_t)g_tboot_shared->s3_tb_wakeup_entry :
-                (uint32_t)bootsym_phys(wakeup_start);
-    }
+        *(uint32_t *)wakeup_vector_va = bootsym_phys(wakeup_start);
     else
-    {
-            *(uint64_t *)wakeup_vector_va =
-                tboot_in_measured_env() ?
-                (uint64_t)g_tboot_shared->s3_tb_wakeup_entry :
-                (uint64_t)bootsym_phys(wakeup_start);
-    }
+        *(uint64_t *)wakeup_vector_va = bootsym_phys(wakeup_start);
 }
 
 static void acpi_sleep_post(u32 state) {}
@@ -150,6 +175,7 @@ static int enter_state(u32 state)
     printk("Entering ACPI S%d state.\n", state);
 
     local_irq_save(flags);
+    spin_debug_disable();
 
     if ( (error = device_power_down()) )
     {
@@ -163,6 +189,8 @@ static int enter_state(u32 state)
     {
     case ACPI_STATE_S3:
         do_suspend_lowlevel();
+        system_reset_counter++;
+        error = tboot_s3_resume();
         break;
     case ACPI_STATE_S5:
         acpi_enter_sleep_state(ACPI_STATE_S5);
@@ -179,9 +207,13 @@ static int enter_state(u32 state)
 
     device_power_up();
 
-    printk(XENLOG_INFO "Finishing wakeup from ACPI S%d state.", state);
+    printk(XENLOG_INFO "Finishing wakeup from ACPI S%d state.\n", state);
+
+    if ( (state == ACPI_STATE_S3) && error )
+        panic("Memory integrity was lost on resume (%d)\n", error);
 
  done:
+    spin_debug_enable();
     local_irq_restore(flags);
     console_end_sync();
     acpi_sleep_post(state);
@@ -190,6 +222,7 @@ static int enter_state(u32 state)
 
  enable_cpu:
     cpufreq_add_cpu(0);
+    microcode_resume_cpu(0);
     enable_nonboot_cpus();
     thaw_domains();
     spin_unlock(&pm_lock);
@@ -247,39 +280,49 @@ static int acpi_get_wake_status(void)
 
 static void tboot_sleep(u8 sleep_state)
 {
-   uint32_t shutdown_type;
-
-   g_tboot_shared->acpi_sinfo.pm1a_cnt =
-                           (uint16_t)acpi_sinfo.pm1a_cnt_blk.address;
-   g_tboot_shared->acpi_sinfo.pm1b_cnt =
-                           (uint16_t)acpi_sinfo.pm1b_cnt_blk.address;
-   g_tboot_shared->acpi_sinfo.pm1a_evt =
-                           (uint16_t)acpi_sinfo.pm1a_evt_blk.address;
-   g_tboot_shared->acpi_sinfo.pm1b_evt =
-                           (uint16_t)acpi_sinfo.pm1b_evt_blk.address;
-   g_tboot_shared->acpi_sinfo.pm1a_cnt_val = acpi_sinfo.pm1a_cnt_val;
-   g_tboot_shared->acpi_sinfo.pm1b_cnt_val = acpi_sinfo.pm1b_cnt_val;
-
-   switch ( sleep_state )
-   {
-       case ACPI_STATE_S3:
-           shutdown_type = TB_SHUTDOWN_S3;
-           g_tboot_shared->s3_k_wakeup_entry =
-               (uint32_t)bootsym_phys(wakeup_start);
-           break;
-       case ACPI_STATE_S4:
-           shutdown_type = TB_SHUTDOWN_S4;
-           break;
-       case ACPI_STATE_S5:
-           shutdown_type = TB_SHUTDOWN_S5;
-           break;
-       default:
-           return;
-   }
-
-   tboot_shutdown(shutdown_type);
+    uint32_t shutdown_type;
+
+#define TB_COPY_GAS(tbg, g)             \
+    tbg.space_id = g.space_id;          \
+    tbg.bit_width = g.bit_width;        \
+    tbg.bit_offset = g.bit_offset;      \
+    tbg.access_width = g.access_width;  \
+    tbg.address = g.address;
+
+    /* sizes are not same (due to packing) so copy each one */
+    TB_COPY_GAS(g_tboot_shared->acpi_sinfo.pm1a_cnt_blk,
+                acpi_sinfo.pm1a_cnt_blk);
+    TB_COPY_GAS(g_tboot_shared->acpi_sinfo.pm1b_cnt_blk,
+                acpi_sinfo.pm1b_cnt_blk);
+    TB_COPY_GAS(g_tboot_shared->acpi_sinfo.pm1a_evt_blk,
+                acpi_sinfo.pm1a_evt_blk);
+    TB_COPY_GAS(g_tboot_shared->acpi_sinfo.pm1b_evt_blk,
+                acpi_sinfo.pm1b_evt_blk);
+    g_tboot_shared->acpi_sinfo.pm1a_cnt_val = acpi_sinfo.pm1a_cnt_val;
+    g_tboot_shared->acpi_sinfo.pm1b_cnt_val = acpi_sinfo.pm1b_cnt_val;
+    g_tboot_shared->acpi_sinfo.wakeup_vector = acpi_sinfo.wakeup_vector;
+    g_tboot_shared->acpi_sinfo.vector_width = acpi_sinfo.vector_width;
+    g_tboot_shared->acpi_sinfo.kernel_s3_resume_vector =
+                                              bootsym_phys(wakeup_start);
+
+    switch ( sleep_state )
+    {
+        case ACPI_STATE_S3:
+            shutdown_type = TB_SHUTDOWN_S3;
+            break;
+        case ACPI_STATE_S4:
+            shutdown_type = TB_SHUTDOWN_S4;
+            break;
+        case ACPI_STATE_S5:
+            shutdown_type = TB_SHUTDOWN_S5;
+            break;
+        default:
+            return;
+    }
+
+    tboot_shutdown(shutdown_type);
 }
-         
+
 /* System is really put into sleep state by this stub */
 acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state)
 {
index 3068590f70d20b365d8b5bafeb8ce3d57f9ccba6..0aac31da68870e51c9d82571150609c91814e806 100644 (file)
@@ -31,13 +31,9 @@ void save_rest_processor_state(void)
 
 void restore_rest_processor_state(void)
 {
-    int cpu = smp_processor_id();
-    struct tss_struct *t = &init_tss[cpu];
     struct vcpu *v = current;
 
-    /* Rewriting the TSS desc is necessary to clear the Busy flag. */
-    set_tss_desc(cpu, t);
-    load_TR(cpu);
+    load_TR();
 
 #if defined(CONFIG_X86_64)
     /* Recover syscall MSRs */
@@ -47,7 +43,7 @@ void restore_rest_processor_state(void)
     wrmsr(MSR_SYSCALL_MASK, EF_VM|EF_RF|EF_NT|EF_DF|EF_IE|EF_TF, 0U);    
 #else /* !defined(CONFIG_X86_64) */
     if ( supervisor_mode_kernel && cpu_has_sep )
-        wrmsr(MSR_IA32_SYSENTER_ESP, &t->esp1, 0);
+        wrmsr(MSR_IA32_SYSENTER_ESP, &init_tss[smp_processor_id()].esp1, 0);
 #endif
 
     /* Maybe load the debug registers. */
@@ -65,6 +61,9 @@ void restore_rest_processor_state(void)
     /* Reload FPU state on next FPU use. */
     stts();
 
+    if (cpu_has_pat)
+        wrmsrl(MSR_IA32_CR_PAT, host_pat);
+
     mtrr_ap_init();
     mcheck_init(&boot_cpu_data);
 }
index 55a25bce395845b374606bba3528c30b8c363fcc..2bbb003eaa4c263d484bade0fd986d7575359622 100644 (file)
@@ -40,7 +40,7 @@
 /*
  * Knob to control our willingness to enable the local APIC.
  */
-int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
+static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
 
 /*
  * Debug level
@@ -99,8 +99,11 @@ void __init apic_intr_init(void)
     /* Performance Counters Interrupt */
     set_intr_gate(PMU_APIC_VECTOR, pmu_apic_interrupt);
 
-    /* thermal monitor LVT interrupt */
-#ifdef CONFIG_X86_MCE_P4THERMAL
+    /* CMCI Correctable Machine Check Interrupt */
+    set_intr_gate(CMCI_APIC_VECTOR, cmci_interrupt);
+
+    /* thermal monitor LVT interrupt, for P4 and latest Intel CPU*/
+#ifdef CONFIG_X86_MCE_THERMAL
     set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 #endif
 }
@@ -172,12 +175,17 @@ void clear_local_APIC(void)
     }
 
 /* lets not touch this if we didn't frob it */
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#ifdef CONFIG_X86_MCE_THERMAL
     if (maxlvt >= 5) {
         v = apic_read(APIC_LVTTHMR);
         apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
     }
 #endif
+
+    if (maxlvt >= 6) {
+        v = apic_read(APIC_CMCI);
+        apic_write_around(APIC_CMCI, v | APIC_LVT_MASKED);
+    }
     /*
      * Clean APIC state for other OSs:
      */
@@ -189,10 +197,13 @@ void clear_local_APIC(void)
     if (maxlvt >= 4)
         apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
 
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#ifdef CONFIG_X86_MCE_THERMAL
     if (maxlvt >= 5)
         apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
 #endif
+    if (maxlvt >= 6)
+        apic_write_around(APIC_CMCI, APIC_LVT_MASKED);
+
     v = GET_APIC_VERSION(apic_read(APIC_LVR));
     if (APIC_INTEGRATED(v)) {  /* !82489DX */
         if (maxlvt > 3)        /* Due to Pentium errata 3AP and 11AP. */
@@ -597,6 +608,7 @@ static struct {
     unsigned int apic_spiv;
     unsigned int apic_lvtt;
     unsigned int apic_lvtpc;
+    unsigned int apic_lvtcmci;
     unsigned int apic_lvt0;
     unsigned int apic_lvt1;
     unsigned int apic_lvterr;
@@ -608,7 +620,7 @@ static struct {
 int lapic_suspend(void)
 {
     unsigned long flags;
-
+    int maxlvt = get_maxlvt();
     if (!apic_pm_state.active)
         return 0;
 
@@ -620,6 +632,11 @@ int lapic_suspend(void)
     apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
     apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
     apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+
+    if (maxlvt >= 6) {
+        apic_pm_state.apic_lvtcmci = apic_read(APIC_CMCI);
+    }
+
     apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
     apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
     apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
@@ -637,6 +654,7 @@ int lapic_resume(void)
 {
     unsigned int l, h;
     unsigned long flags;
+    int maxlvt = get_maxlvt();
 
     if (!apic_pm_state.active)
         return 0;
@@ -669,6 +687,11 @@ int lapic_resume(void)
     apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
     apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
     apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+
+    if (maxlvt >= 6) {
+        apic_write(APIC_CMCI, apic_pm_state.apic_lvtcmci);
+    }
+
     apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
     apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
     apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
@@ -719,7 +742,7 @@ static void apic_pm_activate(void)
 static void __init lapic_disable(char *str)
 {
     enable_local_apic = -1;
-    clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+    setup_clear_cpu_cap(X86_FEATURE_APIC);
 }
 custom_param("nolapic", lapic_disable);
 
index ce5c9350affa887beaf1945fdc6e55ccf86649bd..d9a7c3f963a5d673355cf58f099d376af6c413b4 100644 (file)
@@ -1,4 +1,7 @@
 obj-y += head.o
 
-head.o: head.S $(TARGET_SUBARCH).S trampoline.S mem.S video.S \
-       cmdline.S edd.S wakeup.S
+head.o: reloc.S
+
+# NB. BOOT_TRAMPOLINE == 0x8c000
+%.S: %.c
+       RELOC=0x8c000 $(MAKE) -f build32.mk $@
diff --git a/xen/arch/x86/boot/build32.mk b/xen/arch/x86/boot/build32.mk
new file mode 100644 (file)
index 0000000..a570d42
--- /dev/null
@@ -0,0 +1,25 @@
+XEN_ROOT=../../../..
+override XEN_TARGET_ARCH=x86_32
+CFLAGS =
+include $(XEN_ROOT)/Config.mk
+
+# Disable PIE/SSP if GCC supports them. They can break us.
+$(call cc-option-add,CFLAGS,CC,-nopie)
+$(call cc-option-add,CFLAGS,CC,-fno-stack-protector)
+$(call cc-option-add,CFLAGS,CC,-fno-stack-protector-all)
+
+CFLAGS += -Werror -fno-builtin -msoft-float
+
+# NB. awk invocation is a portable alternative to 'head -n -1'
+%.S: %.bin
+       (od -v -t x $< | awk 'NR > 1 {print s} {s=$$0}' | \
+       sed 's/ /,0x/g' | sed 's/^[0-9]*,/ .long /') >$@
+
+%.bin: %.lnk
+       $(OBJCOPY) -O binary $< $@
+
+%.lnk: %.o
+       $(LD) $(LDFLAGS_DIRECT) -N -Ttext 0x8c000 -o $@ $<
+
+%.o: %.c
+       $(CC) $(CFLAGS) -c $< -o $@
index c10aa62b2d8e830976161ad32cba126c002be824..3360107aac04dc4d5be0452637e1aec61369fabf 100644 (file)
@@ -79,8 +79,11 @@ __start:
         cmp     $0x2BADB002,%eax
         jne     not_multiboot
 
-        /* Save the Multiboot info structure for later use. */
-        mov     %ebx,sym_phys(multiboot_ptr)
+        /* Save the Multiboot info struct (after relocation) for later use. */
+        mov     $sym_phys(cpu0_stack)+1024,%esp
+        push    %ebx
+        call    reloc
+        mov     %eax,sym_phys(multiboot_ptr)
 
         /* Initialize BSS (no nasty surprises!) */
         mov     $sym_phys(__bss_start),%edi
@@ -192,6 +195,9 @@ __start:
 
 #include "cmdline.S"
 
+reloc:
+#include "reloc.S"
+
         .align 16
         .globl trampoline_start, trampoline_end
 trampoline_start:
index 94cd3d84cc80f78f7bebeec50e8fa16cfc541726..d616fb18603d6cff76c858050bc0a2b9211f879f 100644 (file)
@@ -25,7 +25,7 @@
 #define s16 int16_t
 #define s32 int32_t
 #define s64 int64_t
-#include "../../../include/public/elfstructs.h"
+#include "../../../include/xen/elfstructs.h"
 
 #define DYNAMICALLY_FILLED   0
 #define RAW_OFFSET         128
diff --git a/xen/arch/x86/boot/reloc.c b/xen/arch/x86/boot/reloc.c
new file mode 100644 (file)
index 0000000..e3333d3
--- /dev/null
@@ -0,0 +1,89 @@
+/******************************************************************************
+ * reloc.c
+ * 
+ * 32-bit flat memory-map routines for relocating Multiboot structures
+ * and modules. This is most easily done early with paging disabled.
+ * 
+ * Copyright (c) 2009, Citrix Systems, Inc.
+ * 
+ * Authors:
+ *    Keir Fraser <keir.fraser@citrix.com>
+ */
+
+asm (
+    "    .text                         \n"
+    "    .globl _start                 \n"
+    "_start:                           \n"
+    "    mov  $_start,%edi             \n"
+    "    call 1f                       \n"
+    "1:  pop  %esi                     \n"
+    "    sub  $1b-_start,%esi          \n"
+    "    mov  $__bss_start-_start,%ecx \n"
+    "    rep  movsb                    \n"
+    "    xor  %eax,%eax                \n"
+    "    mov  $_end,%ecx               \n"
+    "    sub  %edi,%ecx                \n"
+    "    rep  stosb                    \n"
+    "    mov  $reloc,%eax              \n"
+    "    jmp  *%eax                    \n"
+    );
+
+typedef unsigned int u32;
+#include "../../../include/xen/multiboot.h"
+
+extern char _start[];
+
+static void *memcpy(void *dest, const void *src, unsigned int n)
+{
+    char *s = (char *)src, *d = dest;
+    while ( n-- )
+        *d++ = *s++;
+    return dest;
+}
+
+static void *reloc_mbi_struct(void *old, unsigned int bytes)
+{
+    static void *alloc = &_start;
+    alloc = (void *)(((unsigned long)alloc - bytes) & ~15ul);
+    return memcpy(alloc, old, bytes);
+}
+
+static char *reloc_mbi_string(char *old)
+{
+    char *p;
+    for ( p = old; *p != '\0'; p++ )
+        continue;
+    return reloc_mbi_struct(old, p - old + 1);
+}
+
+multiboot_info_t *reloc(multiboot_info_t *mbi_old)
+{
+    multiboot_info_t *mbi = reloc_mbi_struct(mbi_old, sizeof(*mbi));
+    int i;
+
+    if ( mbi->flags & MBI_CMDLINE )
+        mbi->cmdline = (u32)reloc_mbi_string((char *)mbi->cmdline);
+
+    if ( mbi->flags & MBI_MODULES )
+    {
+        module_t *mods = reloc_mbi_struct(
+            (module_t *)mbi->mods_addr, mbi->mods_count * sizeof(module_t));
+        mbi->mods_addr = (u32)mods;
+        for ( i = 0; i < mbi->mods_count; i++ )
+            if ( mods[i].string )
+                mods[i].string = (u32)reloc_mbi_string((char *)mods[i].string);
+    }
+
+    if ( mbi->flags & MBI_MEMMAP )
+        mbi->mmap_addr = (u32)reloc_mbi_struct(
+            (memory_map_t *)mbi->mmap_addr, mbi->mmap_length);
+
+    /* Mask features we don't understand or don't relocate. */
+    mbi->flags &= (MBI_MEMLIMITS |
+                   MBI_DRIVES |
+                   MBI_CMDLINE |
+                   MBI_MODULES |
+                   MBI_MEMMAP);
+
+    return mbi;
+}
index cc4cda556a6e013f75d69faa861d7b81618fae8b..cf40d8bfaa10c5d8d19e22e3087e6c51147bf6ea 100644 (file)
@@ -50,8 +50,7 @@ ENTRY(wakeup_start)
 
         movw    $1, %ax
         lmsw    %ax             # Turn on CR0.PE 
-        jmp     1f
-1:      ljmpl   $BOOT_CS32, $bootsym_phys(wakeup_32)
+        ljmpl   $BOOT_CS32, $bootsym_phys(wakeup_32)
 
 /* This code uses an extended set of video mode numbers. These include:
  * Aliases for standard modes
diff --git a/xen/arch/x86/bzimage.c b/xen/arch/x86/bzimage.c
new file mode 100644 (file)
index 0000000..4843677
--- /dev/null
@@ -0,0 +1,241 @@
+#include <xen/cache.h>
+#include <xen/errno.h>
+#include <xen/lib.h>
+#include <xen/mm.h>
+#include <xen/string.h>
+#include <xen/types.h>
+
+#define HEAPORDER 3
+
+static unsigned char *window;
+#define memptr long
+static memptr free_mem_ptr;
+static memptr free_mem_end_ptr;
+
+#define WSIZE           0x80000000
+
+static unsigned char    *inbuf;
+static unsigned         insize;
+
+/* Index of next byte to be processed in inbuf: */
+static unsigned         inptr;
+
+/* Bytes in output buffer: */
+static unsigned         outcnt;
+
+#define OF(args)        args
+#define STATIC          static
+
+#define memzero(s, n)   memset((s), 0, (n))
+
+typedef unsigned char   uch;
+typedef unsigned short  ush;
+typedef unsigned long   ulg;
+
+#define INIT __init
+
+#define get_byte()      (inptr < insize ? inbuf[inptr++] : fill_inbuf())
+
+/* Diagnostic functions */
+#ifdef DEBUG
+#  define Assert(cond, msg) do { if (!(cond)) error(msg); } while (0)
+#  define Trace(x)      do { fprintf x; } while (0)
+#  define Tracev(x)     do { if (verbose) fprintf x ; } while (0)
+#  define Tracevv(x)    do { if (verbose > 1) fprintf x ; } while (0)
+#  define Tracec(c, x)  do { if (verbose && (c)) fprintf x ; } while (0)
+#  define Tracecv(c, x) do { if (verbose > 1 && (c)) fprintf x ; } while (0)
+#else
+#  define Assert(cond, msg)
+#  define Trace(x)
+#  define Tracev(x)
+#  define Tracevv(x)
+#  define Tracec(c, x)
+#  define Tracecv(c, x)
+#endif
+
+static long bytes_out;
+static void flush_window(void);
+
+static __init void error(char *x)
+{
+    panic("%s\n", x);
+}
+
+static __init int fill_inbuf(void)
+{
+        error("ran out of input data");
+        return 0;
+}
+
+
+#include "../../common/inflate.c"
+
+static __init void flush_window(void)
+{
+    /*
+     * The window is equal to the output buffer therefore only need to
+     * compute the crc.
+     */
+    unsigned long c = crc;
+    unsigned n;
+    unsigned char *in, ch;
+
+    in = window;
+    for ( n = 0; n < outcnt; n++ )
+    {
+        ch = *in++;
+        c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
+    }
+    crc = c;
+
+    bytes_out += (unsigned long)outcnt;
+    outcnt = 0;
+}
+
+static __init int gzip_length(char *image, unsigned long image_len)
+{
+    return *(uint32_t *)&image[image_len - 4];
+}
+
+static  __init int perform_gunzip(char *output, char **_image_start, unsigned long *image_len)
+{
+    char *image = *_image_start;
+    int rc;
+    unsigned char magic0 = (unsigned char)image[0];
+    unsigned char magic1 = (unsigned char)image[1];
+
+    if ( magic0 != 0x1f || ( (magic1 != 0x8b) && (magic1 != 0x9e) ) )
+        return 0;
+
+    window = (unsigned char *)output;
+
+    free_mem_ptr = (unsigned long)alloc_xenheap_pages(HEAPORDER, 0);
+    free_mem_end_ptr = free_mem_ptr + (PAGE_SIZE << HEAPORDER);
+
+    inbuf = (unsigned char *)image;
+    insize = *image_len;
+    inptr = 0;
+
+    makecrc();
+
+    if ( gunzip() < 0 )
+    {
+        rc = -EINVAL;
+    }
+    else
+    {
+        *_image_start = (char *)window;
+        *image_len = gzip_length(image, *image_len);
+        rc = 0;
+    }
+
+    free_xenheap_pages((void *)free_mem_ptr, HEAPORDER);
+
+    return rc;
+}
+
+struct setup_header {
+        uint8_t         _pad0[0x1f1];           /* skip uninteresting stuff */
+        uint8_t         setup_sects;
+        uint16_t        root_flags;
+        uint32_t        syssize;
+        uint16_t        ram_size;
+        uint16_t        vid_mode;
+        uint16_t        root_dev;
+        uint16_t        boot_flag;
+        uint16_t        jump;
+        uint32_t        header;
+#define HDR_MAGIC               "HdrS"
+#define HDR_MAGIC_SZ    4
+        uint16_t        version;
+#define VERSION(h,l)    (((h)<<8) | (l))
+        uint32_t        realmode_swtch;
+        uint16_t        start_sys;
+        uint16_t        kernel_version;
+        uint8_t         type_of_loader;
+        uint8_t         loadflags;
+        uint16_t        setup_move_size;
+        uint32_t        code32_start;
+        uint32_t        ramdisk_image;
+        uint32_t        ramdisk_size;
+        uint32_t        bootsect_kludge;
+        uint16_t        heap_end_ptr;
+        uint16_t        _pad1;
+        uint32_t        cmd_line_ptr;
+        uint32_t        initrd_addr_max;
+        uint32_t        kernel_alignment;
+        uint8_t         relocatable_kernel;
+        uint8_t         _pad2[3];
+        uint32_t        cmdline_size;
+        uint32_t        hardware_subarch;
+        uint64_t        hardware_subarch_data;
+        uint32_t        payload_offset;
+        uint32_t        payload_length;
+    } __attribute__((packed));
+
+static __init int bzimage_check(struct setup_header *hdr, unsigned long len)
+{
+    if ( len < sizeof(struct setup_header) )
+        return 0;
+
+    if ( memcmp(&hdr->header, HDR_MAGIC, HDR_MAGIC_SZ) != 0 )
+        return 0;
+
+    if ( hdr->version < VERSION(2,8) ) {
+        printk("Cannot load bzImage v%d.%02d at least v2.08 is required\n",
+           hdr->version >> 8, hdr->version & 0xff);
+        return -EINVAL;
+    }
+    return 1;
+}
+
+int __init bzimage_headroom(char *image_start, unsigned long image_length)
+{
+    struct setup_header *hdr = (struct setup_header *)image_start;
+    char *img;
+    int err, headroom;
+
+    err = bzimage_check(hdr, image_length);
+    if (err < 1)
+        return 0;
+
+    img = image_start + (hdr->setup_sects+1) * 512;
+    img += hdr->payload_offset;
+
+    headroom = gzip_length(img, hdr->payload_length);
+    headroom += headroom >> 12; /* Add 8 bytes for every 32K input block */
+    headroom += (32768 + 18); /* Add 32K + 18 bytes of extra headroom */
+    headroom = (headroom + 4095) & ~4095;
+
+    return headroom;
+}
+
+int __init bzimage_parse(char *image_base, char **image_start, unsigned long *image_len)
+{
+    struct setup_header *hdr = (struct setup_header *)(*image_start);
+    int err = bzimage_check(hdr, *image_len);
+
+    if (err < 1)
+        return err;
+
+    BUG_ON(!(image_base < *image_start));
+
+    *image_start += (hdr->setup_sects+1) * 512;
+    *image_start += hdr->payload_offset;
+    *image_len = hdr->payload_length;
+
+    if ( (err = perform_gunzip(image_base, image_start, image_len)) < 0 )
+        return err;
+
+    return 0;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/x86/copy_page.S b/xen/arch/x86/copy_page.S
new file mode 100644 (file)
index 0000000..2fd3e53
--- /dev/null
@@ -0,0 +1,66 @@
+#include <xen/config.h>
+#include <asm/page.h>
+
+#ifdef __i386__
+#define src_reg %esi
+#define dst_reg %edi
+#define WORD_SIZE 4
+#define tmp1_reg %eax
+#define tmp2_reg %edx
+#define tmp3_reg %ebx
+#define tmp4_reg %ebp
+#else
+#define src_reg %rsi
+#define dst_reg %rdi
+#define WORD_SIZE 8
+#define tmp1_reg %r8
+#define tmp2_reg %r9
+#define tmp3_reg %r10
+#define tmp4_reg %r11
+#endif
+
+ENTRY(copy_page_sse2)
+#ifdef __i386__
+        push    %ebx
+        push    %ebp
+        push    %esi
+        push    %edi
+        mov     6*4(%esp), src_reg
+        mov     5*4(%esp), dst_reg
+#endif
+        mov     $PAGE_SIZE/(4*WORD_SIZE)-3, %ecx
+
+        prefetchnta 2*4*WORD_SIZE(src_reg)
+        mov     (src_reg), tmp1_reg
+        mov     WORD_SIZE(src_reg), tmp2_reg
+        mov     2*WORD_SIZE(src_reg), tmp3_reg
+        mov     3*WORD_SIZE(src_reg), tmp4_reg
+
+0:      prefetchnta 3*4*WORD_SIZE(src_reg)
+1:      add     $4*WORD_SIZE, src_reg
+        movnti  tmp1_reg, (dst_reg)
+        mov     (src_reg), tmp1_reg
+        dec     %ecx
+        movnti  tmp2_reg, WORD_SIZE(dst_reg)
+        mov     WORD_SIZE(src_reg), tmp2_reg
+        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
+        mov     2*WORD_SIZE(src_reg), tmp3_reg
+        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
+        lea     4*WORD_SIZE(dst_reg), dst_reg
+        mov     3*WORD_SIZE(src_reg), tmp4_reg
+        jg      0b
+        jpe     1b
+
+        movnti  tmp1_reg, (dst_reg)
+        movnti  tmp2_reg, WORD_SIZE(dst_reg)
+        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
+        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
+
+#ifdef __i386__
+        pop     %edi
+        pop     %esi
+        pop     %ebp
+        pop     %ebx
+#endif
+        sfence
+        ret
index 1f13d93a214444c8454cbcad7435b1265c8dbe69..f2ff228a16379335f0c8b3e0e30cc111cc9fa3ee 100644 (file)
@@ -461,8 +461,10 @@ static void __devinit init_amd(struct cpuinfo_x86 *c)
 
        if (cpuid_eax(0x80000000) >= 0x80000007) {
                c->x86_power = cpuid_edx(0x80000007);
-               if (c->x86_power & (1<<8))
+               if (c->x86_power & (1<<8)) {
                        set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
+                       set_bit(X86_FEATURE_NOSTOP_TSC, c->x86_capability);
+               }
        }
 
 #ifdef CONFIG_X86_HT
index 42c6dc12107e260f0d4623f8905ff0d82a2a6250..9572c57682a55571a2c63e0b1c56d7f7cb4b1638 100644 (file)
@@ -29,6 +29,14 @@ struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
  */
 u64 host_pat = 0x050100070406;
 
+static unsigned int __cpuinitdata cleared_caps[NCAPINTS];
+
+void __init setup_clear_cpu_cap(unsigned int cap)
+{
+       __clear_bit(cap, boot_cpu_data.x86_capability);
+       __set_bit(cap, cleared_caps);
+}
+
 static void default_init(struct cpuinfo_x86 * c)
 {
        /* Not much we can do here... */
@@ -235,6 +243,7 @@ static void __init early_cpu_detect(void)
                if (c->x86 >= 0x6)
                        c->x86_model += ((tfms >> 16) & 0xF) << 4;
                c->x86_mask = tfms & 15;
+               cap0 &= ~cleared_caps[0];
                if (cap0 & (1<<19))
                        c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
                c->x86_capability[0] = cap0; /* Added for Xen bootstrap */
@@ -329,6 +338,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
        c->x86_vendor_id[0] = '\0'; /* Unset */
        c->x86_model_id[0] = '\0';  /* Unset */
        c->x86_max_cores = 1;
+       c->x86_num_siblings = 1;
        c->x86_clflush_size = 0;
        memset(&c->x86_capability, 0, sizeof c->x86_capability);
 
@@ -395,6 +405,9 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
        if (disable_pse)
                clear_bit(X86_FEATURE_PSE, c->x86_capability);
 
+       for (i = 0 ; i < NCAPINTS ; ++i)
+               c->x86_capability[i] &= ~cleared_caps[i];
+
        /* If the model name is still unset, do table lookup. */
        if ( !c->x86_model_id[0] ) {
                char *p;
@@ -468,27 +481,27 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
        if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
                return;
 
-       smp_num_siblings = (ebx & 0xff0000) >> 16;
+       c->x86_num_siblings = (ebx & 0xff0000) >> 16;
 
-       if (smp_num_siblings == 1) {
+       if (c->x86_num_siblings == 1) {
                printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-       } else if (smp_num_siblings > 1 ) {
+       } else if (c->x86_num_siblings > 1 ) {
 
-               if (smp_num_siblings > NR_CPUS) {
-                       printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
-                       smp_num_siblings = 1;
+               if (c->x86_num_siblings > NR_CPUS) {
+                       printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", c->x86_num_siblings);
+                       c->x86_num_siblings = 1;
                        return;
                }
 
-               index_msb = get_count_order(smp_num_siblings);
+               index_msb = get_count_order(c->x86_num_siblings);
                phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
 
                printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
                       phys_proc_id[cpu]);
 
-               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+               c->x86_num_siblings = c->x86_num_siblings / c->x86_max_cores;
 
-               index_msb = get_count_order(smp_num_siblings) ;
+               index_msb = get_count_order(c->x86_num_siblings) ;
 
                core_bits = get_count_order(c->x86_max_cores);
 
@@ -564,7 +577,10 @@ void __cpuinit cpu_init(void)
 {
        int cpu = smp_processor_id();
        struct tss_struct *t = &init_tss[cpu];
-       char gdt_load[10];
+       struct desc_ptr gdt_desc = {
+               .base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY),
+               .limit = LAST_RESERVED_GDT_BYTE
+       };
 
        if (cpu_test_and_set(cpu, cpu_initialized)) {
                printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -578,9 +594,7 @@ void __cpuinit cpu_init(void)
        /* Install correct page table. */
        write_ptbase(current);
 
-       *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
-       *(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(current);
-       asm volatile ( "lgdt %0" : "=m" (gdt_load) );
+       asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
 
        /* No nested task. */
        asm volatile ("pushf ; andw $0xbfff,(%"__OP"sp) ; popf" );
@@ -600,8 +614,7 @@ void __cpuinit cpu_init(void)
        BUG_ON((get_stack_bottom() & 15) != 0);
        t->rsp0 = get_stack_bottom();
 #endif
-       set_tss_desc(cpu,t);
-       load_TR(cpu);
+       load_TR();
        asm volatile ( "lldt %%ax" : : "a" (0) );
 
        /* Clear all 6 debug registers: */
index f8fdb8af286a294a11be0c1d42963c2ef5bf8ebb..1574004f6cb7e48db31977a03945f6135172224b 100644 (file)
@@ -218,6 +218,10 @@ static void __devinit init_intel(struct cpuinfo_x86 *c)
        if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
                (c->x86 == 0x6 && c->x86_model >= 0x0e))
                set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
+       if (cpuid_edx(0x80000007) & (1u<<8)) {
+               set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
+               set_bit(X86_FEATURE_NOSTOP_TSC, c->x86_capability);
+       }
 
        start_vmx();
 }
index 3ecc791402ceafbd90ece603237681d8e008fdf3..ed0ae000580c3de10c5599bc878c071b55b4c118 100644 (file)
@@ -2,9 +2,9 @@ obj-y += amd_nonfatal.o
 obj-y += k7.o
 obj-y += amd_k8.o
 obj-y += amd_f10.o
+obj-y += mctelem.o
 obj-y += mce.o
+obj-y += mce_intel.o
 obj-y += non-fatal.o
-obj-y += p4.o
 obj-$(x86_32) += p5.o
-obj-$(x86_32) += p6.o
 obj-$(x86_32) += winchip.o
index 9c26ef9fe85b74e020b22524f51d6acf437ef717..272f1fe674dfaa60081e7ad9639bd68941df15e4 100644 (file)
 #include "x86_mca.h"
 
 
-static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
+static enum mca_extinfo
+amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
 {
        struct mcinfo_extended mc_ext;
 
        /* Family 0x10 introduced additional MSR that belong to the
         * northbridge bank (4). */
-       if (bank != 4)
-               return 0;
+       if (mi == NULL || bank != 4)
+               return MCA_EXTINFO_IGNORED;
 
        if (!(status & MCi_STATUS_VAL))
-               return 0;
+               return MCA_EXTINFO_IGNORED;
 
        if (!(status & MCi_STATUS_MISCV))
-               return 0;
+               return MCA_EXTINFO_IGNORED;
 
        memset(&mc_ext, 0, sizeof(mc_ext));
        mc_ext.common.type = MC_TYPE_EXTENDED;
@@ -73,28 +74,30 @@ static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
        mc_ext.mc_msr[1].reg = MSR_F10_MC4_MISC2;
        mc_ext.mc_msr[2].reg = MSR_F10_MC4_MISC3;
 
-       rdmsrl(MSR_F10_MC4_MISC1, mc_ext.mc_msr[0].value);
-       rdmsrl(MSR_F10_MC4_MISC2, mc_ext.mc_msr[1].value);
-       rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value);
+       mca_rdmsrl(MSR_F10_MC4_MISC1, mc_ext.mc_msr[0].value);
+       mca_rdmsrl(MSR_F10_MC4_MISC2, mc_ext.mc_msr[1].value);
+       mca_rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value);
        
        x86_mcinfo_add(mi, &mc_ext);
-       return 1;
+       return MCA_EXTINFO_LOCAL;
 }
 
 
 extern void k8_machine_check(struct cpu_user_regs *regs, long error_code);
 
 /* AMD Family10 machine check */
-void amd_f10_mcheck_init(struct cpuinfo_x86 *c) 
+int amd_f10_mcheck_init(struct cpuinfo_x86 *c) 
 { 
        uint64_t value;
        uint32_t i;
        int cpu_nr;
 
-       machine_check_vector = k8_machine_check;
-       mc_callback_bank_extended = amd_f10_handler;
+       if (!cpu_has(c, X86_FEATURE_MCA))
+               return 0;
+
+       x86_mce_vector_register(k8_machine_check);
+       x86_mce_callback_register(amd_f10_handler);
        cpu_nr = smp_processor_id();
-       wmb();
 
        rdmsrl(MSR_IA32_MCG_CAP, value);
        if (value & MCG_CTL_P)  /* Control register present ? */
@@ -104,18 +107,9 @@ void amd_f10_mcheck_init(struct cpuinfo_x86 *c)
        for (i = 0; i < nr_mce_banks; i++) {
                switch (i) {
                case 4: /* Northbridge */
-                       /* Enable error reporting of all errors,
-                        * enable error checking and
-                        * disable sync flooding */
-                       wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+                       /* Enable error reporting of all errors */
+                       wrmsrl(MSR_IA32_MC4_CTL, 0xffffffffffffffffULL);
                        wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
-
-                       /* XXX: We should write the value 0x1087821UL into
-                        * to register F3x180 here, which sits in
-                        * the PCI extended configuration space.
-                        * Since this is not possible here, we can only hope,
-                        * Dom0 is doing that.
-                        */
                        break;
 
                default:
@@ -128,4 +122,5 @@ void amd_f10_mcheck_init(struct cpuinfo_x86 *c)
 
        set_in_cr4(X86_CR4_MCE);
        printk("CPU%i: AMD Family10h machine check reporting enabled.\n", cpu_nr);
+       return 1;
 }
index 55910f2c69f32979cefcf768f49f3eddbfd6e4ae..03c36d3a1dd802e2a37342b1fce4817d620a6180 100644 (file)
 #include <asm/msr.h>
 
 #include "mce.h"
-#include "x86_mca.h"
 
 
 /* Machine Check Handler for AMD K8 family series */
 void k8_machine_check(struct cpu_user_regs *regs, long error_code)
 {
-       struct vcpu *vcpu = current;
-       struct domain *curdom;
-       struct mc_info *mc_data;
-       struct mcinfo_global mc_global;
-       struct mcinfo_bank mc_info;
-       uint64_t status, addrv, miscv, uc;
-       uint32_t i;
-       unsigned int cpu_nr;
-       uint32_t xen_impacted = 0;
-#define DOM_NORMAL     0
-#define DOM0_TRAP      1
-#define DOMU_TRAP      2
-#define DOMU_KILLED    4
-       uint32_t dom_state = DOM_NORMAL;
-
-       /* This handler runs as interrupt gate. So IPIs from the
-        * polling service routine are defered until we finished.
-        */
-
-        /* Disable interrupts for the _vcpu_. It may not re-scheduled to
-        * an other physical CPU or the impacted process in the guest
-        * continues running with corrupted data, otherwise. */
-        vcpu_schedule_lock_irq(vcpu);
-
-       mc_data = x86_mcinfo_getptr();
-       cpu_nr = smp_processor_id();
-       curdom = vcpu->domain;
-
-       memset(&mc_global, 0, sizeof(mc_global));
-       mc_global.common.type = MC_TYPE_GLOBAL;
-       mc_global.common.size = sizeof(mc_global);
-
-       mc_global.mc_domid = curdom->domain_id; /* impacted domain */
-       mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
-       BUG_ON(cpu_nr != vcpu->processor);
-       mc_global.mc_core_threadid = 0;
-       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
-#if 0 /* TODO: on which socket is this physical core?
-         It's not clear to me how to figure this out. */
-       mc_global.mc_socketid = ???;
-#endif
-       mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
-       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
-       /* Quick check, who is impacted */
-       xen_impacted = is_idle_domain(curdom);
-
-       /* Dom0 */
-       x86_mcinfo_clear(mc_data);
-       x86_mcinfo_add(mc_data, &mc_global);
-
-       for (i = 0; i < nr_mce_banks; i++) {
-               struct domain *d;
-
-               rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
-
-               if (!(status & MCi_STATUS_VAL))
-                       continue;
-
-               /* An error happened in this bank.
-                * This is expected to be an uncorrectable error,
-                * since correctable errors get polled.
-                */
-               uc = status & MCi_STATUS_UC;
-
-               memset(&mc_info, 0, sizeof(mc_info));
-               mc_info.common.type = MC_TYPE_BANK;
-               mc_info.common.size = sizeof(mc_info);
-               mc_info.mc_bank = i;
-               mc_info.mc_status = status;
-
-               addrv = 0;
-               if (status & MCi_STATUS_ADDRV) {
-                       rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
-                       
-                       d = maddr_get_owner(addrv);
-                       if (d != NULL)
-                               mc_info.mc_domid = d->domain_id;
-               }
-
-               miscv = 0;
-               if (status & MCi_STATUS_MISCV)
-                       rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
-
-               mc_info.mc_addr = addrv;
-               mc_info.mc_misc = miscv;
-
-               x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
-
-               if (mc_callback_bank_extended)
-                       mc_callback_bank_extended(mc_data, i, status);
-
-               /* clear status */
-               wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
-               wmb();
-               add_taint(TAINT_MACHINE_CHECK);
-       }
-
-       status = mc_global.mc_gstatus;
-
-       /* clear MCIP or cpu enters shutdown state
-        * in case another MCE occurs. */
-       status &= ~MCG_STATUS_MCIP;
-       wrmsrl(MSR_IA32_MCG_STATUS, status);
-       wmb();
-
-       /* For the details see the discussion "MCE/MCA concept" on xen-devel.
-        * The thread started here:
-        * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
-        */
-
-       /* MCG_STATUS_RIPV: 
-        * When this bit is not set, then the instruction pointer onto the stack
-        * to resume at is not valid. If xen is interrupted, then we panic anyway
-        * right below. Otherwise it is up to the guest to figure out if 
-        * guest kernel or guest userland is affected and should kill either
-        * itself or the affected process.
-        */
-
-       /* MCG_STATUS_EIPV:
-        * Evaluation of EIPV is the job of the guest.
-        */
-
-       if (xen_impacted) {
-               /* Now we are going to panic anyway. Allow interrupts, so that
-                * printk on serial console can work. */
-               vcpu_schedule_unlock_irq(vcpu);
-
-               /* Uh, that means, machine check exception
-                * inside Xen occured. */
-               printk("Machine check exception occured in Xen.\n");
-
-               /* if MCG_STATUS_EIPV indicates, the IP on the stack is related
-                * to the error then it makes sense to print a stack trace.
-                * That can be useful for more detailed error analysis and/or
-                * error case studies to figure out, if we can clear
-                * xen_impacted and kill a DomU instead
-                * (i.e. if a guest only control structure is affected, but then
-                * we must ensure the bad pages are not re-used again).
-                */
-               if (status & MCG_STATUS_EIPV) {
-                       printk("MCE: Instruction Pointer is related to the error. "
-                               "Therefore, print the execution state.\n");
-                       show_execution_state(regs);
-               }
-               x86_mcinfo_dump(mc_data);
-               panic("End of MCE. Use mcelog to decode above error codes.\n");
-       }
-
-       /* If Dom0 registered a machine check handler, which is only possible
-        * with a PV MCA driver, then ... */
-       if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
-               dom_state = DOM0_TRAP;
-
-               /* ... deliver machine check trap to Dom0. */
-               send_guest_trap(dom0, 0, TRAP_machine_check);
-
-               /* Xen may tell Dom0 now to notify the DomU.
-                * But this will happen through a hypercall. */
-       } else
-               /* Dom0 did not register a machine check handler, but if DomU
-                * did so, then... */
-                if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, TRAP_machine_check) ) {
-                       dom_state = DOMU_TRAP;
-
-                       /* ... deliver machine check trap to DomU */
-                       send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check);
-       } else {
-               /* hmm... noone feels responsible to handle the error.
-                * So, do a quick check if a DomU is impacted or not.
-                */
-               if (curdom == dom0) {
-                       /* Dom0 is impacted. Since noone can't handle
-                        * this error, panic! */
-                       x86_mcinfo_dump(mc_data);
-                       panic("MCE occured in Dom0, which it can't handle\n");
-
-                       /* UNREACHED */
-               } else {
-                       dom_state = DOMU_KILLED;
-
-                       /* Enable interrupts. This basically results in
-                        * calling sti on the *physical* cpu. But after
-                        * domain_crash() the vcpu pointer is invalid.
-                        * Therefore, we must unlock the irqs before killing
-                        * it. */
-                       vcpu_schedule_unlock_irq(vcpu);
-
-                       /* DomU is impacted. Kill it and continue. */
-                       domain_crash(curdom);
-               }
-       }
-
-
-       switch (dom_state) {
-       case DOM0_TRAP:
-       case DOMU_TRAP:
-               /* Enable interrupts. */
-               vcpu_schedule_unlock_irq(vcpu);
-
-               /* guest softirqs and event callbacks are scheduled
-                * immediately after this handler exits. */
-               break;
-       case DOMU_KILLED:
-               /* Nothing to do here. */
-               break;
-       default:
-               BUG();
-       }
+       mcheck_cmn_handler(regs, error_code, mca_allbanks);
 }
 
-
 /* AMD K8 machine check */
-void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
+int amd_k8_mcheck_init(struct cpuinfo_x86 *c)
 {
        uint64_t value;
        uint32_t i;
        int cpu_nr;
 
-       machine_check_vector = k8_machine_check;
+       /* Check for PPro style MCA; our caller has confirmed MCE support. */
+       if (!cpu_has(c, X86_FEATURE_MCA))
+               return 0;
+
+       x86_mce_vector_register(k8_machine_check);
        cpu_nr = smp_processor_id();
-       wmb();
 
        rdmsrl(MSR_IA32_MCG_CAP, value);
        if (value & MCG_CTL_P)  /* Control register present ? */
@@ -304,10 +97,8 @@ void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
        for (i = 0; i < nr_mce_banks; i++) {
                switch (i) {
                case 4: /* Northbridge */
-                       /* Enable error reporting of all errors,
-                        * enable error checking and
-                        * disable sync flooding */
-                       wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+                       /* Enable error reporting of all errors */
+                       wrmsrl(MSR_IA32_MC4_CTL, 0xffffffffffffffffULL);
                        wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
                        break;
 
@@ -321,4 +112,6 @@ void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
 
        set_in_cr4(X86_CR4_MCE);
        printk("CPU%i: AMD K8 machine check reporting enabled.\n", cpu_nr);
+
+       return 1;
 }
index 03827fac5fbb2e9d7399c39061abb144982ea2f1..d354b1f06b2bfefa3ef074c63593be1fd9410117 100644 (file)
 #include <xen/smp.h>
 #include <xen/timer.h>
 #include <xen/event.h>
-#include <asm/processor.h> 
+
+#include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/msr.h>
 
 #include "mce.h"
-#include "x86_mca.h"
 
 static struct timer mce_timer;
 
-#define MCE_PERIOD MILLISECS(15000)
+#define MCE_PERIOD MILLISECS(10000)
 #define MCE_MIN    MILLISECS(2000)
 #define MCE_MAX    MILLISECS(30000)
 
 static s_time_t period = MCE_PERIOD;
 static int hw_threshold = 0;
 static int adjust = 0;
+static int variable_period = 1;
 
 /* The polling service routine:
  * Collects information of correctable errors and notifies
@@ -81,100 +82,46 @@ static int adjust = 0;
  */
 void mce_amd_checkregs(void *info)
 {
-       struct vcpu *vcpu = current;
-       struct mc_info *mc_data;
-       struct mcinfo_global mc_global;
-       struct mcinfo_bank mc_info;
-       uint64_t status, addrv, miscv;
-       unsigned int i;
+       mctelem_cookie_t mctc;
+       struct mca_summary bs;
        unsigned int event_enabled;
-       unsigned int cpu_nr;
-       int error_found;
 
-       /* We don't need a slot yet. Only allocate one on error. */
-       mc_data = NULL;
+       mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs);
 
-       cpu_nr = smp_processor_id();
        event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
-       error_found = 0;
-
-       memset(&mc_global, 0, sizeof(mc_global));
-       mc_global.common.type = MC_TYPE_GLOBAL;
-       mc_global.common.size = sizeof(mc_global);
-
-       mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
-       mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
-       BUG_ON(cpu_nr != vcpu->processor);
-       mc_global.mc_core_threadid = 0;
-       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
-#if 0 /* TODO: on which socket is this physical core?
-         It's not clear to me how to figure this out. */
-       mc_global.mc_socketid = ???;
-#endif
-       mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
-       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
-       for (i = 0; i < nr_mce_banks; i++) {
-               struct domain *d;
-
-               rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
-
-               if (!(status & MCi_STATUS_VAL))
-                       continue;
-
-               if (mc_data == NULL) {
-                       /* Now we need a slot to fill in error telemetry. */
-                       mc_data = x86_mcinfo_getptr();
-                       BUG_ON(mc_data == NULL);
-                       x86_mcinfo_clear(mc_data);
-                       x86_mcinfo_add(mc_data, &mc_global);
-               }
-
-               memset(&mc_info, 0, sizeof(mc_info));
-               mc_info.common.type = MC_TYPE_BANK;
-               mc_info.common.size = sizeof(mc_info);
-               mc_info.mc_bank = i;
-               mc_info.mc_status = status;
-
-               /* Increase polling frequency */
-               error_found = 1;
-
-               addrv = 0;
-               if (status & MCi_STATUS_ADDRV) {
-                       rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
-
-                       d = maddr_get_owner(addrv);
-                       if (d != NULL)
-                               mc_info.mc_domid = d->domain_id;
-               }
 
-               miscv = 0;
-               if (status & MCi_STATUS_MISCV)
-                       rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
+       if (bs.errcnt && mctc != NULL) {
+               static uint64_t dumpcount = 0;
 
-               mc_info.mc_addr = addrv;
-               mc_info.mc_misc = miscv;
-               x86_mcinfo_add(mc_data, &mc_info);
+               /* If Dom0 enabled the VIRQ_MCA event, then notify it.
+                * Otherwise, if dom0 has had plenty of time to register
+                * the virq handler but still hasn't, then dump telemetry
+                * to the Xen console.  The call count may be incremented
+                * on multiple cpus at once and is indicative only - just
+                * a simple-minded attempt to avoid spamming the console
+                * for corrected errors in early startup. */
 
-               if (mc_callback_bank_extended)
-                       mc_callback_bank_extended(mc_data, i, status);
-
-               /* clear status */
-               wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
-               wmb();
-       }
-
-       if (error_found > 0) {
-               /* If Dom0 enabled the VIRQ_MCA event, then ... */
-               if (event_enabled)
-                       /* ... notify it. */
+               if (event_enabled) {
+                       mctelem_commit(mctc);
                        send_guest_global_virq(dom0, VIRQ_MCA);
-               else
-                       /* ... or dump it */
-                       x86_mcinfo_dump(mc_data);
+               } else if (++dumpcount >= 10) {
+                       x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc));
+                       mctelem_dismiss(mctc);
+               } else {
+                       mctelem_dismiss(mctc);
+               }
+               
+       } else if (mctc != NULL) {
+               mctelem_dismiss(mctc);
        }
 
-       adjust += error_found;
+       /* adjust is global and all cpus may attempt to increment it without
+        * synchronisation, so they race and the final adjust count
+        * (number of cpus seeing any error) is approximate.  We can
+        * guarantee that if any cpu observes an error that the
+        * adjust count is at least 1. */
+       if (bs.errcnt)
+               adjust++;
 }
 
 /* polling service routine invoker:
@@ -189,7 +136,7 @@ static void mce_amd_work_fn(void *data)
        on_each_cpu(mce_amd_checkregs, data, 1, 1);
 
        if (adjust > 0) {
-               if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
+               if (!guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
                        /* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
                        printk("MCE: polling routine found correctable error. "
                                " Use mcelog to parse above error output.\n");
@@ -200,7 +147,7 @@ static void mce_amd_work_fn(void *data)
                uint64_t value;
                uint32_t counter;
 
-               rdmsrl(MSR_IA32_MC4_MISC, value);
+               mca_rdmsrl(MSR_IA32_MC4_MISC, value);
                /* Only the error counter field is of interest
                 * Bit field is described in AMD K8 BKDG chapter 6.4.5.5
                 */
@@ -225,24 +172,24 @@ static void mce_amd_work_fn(void *data)
                        value &= ~(0x60FFF00000000ULL);
                        /* Counter enable */
                        value |= (1ULL << 51);
-                       wrmsrl(MSR_IA32_MC4_MISC, value);
+                       mca_wrmsrl(MSR_IA32_MC4_MISC, value);
                        wmb();
                }
        }
 
-       if (adjust > 0) {
+       if (variable_period && adjust > 0) {
                /* Increase polling frequency */
                adjust++; /* adjust == 1 must have an effect */
                period /= adjust;
-       } else {
+       } else if (variable_period) {
                /* Decrease polling frequency */
                period *= 2;
        }
-       if (period > MCE_MAX) {
+       if (variable_period && period > MCE_MAX) {
                /* limit: Poll at least every 30s */
                period = MCE_MAX;
        }
-       if (period < MCE_MIN) {
+       if (variable_period && period < MCE_MIN) {
                /* limit: Poll every 2s.
                 * When this is reached an uncorrectable error
                 * is expected to happen, if Dom0 does nothing.
@@ -263,7 +210,7 @@ void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c)
 
        /* The threshold bitfields in MSR_IA32_MC4_MISC has
         * been introduced along with the SVME feature bit. */
-       if (cpu_has(c, X86_FEATURE_SVME)) {
+       if (variable_period && cpu_has(c, X86_FEATURE_SVME)) {
                uint64_t value;
 
                /* hw threshold registers present */
index 59e60a14e853ec0ffce3f14a5188ea00dd2a650b..1a0a0a5fefa38886e3c304f5600bfb7c0436d00e 100644 (file)
@@ -14,6 +14,7 @@
 #include <asm/msr.h>
 
 #include "mce.h"
+#include "x86_mca.h"
 
 /* Machine Check Handler For AMD Athlon/Duron */
 static fastcall void k7_machine_check(struct cpu_user_regs * regs, long error_code)
@@ -57,9 +58,9 @@ static fastcall void k7_machine_check(struct cpu_user_regs * regs, long error_co
        }
 
        if (recover&2)
-               panic ("CPU context corrupt");
+               mc_panic ("CPU context corrupt");
        if (recover&1)
-               panic ("Unable to continue");
+               mc_panic ("Unable to continue");
        printk (KERN_EMERG "Attempting to continue.\n");
        mcgstl &= ~(1<<2);
        wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
@@ -67,13 +68,16 @@ static fastcall void k7_machine_check(struct cpu_user_regs * regs, long error_co
 
 
 /* AMD K7 machine check */
-void amd_k7_mcheck_init(struct cpuinfo_x86 *c)
+int amd_k7_mcheck_init(struct cpuinfo_x86 *c)
 {
        u32 l, h;
        int i;
 
-       machine_check_vector = k7_machine_check;
-       wmb();
+       /* Check for PPro style MCA; our caller has confirmed MCE support. */
+       if (!cpu_has(c, X86_FEATURE_MCA))
+               return 0;
+
+       x86_mce_vector_register(k7_machine_check);
 
        rdmsr (MSR_IA32_MCG_CAP, l, h);
        if (l & (1<<8)) /* Control register present ? */
@@ -91,4 +95,6 @@ void amd_k7_mcheck_init(struct cpuinfo_x86 *c)
        set_in_cr4 (X86_CR4_MCE);
        printk (KERN_INFO "CPU%d: AMD K7 machine check reporting enabled.\n",
                smp_processor_id());
+
+       return 1;
 }
index 6406c33ca1c42faddd39157eec4942d5c2692b83..3685056cbae445a18cf1de003396ede723a01d82 100644 (file)
 #include <xen/config.h>
 #include <xen/smp.h>
 #include <xen/errno.h>
+#include <xen/console.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/cpumask.h>
+#include <xen/event.h>
+#include <xen/guest_access.h>
 
-#include <asm/processor.h> 
+#include <asm/processor.h>
 #include <asm/system.h>
+#include <asm/msr.h>
 
 #include "mce.h"
-#include "x86_mca.h"
 
 int mce_disabled = 0;
 unsigned int nr_mce_banks;
 
 EXPORT_SYMBOL_GPL(nr_mce_banks);       /* non-fatal.o */
 
-/* XXX For now a fixed array is used. Later this should be changed
- * to a dynamic allocated array with the size calculated in relation
- * to physical cpus present in the machine.
- * The more physical cpus are available, the more entries you need.
- */
-#define MAX_MCINFO     10
-
-struct mc_machine_notify {
-       struct mc_info mc;
-       uint32_t fetch_idx;
-       uint32_t valid;
-};
+static void intpose_init(void);
+static void mcinfo_clear(struct mc_info *);
 
-struct mc_machine {
+#define        SEG_PL(segsel)                  ((segsel) & 0x3)
+#define _MC_MSRINJ_F_REQ_HWCR_WREN     (1 << 16)
 
-       /* Array structure used for collecting machine check error telemetry. */
-       struct mc_info mc[MAX_MCINFO];
+#if 1  /* XXFM switch to 0 for putback */
 
-       /* We handle multiple machine check reports lockless by
-        * iterating through the array using the producer/consumer concept.
-        */
-       /* Producer array index to fill with machine check error data.
-        * Index must be increased atomically. */
-       uint32_t error_idx;
-
-       /* Consumer array index to fetch machine check error data from.
-        * Index must be increased atomically. */
-       uint32_t fetch_idx;
-
-       /* Integer array holding the indeces of the mc array that allows
-         * a Dom0 to notify a DomU to re-fetch the same machine check error
-         * data. The notification and refetch also uses its own 
-        * producer/consumer mechanism, because Dom0 may decide to not report
-        * every error to the impacted DomU.
-        */
-       struct mc_machine_notify notify[MAX_MCINFO];
+#define        x86_mcerr(str, err) _x86_mcerr(str, err)
 
-       /* Array index to get fetch_idx from.
-        * Index must be increased atomically. */
-       uint32_t notifyproducer_idx;
-       uint32_t notifyconsumer_idx;
-};
+static int _x86_mcerr(const char *msg, int err)
+{
+       printk("x86_mcerr: %s, returning %d\n",
+           msg != NULL ? msg : "", err);
+       return err;
+}
+#else
+#define x86_mcerr(str,err)
+#endif
 
-/* Global variable with machine check information. */
-struct mc_machine mc_data;
+cpu_banks_t mca_allbanks;
 
 /* Handle unconfigured int18 (should never happen) */
 static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
-{      
+{
        printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
                smp_processor_id());
 }
 
 
+static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;
+
+void x86_mce_vector_register(x86_mce_vector_t hdlr)
+{
+       _machine_check_vector = hdlr;
+       wmb();
+}
+
 /* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code) = unexpected_machine_check;
+
+void machine_check_vector(struct cpu_user_regs *regs, long error_code)
+{
+       _machine_check_vector(regs, error_code);
+}
 
 /* Init machine check callback handler
  * It is used to collect additional information provided by newer
  * CPU families/models without the need to duplicate the whole handler.
  * This avoids having many handlers doing almost nearly the same and each
  * with its own tweaks ands bugs. */
-int (*mc_callback_bank_extended)(struct mc_info *, uint16_t, uint64_t) = NULL;
+static x86_mce_callback_t mc_callback_bank_extended = NULL;
 
-
-static void amd_mcheck_init(struct cpuinfo_x86 *ci)
+void x86_mce_callback_register(x86_mce_callback_t cbfunc)
 {
+       mc_callback_bank_extended = cbfunc;
+}
 
-       switch (ci->x86) {
-       case 6:
-               amd_k7_mcheck_init(ci);
-               break;
+/* Utility function to perform MCA bank telemetry readout and to push that
+ * telemetry towards an interested dom0 for logging and diagnosis.
+ * The caller - #MC handler or MCA poll function - must arrange that we
+ * do not migrate cpus. */
 
-       case 0xf:
-               amd_k8_mcheck_init(ci);
-               break;
+/* XXFM Could add overflow counting? */
+mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
+    struct mca_summary *sp)
+{
+       struct vcpu *v = current;
+       struct domain *d;
+       uint64_t gstatus, status, addr, misc;
+       struct mcinfo_global mcg;       /* on stack */
+       struct mcinfo_common *mic;
+       struct mcinfo_global *mig;      /* on stack */
+       mctelem_cookie_t mctc = NULL;
+       uint32_t uc = 0, pcc = 0;
+       struct mc_info *mci = NULL;
+       mctelem_class_t which = MC_URGENT;      /* XXXgcc */
+       unsigned int cpu_nr;
+       int errcnt = 0;
+       int i;
+       enum mca_extinfo cbret = MCA_EXTINFO_IGNORED;
 
-       case 0x10:
-               amd_f10_mcheck_init(ci);
-               break;
+       cpu_nr = smp_processor_id();
+       BUG_ON(cpu_nr != v->processor);
 
-       default:
-               /* Assume that machine check support is available.
-                * The minimum provided support is at least the K8. */
-               amd_k8_mcheck_init(ci);
-       }
-}
+       mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
 
-/* This has to be run for each processor */
-void mcheck_init(struct cpuinfo_x86 *c)
-{
-       if (mce_disabled == 1) {
-               printk(XENLOG_INFO "MCE support disabled by bootparam\n");
-               return;
-       }
-
-       if (!cpu_has(c, X86_FEATURE_MCE)) {
-               printk(XENLOG_INFO "CPU%i: No machine check support available\n",
-                       smp_processor_id());
-               return;
+       memset(&mcg, 0, sizeof (mcg));
+       mcg.common.type = MC_TYPE_GLOBAL;
+       mcg.common.size = sizeof (mcg);
+       if (v != NULL && ((d = v->domain) != NULL)) {
+               mcg.mc_domid = d->domain_id;
+               mcg.mc_vcpuid = v->vcpu_id;
+       } else {
+               mcg.mc_domid = -1;
+               mcg.mc_vcpuid = -1;
        }
+       mcg.mc_gstatus = gstatus;       /* MCG_STATUS */
 
-       memset(&mc_data, 0, sizeof(struct mc_machine));
-
-       switch (c->x86_vendor) {
-       case X86_VENDOR_AMD:
-               amd_mcheck_init(c);
+       switch (who) {
+       case MCA_MCE_HANDLER:
+               mcg.mc_flags = MC_FLAG_MCE;
+               which = MC_URGENT;
                break;
 
-       case X86_VENDOR_INTEL:
-#ifndef CONFIG_X86_64
-               if (c->x86==5)
-                       intel_p5_mcheck_init(c);
-               if (c->x86==6)
-                       intel_p6_mcheck_init(c);
-#endif
-               if (c->x86==15)
-                       intel_p4_mcheck_init(c);
+       case MCA_POLLER:
+       case MCA_RESET:
+               mcg.mc_flags = MC_FLAG_POLLED;
+               which = MC_NONURGENT;
                break;
 
-#ifndef CONFIG_X86_64
-       case X86_VENDOR_CENTAUR:
-               if (c->x86==5)
-                       winchip_mcheck_init(c);
+       case MCA_CMCI_HANDLER:
+               mcg.mc_flags = MC_FLAG_CMCI;
+               which = MC_NONURGENT;
                break;
-#endif
 
        default:
-               break;
+               BUG();
        }
-}
 
+       /* Retrieve detector information */
+       x86_mc_get_cpu_info(cpu_nr, &mcg.mc_socketid,
+           &mcg.mc_coreid, &mcg.mc_core_threadid,
+           &mcg.mc_apicid, NULL, NULL, NULL);
 
-static void __init mcheck_disable(char *str)
-{
-       mce_disabled = 1;
-}
-
-static void __init mcheck_enable(char *str)
-{
-       mce_disabled = -1;
-}
+       for (i = 0; i < 32 && i < nr_mce_banks; i++) {
+               struct mcinfo_bank mcb;         /* on stack */
 
-custom_param("nomce", mcheck_disable);
-custom_param("mce", mcheck_enable);
+               /* Skip bank if corresponding bit in bankmask is clear */
+               if (!test_bit(i, bankmask))
+                       continue;
 
+               mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+               if (!(status & MCi_STATUS_VAL))
+                       continue;       /* this bank has no valid telemetry */
+
+               /* If this is the first bank with valid MCA DATA, then
+                * try to reserve an entry from the urgent/nonurgent queue
+                * depending on whethere we are called from an exception or
+                * a poller;  this can fail (for example dom0 may not
+                * yet have consumed past telemetry). */
+               if (errcnt == 0) {
+                       if ((mctc = mctelem_reserve(which)) != NULL) {
+                               mci = mctelem_dataptr(mctc);
+                               mcinfo_clear(mci);
+                       }
+               }
 
-#include <xen/guest_access.h>
-#include <asm/traps.h>
+               memset(&mcb, 0, sizeof (mcb));
+               mcb.common.type = MC_TYPE_BANK;
+               mcb.common.size = sizeof (mcb);
+               mcb.mc_bank = i;
+               mcb.mc_status = status;
 
-struct mc_info *x86_mcinfo_getptr(void)
-{
-       struct mc_info *mi;
-       uint32_t entry, next;
-
-       for (;;) {
-               entry = mc_data.error_idx;
-               smp_rmb();
-               next = entry + 1;
-               if (cmpxchg(&mc_data.error_idx, entry, next) == entry)
-                       break;
-       }
+               /* form a mask of which banks have logged uncorrected errors */
+               if ((status & MCi_STATUS_UC) != 0)
+                       uc |= (1 << i);
 
-       mi = &(mc_data.mc[(entry % MAX_MCINFO)]);
-       BUG_ON(mc_data.error_idx < mc_data.fetch_idx);
+               /* likewise for those with processor context corrupt */
+               if ((status & MCi_STATUS_PCC) != 0)
+                       pcc |= (1 << i);
 
-       return mi;
-}
+               addr = misc = 0;
 
-static int x86_mcinfo_matches_guest(const struct mc_info *mi,
-                       const struct domain *d, const struct vcpu *v)
-{
-       struct mcinfo_common *mic;
-       struct mcinfo_global *mig;
+               if (status & MCi_STATUS_ADDRV) {
+                       mca_rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
+                       d = maddr_get_owner(addr);
+                       if (d != NULL && (who == MCA_POLLER ||
+                           who == MCA_CMCI_HANDLER))
+                               mcb.mc_domid = d->domain_id;
+               }
 
-       x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
-       mig = (struct mcinfo_global *)mic;
-       if (mig == NULL)
-               return 0;
+               if (status & MCi_STATUS_MISCV)
+                       mca_rdmsrl(MSR_IA32_MC0_MISC + 4 * i, misc);
 
-       if (d->domain_id != mig->mc_domid)
-               return 0;
+               mcb.mc_addr = addr;
+               mcb.mc_misc = misc;
 
-       if (v->vcpu_id != mig->mc_vcpuid)
-               return 0;
+               if (who == MCA_CMCI_HANDLER) {
+                       mca_rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
+                       rdtscll(mcb.mc_tsc);
+               }
 
-       return 1;
-}
+               /* Increment the error count;  if this is the first bank
+                * with a valid error then add the global info to the mcinfo. */
+               if (errcnt++ == 0 && mci != NULL)
+                       x86_mcinfo_add(mci, &mcg);
 
+               /* Add the bank data */
+               if (mci != NULL)
+                       x86_mcinfo_add(mci, &mcb);
 
-#define x86_mcinfo_mcdata(idx) (mc_data.mc[(idx % MAX_MCINFO)])
+               if (mc_callback_bank_extended && cbret != MCA_EXTINFO_GLOBAL) {
+                       cbret = mc_callback_bank_extended(mci, i, status);
+               }
 
-static struct mc_info *x86_mcinfo_getfetchptr(uint32_t *fetch_idx,
-                               const struct domain *d, const struct vcpu *v)
-{
-       struct mc_info *mi;
+               /* Clear status */
+               mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+               wmb();
+       }
 
-       /* This function is called from the fetch hypercall with
-        * the mc_lock spinlock held. Thus, no need for locking here.
-        */
-       mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx));
-       if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) {
-               /* Bogus domU command detected. */
-               *fetch_idx = 0;
-               return NULL;
+       if (mci != NULL && errcnt > 0) {
+               x86_mcinfo_lookup(mic, mci, MC_TYPE_GLOBAL);
+               mig = (struct mcinfo_global *)mic;
+               if (pcc)
+                       mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
+               else if (uc)
+                       mcg.mc_flags |= MC_FLAG_RECOVERABLE;
+               else
+                       mcg.mc_flags |= MC_FLAG_CORRECTABLE;
        }
 
-       *fetch_idx = mc_data.fetch_idx;
-       mc_data.fetch_idx++;
-       BUG_ON(mc_data.fetch_idx > mc_data.error_idx);
 
-       return mi;
+       if (sp) {
+               sp->errcnt = errcnt;
+               sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
+               sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
+               sp->uc = uc;
+               sp->pcc = pcc;
+       }
+
+       return mci != NULL ? mctc : NULL;       /* may be NULL */
 }
 
+#define DOM_NORMAL     0
+#define DOM0_TRAP      1
+#define DOMU_TRAP      2
+#define DOMU_KILLED    4
 
-static void x86_mcinfo_marknotified(struct xen_mc_notifydomain *mc_notifydomain)
+/* Shared #MC handler. */
+void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
+    cpu_banks_t bankmask)
 {
-       struct mc_machine_notify *mn;
-       struct mcinfo_common *mic = NULL;
-       struct mcinfo_global *mig;
-       struct domain *d;
-       int i;
+       int xen_state_lost, dom0_state_lost, domU_state_lost;
+       struct vcpu *v = current;
+       struct domain *curdom = v->domain;
+       domid_t domid = curdom->domain_id;
+       int ctx_xen, ctx_dom0, ctx_domU;
+       uint32_t dom_state = DOM_NORMAL;
+       mctelem_cookie_t mctc = NULL;
+       struct mca_summary bs;
+       struct mc_info *mci = NULL;
+       int irqlocked = 0;
+       uint64_t gstatus;
+       int ripv;
+
+       /* This handler runs as interrupt gate. So IPIs from the
+        * polling service routine are deferred until we're finished.
+        */
+
+       /* Disable interrupts for the _vcpu_. It may not be re-scheduled to
+        * another physical CPU. */
+       vcpu_schedule_lock_irq(v);
+       irqlocked = 1;
+
+       /* Read global status;  if it does not indicate machine check
+        * in progress then bail as long as we have a valid ip to return to. */
+       mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+       ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
+       if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
+               add_taint(TAINT_MACHINE_CHECK); /* questionable */
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
+               goto cmn_handler_done;
+       }
 
-       /* This function is called from the notifier hypercall with
-        * the mc_notify_lock spinlock held. Thus, no need for locking here.
+       /* Go and grab error telemetry.  We must choose whether to commit
+        * for logging or dismiss the cookie that is returned, and must not
+        * reference the cookie after that action.
         */
+       mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs);
+       if (mctc != NULL)
+               mci = (struct mc_info *)mctelem_dataptr(mctc);
+
+       /* Clear MCIP or another #MC will enter shutdown state */
+       gstatus &= ~MCG_STATUS_MCIP;
+       mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus);
+       wmb();
+
+       /* If no valid errors and our stack is intact, we're done */
+       if (ripv && bs.errcnt == 0) {
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
+               goto cmn_handler_done;
+       }
 
-       /* First invalidate entries for guests that disappeared after
-        * notification (e.g. shutdown/crash). This step prevents the
-        * notification array from filling up with stalling/leaking entries.
+       if (bs.uc || bs.pcc)
+               add_taint(TAINT_MACHINE_CHECK);
+
+       /* Machine check exceptions will usually be for UC and/or PCC errors,
+        * but it is possible to configure machine check for some classes
+        * of corrected error.
+        *
+        * UC errors could compromise any domain or the hypervisor
+        * itself - for example a cache writeback of modified data that
+        * turned out to be bad could be for data belonging to anyone, not
+        * just the current domain.  In the absence of known data poisoning
+        * to prevent consumption of such bad data in the system we regard
+        * all UC errors as terminal.  It may be possible to attempt some
+        * heuristics based on the address affected, which guests have
+        * mappings to that mfn etc.
+        *
+        * PCC errors apply to the current context.
+        *
+        * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
+        * and not PCC is terminal - the return instruction pointer
+        * pushed onto the stack is bogus.  If the interrupt context is
+        * the hypervisor or dom0 the game is over, otherwise we can
+        * limit the impact to a single domU but only if we trampoline
+        * somewhere safely - we can't return and unwind the stack.
+        * Since there is no trampoline in place we will treat !RIPV
+        * as terminal for any context.
         */
-       for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
-               mn = &(mc_data.notify[(i % MAX_MCINFO)]);
-               x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
-               BUG_ON(mic == NULL);
-               mig = (struct mcinfo_global *)mic;
-               d = get_domain_by_id(mig->mc_domid);
-               if (d == NULL) {
-                       /* Domain does not exist. */
-                       mn->valid = 0;
+       ctx_xen = SEG_PL(regs->cs) == 0;
+       ctx_dom0 = !ctx_xen && (domid == dom0->domain_id);
+       ctx_domU = !ctx_xen && !ctx_dom0;
+
+       xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
+           !ripv;
+       dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
+       domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));
+
+       if (xen_state_lost) {
+               /* Now we are going to panic anyway. Allow interrupts, so that
+                * printk on serial console can work. */
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
+
+               printk("Terminal machine check exception occured in "
+                   "hypervisor context.\n");
+
+               /* If MCG_STATUS_EIPV indicates the IP on the stack is related
+                * to the error, then it makes sense to print a stack trace.
+                * That can be useful for more detailed error analysis and/or
+                * error case studies to figure out, if we can clear
+                * xen_impacted and kill a DomU instead
+                * (i.e. if a guest only control structure is affected, but then
+                * we must ensure the bad pages are not re-used again).
+                */
+               if (bs.eipv & MCG_STATUS_EIPV) {
+                       printk("MCE: Instruction Pointer is related to the "
+                           "error, therefore print the execution state.\n");
+                       show_execution_state(regs);
+               }
+
+               /* Commit the telemetry so that panic flow can find it. */
+               if (mctc != NULL) {
+                       x86_mcinfo_dump(mci);
+                       mctelem_commit(mctc);
                }
-               if ((!mn->valid) && (i == mc_data.notifyconsumer_idx))
-                       mc_data.notifyconsumer_idx++;
+               mc_panic("Hypervisor state lost due to machine check "
+                   "exception.\n");
+               /*NOTREACHED*/
        }
 
-       /* Now put in the error telemetry. Since all error data fetchable
-        * by domUs are uncorrectable errors, they are very important.
-        * So we dump them before overriding them. When a guest takes that long,
-        * then we can assume something bad already happened (crash, hang, etc.)
+       /*
+        * Xen hypervisor state is intact.  If dom0 state is lost then
+        * give it a chance to decide what to do if it has registered
+        * a handler for this event, otherwise panic.
+        *
+        * XXFM Could add some Solaris dom0 contract kill here?
         */
-       mn = &(mc_data.notify[(mc_data.notifyproducer_idx % MAX_MCINFO)]);
+       if (dom0_state_lost) {
+               if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
+                       dom_state = DOM0_TRAP;
+                       send_guest_trap(dom0, 0, TRAP_machine_check);
+                       /* XXFM case of return with !ripv ??? */
+               } else {
+                       /* Commit telemetry for panic flow. */
+                       if (mctc != NULL) {
+                               x86_mcinfo_dump(mci);
+                               mctelem_commit(mctc);
+                       }
+                       mc_panic("Dom0 state lost due to machine check "
+                           "exception\n");
+                       /*NOTREACHED*/
+               }
+       }
+
+       /*
+        * If a domU has lost state then send it a trap if it has registered
+        * a handler, otherwise crash the domain.
+        * XXFM Revisit this functionality.
+        */
+       if (domU_state_lost) {
+               if (guest_has_trap_callback(v->domain, v->vcpu_id,
+                   TRAP_machine_check)) {
+                       dom_state = DOMU_TRAP;
+                       send_guest_trap(curdom, v->vcpu_id,
+                           TRAP_machine_check);
+               } else {
+                       dom_state = DOMU_KILLED;
+                       /* Enable interrupts. This basically results in
+                        * calling sti on the *physical* cpu. But after
+                        * domain_crash() the vcpu pointer is invalid.
+                        * Therefore, we must unlock the irqs before killing
+                        * it. */
+                       vcpu_schedule_unlock_irq(v);
+                       irqlocked = 0;
+
+                       /* DomU is impacted. Kill it and continue. */
+                       domain_crash(curdom);
+               }
+       }
 
-       if (mn->valid) {
-               struct mcinfo_common *mic = NULL;
-               struct mcinfo_global *mig;
+       switch (dom_state) {
+       case DOM0_TRAP:
+       case DOMU_TRAP:
+               /* Enable interrupts. */
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
 
-               /* To not loose the information, we dump it. */
-               x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
-               BUG_ON(mic == NULL);
-               mig = (struct mcinfo_global *)mic;
-               printk(XENLOG_WARNING "Domain ID %u was notified by Dom0 to "
-                       "fetch machine check error telemetry. But Domain ID "
-                       "did not do that in time.\n",
-                       mig->mc_domid);
-               x86_mcinfo_dump(&mn->mc);
+               /* guest softirqs and event callbacks are scheduled
+                * immediately after this handler exits. */
+               break;
+       case DOMU_KILLED:
+               /* Nothing to do here. */
+               break;
+
+       case DOM_NORMAL:
+               vcpu_schedule_unlock_irq(v);
+               irqlocked = 0;
+               break;
        }
 
-       memcpy(&mn->mc, &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx)),
-               sizeof(struct mc_info));
-       mn->fetch_idx = mc_notifydomain->fetch_idx;
-       mn->valid = 1;
+cmn_handler_done:
+       BUG_ON(irqlocked);
+       BUG_ON(!ripv);
+
+       if (bs.errcnt) {
+               /* Not panicking, so forward telemetry to dom0 now if it
+                * is interested. */
+               if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+                       if (mctc != NULL)
+                               mctelem_commit(mctc);
+                       send_guest_global_virq(dom0, VIRQ_MCA);
+               } else {
+                       x86_mcinfo_dump(mci);
+                       if (mctc != NULL)
+                               mctelem_dismiss(mctc);
+               }
+       } else if (mctc != NULL) {
+               mctelem_dismiss(mctc);
+       }
+}
 
-       mc_data.notifyproducer_idx++;
+static int amd_mcheck_init(struct cpuinfo_x86 *ci)
+{
+       int rc = 0;
 
-       /* By design there can never be more notifies than machine check errors.
-        * If that ever happens, then we hit a bug. */
-       BUG_ON(mc_data.notifyproducer_idx > mc_data.fetch_idx);
-       BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
+       switch (ci->x86) {
+       case 6:
+               rc = amd_k7_mcheck_init(ci);
+               break;
+
+       case 0xf:
+               rc = amd_k8_mcheck_init(ci);
+               break;
+
+       case 0x10:
+               rc = amd_f10_mcheck_init(ci);
+               break;
+
+       default:
+               /* Assume that machine check support is available.
+                * The minimum provided support is at least the K8. */
+               rc = amd_k8_mcheck_init(ci);
+       }
+
+       return rc;
 }
 
-static struct mc_info *x86_mcinfo_getnotifiedptr(uint32_t *fetch_idx,
-                               const struct domain *d, const struct vcpu *v)
+/*check the existence of Machine Check*/
+int mce_available(struct cpuinfo_x86 *c)
 {
-       struct mc_machine_notify *mn = NULL;
-       uint32_t i;
-       int found;
+       return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
+}
 
-       /* This function is called from the fetch hypercall with
-        * the mc_notify_lock spinlock held. Thus, no need for locking here.
-        */
+/*
+ * Check if bank 0 is usable for MCE. It isn't for AMD K7,
+ * and Intel P6 family before model 0x1a.
+ */
+int mce_firstbank(struct cpuinfo_x86 *c)
+{
+       if (c->x86 == 6) {
+               if (c->x86_vendor == X86_VENDOR_AMD)
+                       return 1;
 
-       /* The notifier data is filled in the order guests get notified, but
-        * guests may fetch them in a different order. That's why we need
-        * the game with valid/invalid entries. */
-       found = 0;
-       for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
-               mn = &(mc_data.notify[(i % MAX_MCINFO)]);
-               if (!mn->valid) {
-                       if (i == mc_data.notifyconsumer_idx)
-                               mc_data.notifyconsumer_idx++;
-                       continue;
-               }
-               if (x86_mcinfo_matches_guest(&mn->mc, d, v)) {
-                       found = 1;
+               if (c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a)
+                       return 1;
+       }
+
+       return 0;
+}
+
+/* This has to be run for each processor */
+void mcheck_init(struct cpuinfo_x86 *c)
+{
+       int inited = 0, i;
+
+       if (mce_disabled == 1) {
+               printk(XENLOG_INFO "MCE support disabled by bootparam\n");
+               return;
+       }
+
+       for (i = 0; i < MAX_NR_BANKS; i++)
+               set_bit(i,mca_allbanks);
+
+       /* Enforce at least MCE support in CPUID information.  Individual
+        * families may also need to enforce a check for MCA support. */
+       if (!cpu_has(c, X86_FEATURE_MCE)) {
+               printk(XENLOG_INFO "CPU%i: No machine check support available\n",
+                       smp_processor_id());
+               return;
+       }
+
+       intpose_init();
+       mctelem_init(sizeof (struct mc_info));
+
+       switch (c->x86_vendor) {
+       case X86_VENDOR_AMD:
+               inited = amd_mcheck_init(c);
+               break;
+
+       case X86_VENDOR_INTEL:
+               switch (c->x86) {
+               case 5:
+#ifndef CONFIG_X86_64
+                       inited = intel_p5_mcheck_init(c);
+#endif
+                       break;
+
+               case 6:
+               case 15:
+                       inited = intel_mcheck_init(c);
                        break;
                }
-       }
+               break;
+
+#ifndef CONFIG_X86_64
+       case X86_VENDOR_CENTAUR:
+               if (c->x86==5) {
+                       inited = winchip_mcheck_init(c);
+               }
+               break;
+#endif
 
-       if (!found) {
-               /* This domain has never been notified. This must be
-                * a bogus domU command. */
-               *fetch_idx = 0;
-               return NULL;
+       default:
+               break;
        }
 
-       BUG_ON(mn == NULL);
-       *fetch_idx = mn->fetch_idx;
-       mn->valid = 0;
+       if (!inited)
+               printk(XENLOG_INFO "CPU%i: No machine check initialization\n",
+                   smp_processor_id());
+}
+
+
+static void __init mcheck_disable(char *str)
+{
+       mce_disabled = 1;
+}
 
-       BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
-       return &mn->mc;
+static void __init mcheck_enable(char *str)
+{
+       mce_disabled = -1;
 }
 
+custom_param("nomce", mcheck_disable);
+custom_param("mce", mcheck_enable);
 
-void x86_mcinfo_clear(struct mc_info *mi)
+static void mcinfo_clear(struct mc_info *mi)
 {
        memset(mi, 0, sizeof(struct mc_info));
        x86_mcinfo_nentries(mi) = 0;
 }
 
-
 int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
 {
        int i;
@@ -371,7 +621,7 @@ int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
        end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
 
        if (end1 < end2)
-               return -ENOSPC; /* No space. Can't add entry. */
+               return x86_mcerr("mcinfo_add: no more sparc", -ENOSPC);
 
        /* there's enough space. add entry. */
        memcpy(mic_index, mic, mic->size);
@@ -380,7 +630,6 @@ int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
        return 0;
 }
 
-
 /* Dump machine check information in a format,
  * mcelog can parse. This is used only when
  * Dom0 does not take the notification. */
@@ -395,7 +644,7 @@ void x86_mcinfo_dump(struct mc_info *mi)
        if (mic == NULL)
                return;
        mc_global = (struct mcinfo_global *)mic;
-       if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
+       if (mc_global->mc_flags & MC_FLAG_MCE) {
                printk(XENLOG_WARNING
                        "CPU%d: Machine Check Exception: %16"PRIx64"\n",
                        mc_global->mc_coreid, mc_global->mc_gstatus);
@@ -412,10 +661,10 @@ void x86_mcinfo_dump(struct mc_info *mi)
                if (mic == NULL)
                        return;
                if (mic->type != MC_TYPE_BANK)
-                       continue;
+                       goto next;
 
                mc_bank = (struct mcinfo_bank *)mic;
-       
+
                printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
                        mc_bank->mc_bank,
                        mc_bank->mc_status);
@@ -425,13 +674,315 @@ void x86_mcinfo_dump(struct mc_info *mi)
                        printk(" at %16"PRIx64, mc_bank->mc_addr);
 
                printk("\n");
+next:
                mic = x86_mcinfo_next(mic); /* next entry */
                if ((mic == NULL) || (mic->size == 0))
                        break;
        } while (1);
 }
 
+static void do_mc_get_cpu_info(void *v)
+{
+       int cpu = smp_processor_id();
+       int cindex, cpn;
+       struct cpuinfo_x86 *c;
+       xen_mc_logical_cpu_t *log_cpus, *xcp;
+       uint32_t junk, ebx;
+
+       log_cpus = v;
+       c = &cpu_data[cpu];
+       cindex = 0;
+       cpn = cpu - 1;
+
+       /*
+        * Deal with sparse masks, condensed into a contig array.
+        */
+       while (cpn >= 0) {
+               if (cpu_isset(cpn, cpu_online_map))
+                       cindex++;
+               cpn--;
+       }
+
+       xcp = &log_cpus[cindex];
+       c = &cpu_data[cpu];
+       xcp->mc_cpunr = cpu;
+       x86_mc_get_cpu_info(cpu, &xcp->mc_chipid,
+           &xcp->mc_coreid, &xcp->mc_threadid,
+           &xcp->mc_apicid, &xcp->mc_ncores,
+           &xcp->mc_ncores_active, &xcp->mc_nthreads);
+       xcp->mc_cpuid_level = c->cpuid_level;
+       xcp->mc_family = c->x86;
+       xcp->mc_vendor = c->x86_vendor;
+       xcp->mc_model = c->x86_model;
+       xcp->mc_step = c->x86_mask;
+       xcp->mc_cache_size = c->x86_cache_size;
+       xcp->mc_cache_alignment = c->x86_cache_alignment;
+       memcpy(xcp->mc_vendorid, c->x86_vendor_id, sizeof xcp->mc_vendorid);
+       memcpy(xcp->mc_brandid, c->x86_model_id, sizeof xcp->mc_brandid);
+       memcpy(xcp->mc_cpu_caps, c->x86_capability, sizeof xcp->mc_cpu_caps);
+
+       /*
+        * This part needs to run on the CPU itself.
+        */
+       xcp->mc_nmsrvals = __MC_NMSRS;
+       xcp->mc_msrvalues[0].reg = MSR_IA32_MCG_CAP;
+       rdmsrl(MSR_IA32_MCG_CAP, xcp->mc_msrvalues[0].value);
+
+       if (c->cpuid_level >= 1) {
+               cpuid(1, &junk, &ebx, &junk, &junk);
+               xcp->mc_clusterid = (ebx >> 24) & 0xff;
+       } else
+               xcp->mc_clusterid = hard_smp_processor_id();
+}
+
+
+void x86_mc_get_cpu_info(unsigned cpu, uint32_t *chipid, uint16_t *coreid,
+                        uint16_t *threadid, uint32_t *apicid,
+                        unsigned *ncores, unsigned *ncores_active,
+                        unsigned *nthreads)
+{
+       struct cpuinfo_x86 *c;
+
+       *apicid = cpu_physical_id(cpu);
+       c = &cpu_data[cpu];
+       if (c->apicid == BAD_APICID) {
+               *chipid = cpu;
+               *coreid = 0;
+               *threadid = 0;
+               if (ncores != NULL)
+                       *ncores = 1;
+               if (ncores_active != NULL)
+                       *ncores_active = 1;
+               if (nthreads != NULL)
+                       *nthreads = 1;
+       } else {
+               *chipid = phys_proc_id[cpu];
+               if (c->x86_max_cores > 1)
+                       *coreid = cpu_core_id[cpu];
+               else
+                       *coreid = 0;
+               *threadid = c->apicid & ((1 << (c->x86_num_siblings - 1)) - 1);
+               if (ncores != NULL)
+                       *ncores = c->x86_max_cores;
+               if (ncores_active != NULL)
+                       *ncores_active = c->booted_cores;
+               if (nthreads != NULL)
+                       *nthreads = c->x86_num_siblings;
+       }
+}
+
+#define        INTPOSE_NENT    50
+
+static struct intpose_ent {
+       unsigned  int cpu_nr;
+       uint64_t msr;
+       uint64_t val;
+} intpose_arr[INTPOSE_NENT];
+
+static void intpose_init(void)
+{
+       static int done;
+       int i;
+
+       if (done++ > 0)
+               return;
+
+       for (i = 0; i < INTPOSE_NENT; i++) {
+               intpose_arr[i].cpu_nr = -1;
+       }
+
+}
+
+struct intpose_ent *intpose_lookup(unsigned int cpu_nr, uint64_t msr,
+    uint64_t *valp)
+{
+       int i;
+
+       for (i = 0; i < INTPOSE_NENT; i++) {
+               if (intpose_arr[i].cpu_nr == cpu_nr &&
+                   intpose_arr[i].msr == msr) {
+                       if (valp != NULL)
+                               *valp = intpose_arr[i].val;
+                       return &intpose_arr[i];
+               }
+       }
+
+       return NULL;
+}
+
+static void intpose_add(unsigned int cpu_nr, uint64_t msr, uint64_t val)
+{
+       struct intpose_ent *ent;
+       int i;
+
+       if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) {
+               ent->val = val;
+               return;
+       }
+
+       for (i = 0, ent = &intpose_arr[0]; i < INTPOSE_NENT; i++, ent++) {
+               if (ent->cpu_nr == -1) {
+                       ent->cpu_nr = cpu_nr;
+                       ent->msr = msr;
+                       ent->val = val;
+                       return;
+               }
+       }
+
+       printk("intpose_add: interpose array full - request dropped\n");
+}
+
+void intpose_inval(unsigned int cpu_nr, uint64_t msr)
+{
+       struct intpose_ent *ent;
+
+       if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) {
+               ent->cpu_nr = -1;
+       }
+}
+
+#define        IS_MCA_BANKREG(r) \
+    ((r) >= MSR_IA32_MC0_CTL && \
+    (r) <= MSR_IA32_MC0_MISC + (nr_mce_banks - 1) * 4 && \
+    ((r) - MSR_IA32_MC0_CTL) % 4 != 0) /* excludes MCi_CTL */
+
+static int x86_mc_msrinject_verify(struct xen_mc_msrinject *mci)
+{
+       struct cpuinfo_x86 *c;
+       int i, errs = 0;
+
+       c = &cpu_data[smp_processor_id()];
+
+       for (i = 0; i < mci->mcinj_count; i++) {
+               uint64_t reg = mci->mcinj_msr[i].reg;
+               const char *reason = NULL;
+
+               if (IS_MCA_BANKREG(reg)) {
+                       if (c->x86_vendor == X86_VENDOR_AMD) {
+                               /* On AMD we can set MCi_STATUS_WREN in the
+                                * HWCR MSR to allow non-zero writes to banks
+                                * MSRs not to #GP.  The injector in dom0
+                                * should set that bit, but we detect when it
+                                * is necessary and set it as a courtesy to
+                                * avoid #GP in the hypervisor. */
+                               mci->mcinj_flags |=
+                                   _MC_MSRINJ_F_REQ_HWCR_WREN;
+                               continue;
+                       } else {
+                               /* No alternative but to interpose, so require
+                                * that the injector specified as such. */
+                               if (!(mci->mcinj_flags &
+                                   MC_MSRINJ_F_INTERPOSE)) {
+                                       reason = "must specify interposition";
+                               }
+                       }
+               } else {
+                       switch (reg) {
+                       /* MSRs acceptable on all x86 cpus */
+                       case MSR_IA32_MCG_STATUS:
+                               break;
+
+                       /* MSRs that the HV will take care of */
+                       case MSR_K8_HWCR:
+                               if (c->x86_vendor == X86_VENDOR_AMD)
+                                       reason = "HV will operate HWCR";
+                               else
+                                       reason ="only supported on AMD";
+                               break;
+
+                       default:
+                               reason = "not a recognized MCA MSR";
+                               break;
+                       }
+               }
+
+               if (reason != NULL) {
+                       printk("HV MSR INJECT ERROR: MSR 0x%llx %s\n",
+                           (unsigned long long)mci->mcinj_msr[i].reg, reason);
+                       errs++;
+               }
+       }
+
+       return !errs;
+}
+
+static uint64_t x86_mc_hwcr_wren(void)
+{
+       uint64_t old;
+
+       rdmsrl(MSR_K8_HWCR, old);
 
+       if (!(old & K8_HWCR_MCi_STATUS_WREN)) {
+               uint64_t new = old | K8_HWCR_MCi_STATUS_WREN;
+               wrmsrl(MSR_K8_HWCR, new);
+       }
+
+       return old;
+}
+
+static void x86_mc_hwcr_wren_restore(uint64_t hwcr)
+{
+       if (!(hwcr & K8_HWCR_MCi_STATUS_WREN))
+               wrmsrl(MSR_K8_HWCR, hwcr);
+}
+
+static void x86_mc_msrinject(void *data)
+{
+       struct xen_mc_msrinject *mci = data;
+       struct mcinfo_msr *msr;
+       struct cpuinfo_x86 *c;
+       uint64_t hwcr = 0;
+       int intpose;
+       int i;
+
+       c = &cpu_data[smp_processor_id()];
+
+       if (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN)
+               hwcr = x86_mc_hwcr_wren();
+
+       intpose = (mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) != 0;
+
+       for (i = 0, msr = &mci->mcinj_msr[0];
+           i < mci->mcinj_count; i++, msr++) {
+               printk("HV MSR INJECT (%s) target %u actual %u MSR 0x%llx "
+                   "<-- 0x%llx\n",
+                   intpose ?  "interpose" : "hardware",
+                   mci->mcinj_cpunr, smp_processor_id(),
+                   (unsigned long long)msr->reg,
+                   (unsigned long long)msr->value);
+
+               if (intpose)
+                       intpose_add(mci->mcinj_cpunr, msr->reg, msr->value);
+               else
+                       wrmsrl(msr->reg, msr->value);
+       }
+
+       if (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN)
+               x86_mc_hwcr_wren_restore(hwcr);
+}
+
+/*ARGSUSED*/
+static void x86_mc_mceinject(void *data)
+{
+       printk("Simulating #MC on cpu %d\n", smp_processor_id());
+       __asm__ __volatile__("int $0x12");
+}
+
+#if BITS_PER_LONG == 64
+
+#define        ID2COOKIE(id)   ((mctelem_cookie_t)(id))
+#define        COOKIE2ID(c) ((uint64_t)(c))
+
+#elif BITS_PER_LONG == 32
+
+#define        ID2COOKIE(id)   ((mctelem_cookie_t)(uint32_t)((id) & 0xffffffffU))
+#define        COOKIE2ID(c)    ((uint64_t)(uint32_t)(c))
+
+#elif defined(BITS_PER_LONG)
+#error BITS_PER_LONG has unexpected value
+#else
+#error BITS_PER_LONG definition absent
+#endif
 
 /* Machine Check Architecture Hypercall */
 long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
@@ -439,138 +990,188 @@ long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
        long ret = 0;
        struct xen_mc curop, *op = &curop;
        struct vcpu *v = current;
-       struct domain *domU;
        struct xen_mc_fetch *mc_fetch;
-       struct xen_mc_notifydomain *mc_notifydomain;
-       struct mc_info *mi;
-       uint32_t flags;
-       uint32_t fetch_idx;
-        uint16_t vcpuid;
-       /* Use a different lock for the notify hypercall in order to allow
-        * a DomU to fetch mc data while Dom0 notifies another DomU. */
-       static DEFINE_SPINLOCK(mc_lock);
-       static DEFINE_SPINLOCK(mc_notify_lock);
+       struct xen_mc_physcpuinfo *mc_physcpuinfo;
+       uint32_t flags, cmdflags;
+       int nlcpu;
+       xen_mc_logical_cpu_t *log_cpus = NULL;
+       mctelem_cookie_t mctc;
+       mctelem_class_t which;
+       unsigned int target;
+       struct xen_mc_msrinject *mc_msrinject;
+       struct xen_mc_mceinject *mc_mceinject;
 
        if ( copy_from_guest(op, u_xen_mc, 1) )
-               return -EFAULT;
+               return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
 
        if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
-               return -EACCES;
+               return x86_mcerr("do_mca: interface version mismatch", -EACCES);
 
-       switch ( op->cmd ) {
+       switch (op->cmd) {
        case XEN_MC_fetch:
-               /* This hypercall is for any domain */
                mc_fetch = &op->u.mc_fetch;
+               cmdflags = mc_fetch->flags;
+
+               /* This hypercall is for Dom0 only */
+               if (!IS_PRIV(v->domain) )
+                       return x86_mcerr(NULL, -EPERM);
 
-               switch (mc_fetch->flags) {
-               case XEN_MC_CORRECTABLE:
-                       /* But polling mode is Dom0 only, because
-                        * correctable errors are reported to Dom0 only */
-                       if ( !IS_PRIV(v->domain) )
-                               return -EPERM;
+               switch (cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT)) {
+               case XEN_MC_NONURGENT:
+                       which = MC_NONURGENT;
                        break;
 
-               case XEN_MC_TRAP:
+               case XEN_MC_URGENT:
+                       which = MC_URGENT;
                        break;
+
                default:
-                       return -EFAULT;
+                       return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
                }
 
                flags = XEN_MC_OK;
-               spin_lock(&mc_lock);
 
-               if ( IS_PRIV(v->domain) ) {
-                       /* this must be Dom0. So a notify hypercall
-                        * can't have happened before. */
-                       mi = x86_mcinfo_getfetchptr(&fetch_idx, dom0, v);
+               if (cmdflags & XEN_MC_ACK) {
+                       mctelem_cookie_t cookie = ID2COOKIE(mc_fetch->fetch_id);
+                       mctelem_ack(which, cookie);
                } else {
-                       /* Hypercall comes from an unprivileged domain */
-                       domU = v->domain;
-                       if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
-                               /* Dom0 must have notified this DomU before
-                                * via the notify hypercall. */
-                               mi = x86_mcinfo_getnotifiedptr(&fetch_idx, domU, v);
+                       if (guest_handle_is_null(mc_fetch->data))
+                               return x86_mcerr("do_mca fetch: guest buffer "
+                                   "invalid", -EINVAL);
+
+                       if ((mctc = mctelem_consume_oldest_begin(which))) {
+                               struct mc_info *mcip = mctelem_dataptr(mctc);
+                               if (copy_to_guest(mc_fetch->data, mcip, 1)) {
+                                       ret = -EFAULT;
+                                       flags |= XEN_MC_FETCHFAILED;
+                                       mc_fetch->fetch_id = 0;
+                               } else {
+                                       mc_fetch->fetch_id = COOKIE2ID(mctc);
+                               }
+                               mctelem_consume_oldest_end(mctc);
                        } else {
-                               /* Xen notified the DomU. */
-                               mi = x86_mcinfo_getfetchptr(&fetch_idx, domU, v);
+                               /* There is no data */
+                               flags |= XEN_MC_NODATA;
+                               mc_fetch->fetch_id = 0;
                        }
+
+                       mc_fetch->flags = flags;
+                       if (copy_to_guest(u_xen_mc, op, 1) != 0)
+                               ret = -EFAULT;
                }
 
-               if (mi) {
-                       memcpy(&mc_fetch->mc_info, mi,
-                               sizeof(struct mc_info));
-               } else {
-                       /* There is no data for a bogus DomU command. */
-                       flags |= XEN_MC_NODATA;
-                       memset(&mc_fetch->mc_info, 0, sizeof(struct mc_info));
+               break;
+
+       case XEN_MC_notifydomain:
+               return x86_mcerr("do_mca notify unsupported", -EINVAL);
+
+       case XEN_MC_physcpuinfo:
+               if ( !IS_PRIV(v->domain) )
+                       return x86_mcerr("do_mca cpuinfo", -EPERM);
+
+               mc_physcpuinfo = &op->u.mc_physcpuinfo;
+               nlcpu = num_online_cpus();
+
+               if (!guest_handle_is_null(mc_physcpuinfo->info)) {
+                       if (mc_physcpuinfo->ncpus <= 0)
+                               return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
+                                   -EINVAL);
+                       nlcpu = min(nlcpu, (int)mc_physcpuinfo->ncpus);
+                       log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
+                       if (log_cpus == NULL)
+                               return x86_mcerr("do_mca cpuinfo", -ENOMEM);
+
+                       if (on_each_cpu(do_mc_get_cpu_info, log_cpus,
+                           1, 1) != 0) {
+                               xfree(log_cpus);
+                               return x86_mcerr("do_mca cpuinfo", -EIO);
+                       }
                }
 
-               mc_fetch->flags = flags;
-               mc_fetch->fetch_idx = fetch_idx;
+               mc_physcpuinfo->ncpus = nlcpu;
 
-               if ( copy_to_guest(u_xen_mc, op, 1) )
-                       ret = -EFAULT;
+               if (copy_to_guest(u_xen_mc, op, 1)) {
+                       if (log_cpus != NULL)
+                               xfree(log_cpus);
+                       return x86_mcerr("do_mca cpuinfo", -EFAULT);
+               }
 
-               spin_unlock(&mc_lock);
+               if (!guest_handle_is_null(mc_physcpuinfo->info)) {
+                       if (copy_to_guest(mc_physcpuinfo->info,
+                           log_cpus, nlcpu))
+                               ret = -EFAULT;
+                       xfree(log_cpus);
+               }
                break;
 
-       case XEN_MC_notifydomain:
-               /* This hypercall is for Dom0 only */
+       case XEN_MC_msrinject:
                if ( !IS_PRIV(v->domain) )
-                       return -EPERM;
+                       return x86_mcerr("do_mca inject", -EPERM);
 
-               spin_lock(&mc_notify_lock);
+               if (nr_mce_banks == 0)
+                       return x86_mcerr("do_mca inject", -ENODEV);
 
-               mc_notifydomain = &op->u.mc_notifydomain;
-               domU = get_domain_by_id(mc_notifydomain->mc_domid);
-               vcpuid = mc_notifydomain->mc_vcpuid;
+               mc_msrinject = &op->u.mc_msrinject;
+               target = mc_msrinject->mcinj_cpunr;
 
-               if ((domU == NULL) || (domU == dom0)) {
-                       /* It's not possible to notify a non-existent domain
-                        * or the dom0. */
-                       spin_unlock(&mc_notify_lock);
-                       return -EACCES;
-               }
+               if (target >= NR_CPUS)
+                       return x86_mcerr("do_mca inject: bad target", -EINVAL);
 
-               if (vcpuid >= MAX_VIRT_CPUS) {
-                       /* It's not possible to notify a vcpu, Xen can't
-                        * assign to a domain. */
-                       spin_unlock(&mc_notify_lock);
-                       return -EACCES;
-               }
+               if (!cpu_isset(target, cpu_online_map))
+                       return x86_mcerr("do_mca inject: target offline",
+                           -EINVAL);
 
-               mc_notifydomain->flags = XEN_MC_OK;
-
-               mi = &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx));
-               if (!x86_mcinfo_matches_guest(mi, domU, domU->vcpu[vcpuid])) {
-                       /* The error telemetry is not for the guest, Dom0
-                        * wants to notify. */
-                       mc_notifydomain->flags |= XEN_MC_NOMATCH;
-               } else if ( guest_has_trap_callback(domU, vcpuid,
-                                               TRAP_machine_check) )
-               {
-                       /* Send notification */
-                       if ( send_guest_trap(domU, vcpuid, TRAP_machine_check) )
-                               mc_notifydomain->flags |= XEN_MC_NOTDELIVERED;
-               } else
-                       mc_notifydomain->flags |= XEN_MC_CANNOTHANDLE;
-
-#ifdef DEBUG
-               /* sanity check - these two flags are mutually exclusive */
-               if ((flags & XEN_MC_CANNOTHANDLE) && (flags & XEN_MC_NOTDELIVERED))
-                       BUG();
-#endif
+               if (mc_msrinject->mcinj_count == 0)
+                       return 0;
 
-               if ( copy_to_guest(u_xen_mc, op, 1) )
-                       ret = -EFAULT;
+               if (!x86_mc_msrinject_verify(mc_msrinject))
+                       return x86_mcerr("do_mca inject: illegal MSR", -EINVAL);
 
-               if (ret == 0) {
-                       x86_mcinfo_marknotified(mc_notifydomain);
-               }
+               add_taint(TAINT_ERROR_INJECT);
+
+               on_selected_cpus(cpumask_of_cpu(target),
+                   x86_mc_msrinject, mc_msrinject, 1, 1);
+
+               break;
+
+       case XEN_MC_mceinject:
+               if ( !IS_PRIV(v->domain) )
+                       return x86_mcerr("do_mca #MC", -EPERM);
+
+               if (nr_mce_banks == 0)
+                       return x86_mcerr("do_mca #MC", -ENODEV);
+
+               mc_mceinject = &op->u.mc_mceinject;
+               target = mc_mceinject->mceinj_cpunr;
+
+               if (target >= NR_CPUS)
+                       return x86_mcerr("do_mca #MC: bad target", -EINVAL);
+                      
+               if (!cpu_isset(target, cpu_online_map))
+                       return x86_mcerr("do_mca #MC: target offline", -EINVAL);
+
+               add_taint(TAINT_ERROR_INJECT);
+
+               on_selected_cpus(cpumask_of_cpu(target),
+                   x86_mc_mceinject, mc_mceinject, 1, 1);
 
-               spin_unlock(&mc_notify_lock);
                break;
+
+       default:
+               return x86_mcerr("do_mca: bad command", -EINVAL);
        }
 
        return ret;
 }
+
/*
 * Terminal path for an unrecoverable machine check: switch the console
 * to synchronous output so the diagnostic is not lost, print the
 * reason string, then bring the machine down via panic() (the printed
 * message states the machine will reboot).  Does not return.
 */
void mc_panic(char *s)
{
    console_start_sync();
    printk("Fatal machine check: %s\n", s);
    printk("\n"
           "****************************************\n"
           "\n"
           "   The processor has reported a hardware error which cannot\n"
           "   be recovered from.  Xen will now reboot the machine.\n");
    panic("HARDWARE ERROR");
}
index b021f5bb010a80fb931c791d3e9cd81e016bee76..2bd6f023d6338011f391c3d08131e1ad63fea1c4 100644 (file)
+#ifndef _MCE_H
+
+#define _MCE_H
+
 #include <xen/init.h>
+#include <xen/smp.h>
+#include <asm/types.h>
 #include <asm/traps.h>
+#include <asm/atomic.h>
+#include <asm/percpu.h>
+
+#include "x86_mca.h"
+#include "mctelem.h"
 
 /* Init functions */
-void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
-void amd_k7_mcheck_init(struct cpuinfo_x86 *c);
-void amd_k8_mcheck_init(struct cpuinfo_x86 *c);
-void amd_f10_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
-void winchip_mcheck_init(struct cpuinfo_x86 *c);
-
-/* Function pointer used in the handlers to collect additional information
- * provided by newer CPU families/models without the need to duplicate
- * the whole handler resulting in various handlers each with its own
- * tweaks and bugs */
-extern int (*mc_callback_bank_extended)(struct mc_info *mi,
-               uint16_t bank, uint64_t status);
+int amd_k7_mcheck_init(struct cpuinfo_x86 *c);
+int amd_k8_mcheck_init(struct cpuinfo_x86 *c);
+int amd_f10_mcheck_init(struct cpuinfo_x86 *c);
+
+int intel_p5_mcheck_init(struct cpuinfo_x86 *c);
+int winchip_mcheck_init(struct cpuinfo_x86 *c);
+int intel_mcheck_init(struct cpuinfo_x86 *c);
 
+void intel_mcheck_timer(struct cpuinfo_x86 *c);
+void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
 
+int mce_available(struct cpuinfo_x86 *c);
+int mce_firstbank(struct cpuinfo_x86 *c);
 /* Helper functions used for collecting error telemetry */
 struct mc_info *x86_mcinfo_getptr(void);
-void x86_mcinfo_clear(struct mc_info *mi);
+void mc_panic(char *s);
+void x86_mc_get_cpu_info(unsigned, uint32_t *, uint16_t *, uint16_t *,
+                        uint32_t *, uint32_t *, uint32_t *, uint32_t *);
+
+
+/* Register a handler for machine check exceptions. */
+typedef void (*x86_mce_vector_t)(struct cpu_user_regs *, long);
+extern void x86_mce_vector_register(x86_mce_vector_t);
+
+/* Common generic MCE handler that implementations may nominate
+ * via x86_mce_vector_register. */
+extern void mcheck_cmn_handler(struct cpu_user_regs *, long, cpu_banks_t);
+
+/* Read an MSR, checking for an interposed value first */
+extern struct intpose_ent *intpose_lookup(unsigned int, uint64_t,
+    uint64_t *);
+extern void intpose_inval(unsigned int, uint64_t);
+
+#define mca_rdmsrl(msr, var) do { \
+       if (intpose_lookup(smp_processor_id(), msr, &var) == NULL) \
+               rdmsrl(msr, var); \
+} while (0)
+
+/* Write an MSR, invalidating any interposed value */
+#define        mca_wrmsrl(msr, val) do { \
+       intpose_inval(smp_processor_id(), msr); \
+       wrmsrl(msr, val); \
+} while (0)
+
+
+/* Utility function to "logout" all architectural MCA telemetry from the MCA
+ * banks of the current processor.  A cookie is returned which may be
+ * uses to reference the data so logged (the cookie can be NULL if
+ * no logout structures were available).  The caller can also pass a pointer
+ * to a structure which will be completed with some summary information
+ * of the MCA data observed in the logout operation. */
+
+enum mca_source {
+       MCA_MCE_HANDLER,
+       MCA_POLLER,
+       MCA_CMCI_HANDLER,
+       MCA_RESET
+};
+
+enum mca_extinfo {
+       MCA_EXTINFO_LOCAL,
+       MCA_EXTINFO_GLOBAL,
+       MCA_EXTINFO_IGNORED
+};
+
+struct mca_summary {
+       uint32_t        errcnt; /* number of banks with valid errors */
+       int             ripv;   /* meaningful on #MC */
+       int             eipv;   /* meaningful on #MC */
+       uint32_t        uc;     /* bitmask of banks with UC */
+       uint32_t        pcc;    /* bitmask of banks with PCC */
+};
+
+extern cpu_banks_t mca_allbanks;
+
+extern mctelem_cookie_t mcheck_mca_logout(enum mca_source, cpu_banks_t,
+    struct mca_summary *);
+
+/* Register a callback to be made during bank telemetry logout.
+ * This callback is only available to those machine check handlers
+ * that call to the common mcheck_cmn_handler or who use the common
+ * telemetry logout function mcheck_mca_logout in error polling.
+ *
+ * This can be used to collect additional information (typically non-
+ * architectural) provided by newer CPU families/models without the need
+ * to duplicate the whole handler resulting in various handlers each with
+ * its own tweaks and bugs.  The callback receives an struct mc_info pointer
+ * which it can use with x86_mcinfo_add to add additional telemetry,
+ * the current MCA bank number we are reading telemetry from, and the
+ * MCi_STATUS value for that bank.
+ */
+typedef enum mca_extinfo (*x86_mce_callback_t)
+    (struct mc_info *, uint16_t, uint64_t);
+extern void x86_mce_callback_register(x86_mce_callback_t);
+
 int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
 void x86_mcinfo_dump(struct mc_info *mi);
 
-/* Global variables */
-extern int mce_disabled;
-extern unsigned int nr_mce_banks;
+#endif /* _MCE_H */
diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c
new file mode 100644 (file)
index 0000000..fb04a5b
--- /dev/null
@@ -0,0 +1,1075 @@
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/irq.h>
+#include <xen/event.h>
+#include <xen/kernel.h>
+#include <xen/delay.h>
+#include <xen/smp.h>
+#include <asm/processor.h> 
+#include <asm/system.h>
+#include <asm/msr.h>
+#include "mce.h"
+#include "x86_mca.h"
+
/* Per-CPU bitmap of MCA banks this CPU owns — presumably for CMCI
 * bank ownership arbitration; the claiming logic is not visible here. */
DEFINE_PER_CPU(cpu_banks_t, mce_banks_owned);

static int nr_intel_ext_msrs = 0; /* number of extended MCG_* state MSRs;
                                   * 0 disables intel_get_extended_msrs().
                                   * Set during init (not visible here). */
static int cmci_support = 0;      /* nonzero when CMCI is available —
                                   * TODO confirm where this is set */
static int firstbank;             /* first MCA bank index to handle —
                                   * assumption: bank 0 may be reserved
                                   * on some CPUs; confirm at init site */
+
+#ifdef CONFIG_X86_MCE_THERMAL
/* Fallback LVT thermal handler: fires if a thermal interrupt arrives
 * before intel_init_thermal() installed the real handler.  Log the
 * event and taint the hypervisor. */
static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
{
    printk(KERN_ERR "Thermal: CPU%d: Unexpected LVT TMR interrupt!\n",
                smp_processor_id());
    add_taint(TAINT_MACHINE_CHECK);
}
+
/* P4/Xeon Thermal transition interrupt handler.
 * Reads IA32_THERM_STATUS and logs whether the CPU is above the
 * thermal threshold (and hence clock-modulating) or back to normal. */
static void intel_thermal_interrupt(struct cpu_user_regs *regs)
{
    u32 l, h;
    unsigned int cpu = smp_processor_id();
    static s_time_t next[NR_CPUS];  /* per-CPU earliest time to log again */

    ack_APIC_irq();
    /* Rate-limit logging to once per 5 seconds per CPU. */
    if (NOW() < next[cpu])
        return;

    next[cpu] = NOW() + MILLISECS(5000);
    rdmsr(MSR_IA32_THERM_STATUS, l, h);
    /* Bit 0 of IA32_THERM_STATUS: temperature above threshold. */
    if (l & 0x1) {
        printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
        printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
                cpu);
        add_taint(TAINT_MACHINE_CHECK);
    } else {
        printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
    }
}
+
/* Thermal interrupt handler for this CPU setup.  Defaults to the
 * "unexpected" logger; intel_init_thermal() replaces it once the
 * hardware is configured. */
static void (*vendor_thermal_interrupt)(struct cpu_user_regs *regs) 
        = unexpected_thermal_interrupt;

/* Entry point for the LVT thermal vector: dispatch to whichever
 * handler is currently installed, inside irq_enter/irq_exit. */
fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs)
{
    irq_enter();
    vendor_thermal_interrupt(regs);
    irq_exit();
}
+
/* P4/Xeon Thermal regulation detect and init.
 * Probes for thermal-monitor support and, unless SMM firmware already
 * owns the thermal LVT, installs intel_thermal_interrupt as this CPU's
 * thermal handler and enables hardware thermal monitoring.
 * The ordering below matters: the LVT entry is programmed masked,
 * the handler pointer is swapped in, monitoring is enabled, and only
 * then is the LVT entry unmasked. */
static void intel_init_thermal(struct cpuinfo_x86 *c)
{
    u32 l, h;
    int tm2 = 0;
    unsigned int cpu = smp_processor_id();

    /* Thermal monitoring */
    if (!cpu_has(c, X86_FEATURE_ACPI))
        return; /* -ENODEV */

    /* Clock modulation */
    if (!cpu_has(c, X86_FEATURE_ACC))
        return; /* -ENODEV */

    /* first check if its enabled already, in which case there might
     * be some SMM goo which handles it, so we can't even put a handler
     * since it might be delivered via SMI already -zwanem.
     */
    rdmsr (MSR_IA32_MISC_ENABLE, l, h);
    h = apic_read(APIC_LVTTHMR);
    if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
        printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",cpu);
        return; /* -EBUSY */
    }

    /* MISC_ENABLE bit 13 together with the TM2 feature flag —
     * presumably selects TM2 mode; only used for the log line below. */
    if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
        tm2 = 1;

    /* check whether a vector already exists, temporarily masked? */
    if (h & APIC_VECTOR_MASK) {
        printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already installed\n",
                 cpu, (h & APIC_VECTOR_MASK));
        return; /* -EBUSY */
    }

    /* The temperature transition interrupt handler setup */
    h = THERMAL_APIC_VECTOR;    /* our delivery vector */
    h |= (APIC_DM_FIXED | APIC_LVT_MASKED);  /* we'll mask till we're ready */
    apic_write_around(APIC_LVTTHMR, h);

    /* Enable the low two interrupt-enable bits of THERM_INTERRUPT. */
    rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
    wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);

    /* ok we're good to go... */
    vendor_thermal_interrupt = intel_thermal_interrupt;

    /* Turn on the thermal monitor (MISC_ENABLE bit 3). */
    rdmsr (MSR_IA32_MISC_ENABLE, l, h);
    wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);

    /* Finally unmask the thermal LVT entry. */
    l = apic_read (APIC_LVTTHMR);
    apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
    printk (KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 
            cpu, tm2 ? "TM2" : "TM1");
    return;
}
+#endif /* CONFIG_X86_MCE_THERMAL */
+
+static enum mca_extinfo
+intel_get_extended_msrs(struct mc_info *mci, uint16_t bank, uint64_t status)
+{
+    struct mcinfo_extended mc_ext;
+
+    if (mci == NULL || nr_intel_ext_msrs == 0 || !(status & MCG_STATUS_EIPV))
+        return MCA_EXTINFO_IGNORED;
+
+    /* this function will called when CAP(9).MCG_EXT_P = 1 */
+    memset(&mc_ext, 0, sizeof(struct mcinfo_extended));
+    mc_ext.common.type = MC_TYPE_EXTENDED;
+    mc_ext.common.size = sizeof(mc_ext);
+    mc_ext.mc_msrs = 10;
+
+    mc_ext.mc_msr[0].reg = MSR_IA32_MCG_EAX;
+    rdmsrl(MSR_IA32_MCG_EAX, mc_ext.mc_msr[0].value);
+    mc_ext.mc_msr[1].reg = MSR_IA32_MCG_EBX;
+    rdmsrl(MSR_IA32_MCG_EBX, mc_ext.mc_msr[1].value);
+    mc_ext.mc_msr[2].reg = MSR_IA32_MCG_ECX;
+    rdmsrl(MSR_IA32_MCG_ECX, mc_ext.mc_msr[2].value);
+
+    mc_ext.mc_msr[3].reg = MSR_IA32_MCG_EDX;
+    rdmsrl(MSR_IA32_MCG_EDX, mc_ext.mc_msr[3].value);
+    mc_ext.mc_msr[4].reg = MSR_IA32_MCG_ESI;
+    rdmsrl(MSR_IA32_MCG_ESI, mc_ext.mc_msr[4].value);
+    mc_ext.mc_msr[5].reg = MSR_IA32_MCG_EDI;
+    rdmsrl(MSR_IA32_MCG_EDI, mc_ext.mc_msr[5].value);
+
+    mc_ext.mc_msr[6].reg = MSR_IA32_MCG_EBP;
+    rdmsrl(MSR_IA32_MCG_EBP, mc_ext.mc_msr[6].value);
+    mc_ext.mc_msr[7].reg = MSR_IA32_MCG_ESP;
+    rdmsrl(MSR_IA32_MCG_ESP, mc_ext.mc_msr[7].value);
+    mc_ext.mc_msr[8].reg = MSR_IA32_MCG_EFLAGS;
+    rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext.mc_msr[8].value);
+    mc_ext.mc_msr[9].reg = MSR_IA32_MCG_EIP;
+    rdmsrl(MSR_IA32_MCG_EIP, mc_ext.mc_msr[9].value);
+
+    x86_mcinfo_add(mci, &mc_ext);
+
+    return MCA_EXTINFO_GLOBAL;
+}
+
/* Below are for MCE handling */

/* Worst error severity seen so far in this MCE# event and the CPU
 * that saw it; that CPU is picked to do the processing in softirq. */
static int severity_cpu = -1;
static int worst = 0;

/* Rendezvous: CPUs that completed the first (severity) scan in the
 * MCE# handler */
static cpumask_t scanned_cpus;
/* Entry "lock" for the critical section in the MCE# handler */
static bool_t mce_enter_lock = 0;
/* Set of CPUs impacted by this MCE# */
static cpumask_t impact_map;

/* Rendezvous: CPUs that have entered the softirq handler */
static cpumask_t mced_cpus;
/* Rendezvous: CPUs that have left the softirq handler */
static cpumask_t finished_cpus;
/* "Lock" for picking the single processing CPU */
static bool_t mce_process_lock = 0;

/* Spinlock for vMCE# MSR virtualization data */
static DEFINE_SPINLOCK(mce_locks);

/* Local buffer holding MCE# data temporarily, shared between the MCE
 * handler and the softirq handler.  The data is finally committed for
 * the Dom0 log and copied to per-domain data for guest vMCE# MSR
 * virtualization.
 * Note: if the local buffer is still being processed in softirq when
 * another MCA arrives, we simply panic (see the in_use check in the
 * #MC handler).
 */

struct mc_local_t
{
    bool_t in_use;                   /* set while softirq consumes the data */
    mctelem_cookie_t mctc[NR_CPUS];  /* per-CPU reserved telemetry entry */
};
static struct mc_local_t mc_local;
+
+/* This node list records errors impacting a domain. when one
+ * MCE# happens, one error bank impacts a domain. This error node
+ * will be inserted to the tail of the per_dom data for vMCE# MSR
+ * virtualization. When one vMCE# injection is finished processing
+ * processed by guest, the corresponding node will be deleted. 
+ * This node list is for GUEST vMCE# MSRS virtualization.
+ */
+static struct bank_entry* alloc_bank_entry(void) {
+    struct bank_entry *entry;
+
+    entry = xmalloc(struct bank_entry);
+    if (!entry) {
+        printk(KERN_ERR "MCE: malloc bank_entry failed\n");
+        return NULL;
+    }
+    memset(entry, 0x0, sizeof(entry));
+    INIT_LIST_HEAD(&entry->list);
+    return entry;
+}
+
+/* Fill error bank info for #vMCE injection and GUEST vMCE#
+ * MSR virtualization data
+ * 1) Log down how many nr_injections of the impacted.
+ * 2) Copy MCE# error bank to impacted DOM node list, 
+      for vMCE# MSRs virtualization
+*/
+
+static int fill_vmsr_data(int cpu, struct mcinfo_bank *mc_bank, 
+        uint64_t gstatus) {
+    struct domain *d;
+    struct bank_entry *entry;
+
+    /* This error bank impacts one domain, we need to fill domain related
+     * data for vMCE MSRs virtualization and vMCE# injection */
+    if (mc_bank->mc_domid != (uint16_t)~0) {
+        d = get_domain_by_id(mc_bank->mc_domid);
+
+        /* Not impact a valid domain, skip this error of the bank */
+        if (!d) {
+            printk(KERN_DEBUG "MCE: Not found valid impacted DOM\n");
+            return 0;
+        }
+
+        entry = alloc_bank_entry();
+        entry->mci_status = mc_bank->mc_status;
+        entry->mci_addr = mc_bank->mc_addr;
+        entry->mci_misc = mc_bank->mc_misc;
+        entry->cpu = cpu;
+        entry->bank = mc_bank->mc_bank;
+
+        /* New error Node, insert to the tail of the per_dom data */
+        list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header);
+        /* Fill MSR global status */
+        d->arch.vmca_msrs.mcg_status = gstatus;
+        /* New node impact the domain, need another vMCE# injection*/
+        d->arch.vmca_msrs.nr_injection++;
+
+        printk(KERN_DEBUG "MCE: Found error @[CPU%d BANK%d "
+                "status %"PRIx64" addr %"PRIx64" domid %d]\n ",
+                entry->cpu, mc_bank->mc_bank,
+                mc_bank->mc_status, mc_bank->mc_addr, mc_bank->mc_domid);
+    }
+    return 0;
+}
+
/* Walk every impacted CPU's reserved telemetry entry and, for each
 * error bank found there, fill the per-domain vMCE# injection / MSR
 * virtualization data.  All reserved entries are committed (made
 * visible for Dom0 fetch) on exit, even on error.  Returns 0 on
 * success, -1 on failure.  Runs in softirq context on the CPU chosen
 * by severity. */
static int mce_actions(void) {
    int32_t cpu, ret;
    struct mc_info *local_mi;
    struct mcinfo_common *mic = NULL;
    struct mcinfo_global *mc_global;
    struct mcinfo_bank *mc_bank;

    /* Spinlock is used for exclusive read/write of vMSR virtualization
     * (per_dom vMCE# data)
     */
    spin_lock(&mce_locks);

    /*
     * Mark the local buffer busy.  NOTE(review): the return value of
     * test_and_set_bool() is ignored here; the "simply panic on
     * overlap" policy is actually enforced in the #MC handler, which
     * panics when it finds in_use already set.
     */
    test_and_set_bool(mc_local.in_use);

    for_each_cpu_mask(cpu, impact_map) {
        if (mc_local.mctc[cpu] == NULL) {
            printk(KERN_ERR "MCE: get reserved entry failed\n ");
            ret = -1;
            goto end;
        }
        local_mi = (struct mc_info*)mctelem_dataptr(mc_local.mctc[cpu]);
        /* Locate the global record first; it carries mc_gstatus. */
        x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
        if (mic == NULL) {
            printk(KERN_ERR "MCE: get local buffer entry failed\n ");
            ret = -1;
            goto end;
        }

        mc_global = (struct mcinfo_global *)mic;

        /* Processing bank information */
        x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);

        for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
            if (mic->type != MC_TYPE_BANK) {
                continue;
            }
            mc_bank = (struct mcinfo_bank*)mic;
            /* Fill vMCE# injection and vMCE# MSR virtualization related data */
            if (fill_vmsr_data(cpu, mc_bank, mc_global->mc_gstatus) == -1) {
                ret = -1;
                goto end;
            }

            /* TODO: Add recovery actions here, such as page-offline, etc */
        }
    } /* end of impact_map loop */

    ret = 0;

end:

    for_each_cpu_mask(cpu, impact_map) {
        /* This reserved entry is processed, commit it */
        if (mc_local.mctc[cpu] != NULL) {
            mctelem_commit(mc_local.mctc[cpu]);
            printk(KERN_DEBUG "MCE: Commit one URGENT ENTRY\n");
        }
    }

    test_and_clear_bool(mc_local.in_use);
    spin_unlock(&mce_locks);
    return ret;
}
+
/* Softirq Handler for this MCE# processing.
 * All online CPUs rendezvous here (spin-wait on mced_cpus); the CPU
 * that recorded the worst severity does all the work (log, notify
 * Dom0, inject vMCE, reset the per-event state), then everyone
 * rendezvous again on finished_cpus before leaving. */
static void mce_softirq(void)
{
    int cpu = smp_processor_id();
    cpumask_t affinity;

    /* Wait until all cpus entered softirq */
    while ( cpus_weight(mced_cpus) != num_online_cpus() ) {
        cpu_relax();
    }
    /* Not Found worst error on severity_cpu, it's weird */
    if (severity_cpu == -1) {
        printk(KERN_WARNING "MCE: not found severity_cpu!\n");
        mc_panic("MCE: not found severity_cpu!");
        return;
    }
    /* We choose severity_cpu for further processing */
    if (severity_cpu == cpu) {

        /* Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
         * vMCE MSRs virtualization buffer
         */
        if (mce_actions())
            mc_panic("MCE recovery actions or Filling vMCE MSRS "
                     "virtualization data failed!\n");

        /* Step2: Send Log to DOM0 through vIRQ */
        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
            printk(KERN_DEBUG "MCE: send MCE# to DOM0 through virq\n");
            send_guest_global_virq(dom0, VIRQ_MCA);
        }

        /* Step3: Inject vMCE to impacted DOM.  Only Dom0 is handled
         * here: pin vcpu0 to this CPU so it takes the #MC locally.
         * The old affinity is saved in cpu_affinity_tmp — presumably
         * restored after the injection completes (not visible here). */
        if (guest_has_trap_callback
               (dom0, 0, TRAP_machine_check) &&
                 !test_and_set_bool(dom0->vcpu[0]->mce_pending)) {
            dom0->vcpu[0]->cpu_affinity_tmp = 
                    dom0->vcpu[0]->cpu_affinity;
            cpus_clear(affinity);
            cpu_set(cpu, affinity);
            printk(KERN_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu,
                dom0->vcpu[0]->processor);
            vcpu_set_affinity(dom0->vcpu[0], &affinity);
            vcpu_kick(dom0->vcpu[0]);
        }

        /* Clean Data: reset all per-event state for the next MCE# */
        test_and_clear_bool(mce_process_lock);
        cpus_clear(impact_map);
        cpus_clear(scanned_cpus);
        worst = 0;
        cpus_clear(mced_cpus);
        memset(&mc_local, 0x0, sizeof(mc_local));
    }

    cpu_set(cpu, finished_cpus);
    wmb();
    /* Leave only when all cpus finished recovery actions in softirq */
    while ( cpus_weight(finished_cpus) != num_online_cpus() ) {
        cpu_relax();
    }

    cpus_clear(finished_cpus);
    severity_cpu = -1;
    printk(KERN_DEBUG "CPU%d exit softirq \n", cpu);
}
+
/* Machine-check ownership algorithm:
 * When an error happens, all CPUs serially read their MSR banks.
 * The first CPU to fetch an error bank's info clears that bank, so
 * later readers cannot see the info again; that first CPU is the
 * actual MCE owner.
 *
 * A fatal (pcc=1) error might crash the machine before we are able to
 * log it.  To avoid losing the log, we scan in two rounds:
 * Round 1: simply scan; if pcc = 1 (or ripv = 0) is found, reset
 * immediately.  MCE banks are sticky, so after reboot the MCE polling
 * mechanism will collect and log those errors.
 * Round 2: do all the normal MCE processing logic.
 */
+
+/* Round-1 simple scan: walk every MCA bank's STATUS MSR and panic
+ * immediately on a valid, enabled, uncorrected error with PCC=1
+ * (processor context corrupt), before any logging is attempted.
+ * The sticky bank contents survive the reset and are collected by the
+ * polling code on the next boot, so panicking early avoids losing logs.
+ */
+static void severity_scan(void)
+{
+    uint64_t status;
+    int32_t i;
+
+    /* TODO: for PCC = 0 we need further judgement; if the error can't
+     * be recovered we need to RESET to avoid losing the DOM0 log.
+     */
+    for ( i = 0; i < nr_mce_banks; i++) {
+        rdmsrl(MSR_IA32_MC0_STATUS + 4 * i , status);
+        if ( !(status & MCi_STATUS_VAL) )
+            continue;
+        /* MCE handler only handles UC error */
+        if ( !(status & MCi_STATUS_UC) )
+            continue;
+        if ( !(status & MCi_STATUS_EN) )
+            continue;
+        if (status & MCi_STATUS_PCC)
+            mc_panic("pcc = 1, cpu unable to continue\n");
+    }
+
+    /* TODO: further judgement for later CPUs here, maybe with MCACOD assistance */
+    /* EIPV and RIPV are not a reliable way to judge the error severity */
+
+}
+
+
+/* #MC exception handler (registered via x86_mce_vector_register).
+ * Every online CPU takes the exception and they rendezvous in lock-step:
+ *  1) round-1 severity scan (panic on fatal errors before logging),
+ *  2) serialized per-CPU bank logout under mce_enter_lock,
+ *  3) barrier on mced_cpus, then one CPU (mce_process_lock winner)
+ *     clears MCG_STATUS.MCIP,
+ *  4) raise MACHINE_CHECK_SOFTIRQ for the deferred recovery actions.
+ */
+static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
+{
+    unsigned int cpu = smp_processor_id();
+    int32_t severity = 0;
+    uint64_t gstatus;
+    mctelem_cookie_t mctc = NULL;
+    struct mca_summary bs;
+
+    /* First round scanning */
+    severity_scan();
+    cpu_set(cpu, scanned_cpus);
+    while (cpus_weight(scanned_cpus) < num_online_cpus())
+        cpu_relax();
+
+    wmb();
+    /* All CPUs Finished first round scanning */
+    if (mc_local.in_use != 0) {
+        /* mc_panic does not return; the return below is unreachable. */
+        mc_panic("MCE: Local buffer is being processed, can't handle new MCE!\n");
+        return;
+    }
+
+    /* Enter Critical Section (hand-rolled spinlock on a bool) */
+    while (test_and_set_bool(mce_enter_lock)) {
+        udelay (1);
+    }
+
+    mctc = mcheck_mca_logout(MCA_MCE_HANDLER, mca_allbanks, &bs);
+     /* local data point to the reserved entry, let softirq to
+      * process the local data */
+    if (!bs.errcnt) {
+        /* Nothing logged on this CPU: release any reserved entry and
+         * still join the rendezvous so the others can make progress. */
+        if (mctc != NULL)
+            mctelem_dismiss(mctc);
+        mc_local.mctc[cpu] = NULL;
+        cpu_set(cpu, mced_cpus);
+        test_and_clear_bool(mce_enter_lock);
+        raise_softirq(MACHINE_CHECK_SOFTIRQ);
+        return;
+    }
+    else if ( mctc != NULL) {
+        mc_local.mctc[cpu] = mctc;
+    }
+
+    if (bs.uc || bs.pcc)
+        add_taint(TAINT_MACHINE_CHECK);
+
+    /* Severity: 3 = fatal (pcc), 2 = uncorrected, 1 = correctable. */
+    if (bs.pcc) {
+        printk(KERN_WARNING "PCC=1 should have caused reset\n");
+        severity = 3;
+    }
+    else if (bs.uc) {
+        severity = 2;
+    }
+    else {
+        printk(KERN_WARNING "We should skip Correctable Error\n");
+        severity = 1;
+    }
+    /* This is the offending cpu! */
+    cpu_set(cpu, impact_map);
+
+    /* Track worst severity seen and which CPU saw it (read in softirq). */
+    if ( severity > worst) {
+        worst = severity;
+        severity_cpu = cpu;
+    }
+    cpu_set(cpu, mced_cpus);
+    test_and_clear_bool(mce_enter_lock);
+    wmb();
+
+    /* Wait for all cpus Leave Critical */
+    while (cpus_weight(mced_cpus) < num_online_cpus())
+        cpu_relax();
+    /* Print MCE error */
+    /* NOTE(review): mctc can still be NULL here when bs.errcnt != 0 but
+     * telemetry reservation failed (mcheck_mca_logout returned NULL);
+     * mctelem_dataptr(NULL) would then dereference NULL - confirm and
+     * guard if so. */
+    x86_mcinfo_dump(mctelem_dataptr(mctc));
+
+    /* Pick one CPU to clear MCIP */
+    if (!test_and_set_bool(mce_process_lock)) {
+        rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+        wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
+
+        if (worst >= 3) {
+            printk(KERN_WARNING "worst=3 should have caused RESET\n");
+            mc_panic("worst=3 should have caused RESET");
+        }
+        else {
+            printk(KERN_DEBUG "MCE: trying to recover\n");
+        }
+    }
+    raise_softirq(MACHINE_CHECK_SOFTIRQ);
+}
+
+/* Serializes CMCI ownership discovery across CPUs. */
+static DEFINE_SPINLOCK(cmci_discover_lock);
+/* Per-CPU mask of banks that turned out not to support CMCI. */
+static DEFINE_PER_CPU(cpu_banks_t, no_cmci_banks);
+
+/*
+ * Discover bank sharing using the algorithm recommended in the SDM.
+ * Try to claim ownership of bank i for this CPU by setting CMCI_EN in
+ * its CTL2 MSR; a bank already owned by another CPU (CMCI_EN already
+ * set) or one where CMCI_EN does not stick (no CMCI support) is not
+ * claimed.  Returns 1 if the bank is CMCI-capable, 0 otherwise.
+ */
+static int do_cmci_discover(int i)
+{
+    unsigned msr = MSR_IA32_MC0_CTL2 + i;
+    u64 val;
+
+    rdmsrl(msr, val);
+    /* Some other CPU already owns this bank. */
+    if (val & CMCI_EN) {
+        clear_bit(i, __get_cpu_var(mce_banks_owned));
+        goto out;
+    }
+    wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD);
+    rdmsrl(msr, val);
+
+    if (!(val & CMCI_EN)) {
+        /* This bank does not support CMCI. Polling timer has to handle it. */
+        set_bit(i, __get_cpu_var(no_cmci_banks));
+        return 0;
+    }
+    set_bit(i, __get_cpu_var(mce_banks_owned));
+out:
+    clear_bit(i, __get_cpu_var(no_cmci_banks));
+    return 1;
+}
+
+/* Run do_cmci_discover() for every bank this CPU does not already own
+ * (under cmci_discover_lock), then drain any CMCI events that raced
+ * with the ownership change so the interrupt can fire again.
+ */
+static void cmci_discover(void)
+{
+    unsigned long flags;
+    int i;
+    mctelem_cookie_t mctc;
+    struct mca_summary bs;
+
+    printk(KERN_DEBUG "CMCI: find owner on CPU%d\n", smp_processor_id());
+
+    spin_lock_irqsave(&cmci_discover_lock, flags);
+
+    for (i = 0; i < nr_mce_banks; i++)
+        if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+            do_cmci_discover(i);
+
+    spin_unlock_irqrestore(&cmci_discover_lock, flags);
+
+    /* In case a CMCI happened during the owner change.
+     * If a CMCI happened but was not processed immediately,
+     * MCi_status (error_count bits 38~52) is not cleared and
+     * the CMCI interrupt will never be triggered again.
+     */
+
+    mctc = mcheck_mca_logout(
+        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs);
+
+    if (bs.errcnt && mctc != NULL) {
+        /* Hand the telemetry to dom0 via VIRQ_MCA if it listens,
+         * otherwise just dump it to the console and drop it. */
+        if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+            mctelem_commit(mctc);
+            send_guest_global_virq(dom0, VIRQ_MCA);
+        } else {
+            x86_mcinfo_dump(mctelem_dataptr(mctc));
+            mctelem_dismiss(mctc);
+        }
+    } else if (mctc != NULL)
+        mctelem_dismiss(mctc);
+
+    printk(KERN_DEBUG "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n", 
+           smp_processor_id(), 
+           *((unsigned long *)__get_cpu_var(mce_banks_owned)), 
+           *((unsigned long *)__get_cpu_var(no_cmci_banks)));
+}
+
+/*
+ * Define an owner for each bank. Banks can be shared between CPUs
+ * and to avoid reporting events multiple times always set up one
+ * CPU as owner. 
+ *
+ * The assignment has to be redone when CPUs go offline and
+ * any of the owners goes away. Also pollers run in parallel so we
+ * have to be careful to update the banks in a way that doesn't
+ * lose or duplicate events.
+ */
+
+/* Entry point used at init time: run CMCI bank-ownership discovery on
+ * this CPU, unless CMCI is unsupported or MCE is disabled. */
+static void mce_set_owner(void)
+{
+    if (!cmci_support || mce_disabled == 1)
+        return;
+
+    cmci_discover();
+}
+
+/* on_each_cpu callback: re-run CMCI ownership discovery on this CPU. */
+static void __cpu_mcheck_distribute_cmci(void *unused)
+{
+    cmci_discover();
+}
+
+/* Redistribute CMCI bank ownership across all online CPUs (e.g. after
+ * a CPU goes offline and its banks become orphaned). */
+void cpu_mcheck_distribute_cmci(void)
+{
+    if (cmci_support && !mce_disabled)
+        on_each_cpu(__cpu_mcheck_distribute_cmci, NULL, 0, 0);
+}
+
+/* Relinquish this CPU's CMCI bank ownership: clear CMCI_EN and the
+ * threshold in every owned bank's CTL2 MSR and drop the ownership bit,
+ * so another CPU can claim the bank. */
+static void clear_cmci(void)
+{
+    int i;
+
+    if (!cmci_support || mce_disabled == 1)
+        return;
+
+    printk(KERN_DEBUG "CMCI: clear_cmci support on CPU%d\n", 
+            smp_processor_id());
+
+    for (i = 0; i < nr_mce_banks; i++) {
+        unsigned msr = MSR_IA32_MC0_CTL2 + i;
+        u64 val;
+        if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+            continue;
+        rdmsrl(msr, val);
+        if (val & (CMCI_EN|CMCI_THRESHOLD_MASK))
+            wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
+        clear_bit(i, __get_cpu_var(mce_banks_owned));
+    }
+}
+
+/* Disable machine-check on this CPU (clear CR4.MCE) and release its
+ * CMCI bank ownership; used on the CPU-offline path. */
+void cpu_mcheck_disable(void)
+{
+    clear_in_cr4(X86_CR4_MCE);
+
+    if (cmci_support && !mce_disabled)
+        clear_cmci();
+}
+
+/* Program the local APIC CMCI LVT entry with our vector: first write it
+ * masked, then re-read and unmask.  Bails out if CMCI is unsupported or
+ * if some other agent already installed a vector there. */
+static void intel_init_cmci(struct cpuinfo_x86 *c)
+{
+    u32 l, apic;
+    int cpu = smp_processor_id();
+
+    if (!mce_available(c) || !cmci_support) {
+        printk(KERN_DEBUG "CMCI: CPU%d has no CMCI support\n", cpu);
+        return;
+    }
+
+    apic = apic_read(APIC_CMCI);
+    if ( apic & APIC_VECTOR_MASK )
+    {
+        printk(KERN_WARNING "CPU%d CMCI LVT vector (%#x) already installed\n",
+            cpu, ( apic & APIC_VECTOR_MASK ));
+        return;
+    }
+
+    apic = CMCI_APIC_VECTOR;
+    apic |= (APIC_DM_FIXED | APIC_LVT_MASKED);
+    apic_write_around(APIC_CMCI, apic);
+
+    /* Re-read and clear the mask bit to enable delivery. */
+    l = apic_read(APIC_CMCI);
+    apic_write_around(APIC_CMCI, l & ~APIC_LVT_MASKED);
+}
+
+/* CMCI (corrected machine-check interrupt) handler: log the banks this
+ * CPU owns, then either commit the telemetry and notify dom0 via
+ * VIRQ_MCA, or dump it to the console and discard it. */
+fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
+{
+    mctelem_cookie_t mctc;
+    struct mca_summary bs;
+
+    ack_APIC_irq();
+    irq_enter();
+
+    mctc = mcheck_mca_logout(
+        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs);
+
+    if (bs.errcnt && mctc != NULL) {
+        if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+            mctelem_commit(mctc);
+            printk(KERN_DEBUG "CMCI: send CMCI to DOM0 through virq\n");
+            send_guest_global_virq(dom0, VIRQ_MCA);
+        } else {
+            x86_mcinfo_dump(mctelem_dataptr(mctc));
+            mctelem_dismiss(mctc);
+       }
+    } else if (mctc != NULL)
+        mctelem_dismiss(mctc);
+
+    irq_exit();
+}
+
+/* Per-CPU init of the optional Intel MCE features: thermal monitoring
+ * (when configured) and the CMCI LVT. */
+void mce_intel_feature_init(struct cpuinfo_x86 *c)
+{
+
+#ifdef CONFIG_X86_MCE_THERMAL
+    intel_init_thermal(c);
+#endif
+    intel_init_cmci(c);
+}
+
+/* Host MCG_CAP value with the CMCI bit masked out; exposed to guests
+ * as their virtual MCG_CAP. */
+uint64_t g_mcg_cap;
+
+/* Read MCG_CAP and derive global MCA parameters: CMCI support, bank
+ * count (nr_mce_banks), extended MSR count, and the first bank Xen
+ * itself manages (firstbank). */
+static void mce_cap_init(struct cpuinfo_x86 *c)
+{
+    u32 l, h;
+
+    rdmsr (MSR_IA32_MCG_CAP, l, h);
+    /* For Guest vMCE usage */
+    g_mcg_cap = ((u64)h << 32 | l) & (~MCG_CMCI_P);
+
+    if ((l & MCG_CMCI_P) && cpu_has_apic)
+        cmci_support = 1;
+
+    nr_mce_banks = l & 0xff;
+    if (nr_mce_banks > MAX_NR_BANKS)
+        printk(KERN_WARNING "MCE: exceed max mce banks\n");
+    if (l & MCG_EXT_P)
+    {
+        nr_intel_ext_msrs = (l >> MCG_EXT_CNT) & 0xff;
+        printk (KERN_INFO "CPU%d: Intel Extended MCE MSRs (%d) available\n",
+            smp_processor_id(), nr_intel_ext_msrs);
+    }
+    firstbank = mce_firstbank(c);
+}
+
+/* Per-CPU MCA bring-up: drain any sticky errors left from before reset
+ * (dumped to console, not injected into dom0), then enable CR4.MCE,
+ * MCG_CTL, and the per-bank CTL/STATUS registers. */
+static void mce_init(void)
+{
+    u32 l, h;
+    int i;
+    mctelem_cookie_t mctc;
+    struct mca_summary bs;
+
+    clear_in_cr4(X86_CR4_MCE);
+
+    /* log the machine checks left over from the previous reset.
+     * This also clears all registers*/
+
+    mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs);
+
+    /* in the boot up stage, don't inject to DOM0, but print out */
+    if (bs.errcnt && mctc != NULL) {
+        x86_mcinfo_dump(mctelem_dataptr(mctc));
+        mctelem_dismiss(mctc);
+    }
+
+    set_in_cr4(X86_CR4_MCE);
+    rdmsr (MSR_IA32_MCG_CAP, l, h);
+    if (l & MCG_CTL_P) /* Control register present ? */
+        wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+
+    for (i = firstbank; i < nr_mce_banks; i++)
+    {
+        /* Some banks are shared across cores, use MCi_CTRL to judge whether
+         * this bank has been initialized by other cores already. */
+        rdmsr(MSR_IA32_MC0_CTL + 4*i, l, h);
+        if (!(l | h))
+        {
+            /* if ctl is 0, this bank is never initialized */
+            printk(KERN_DEBUG "mce_init: init bank%d\n", i);
+            wrmsr (MSR_IA32_MC0_CTL + 4*i, 0xffffffff, 0xffffffff);
+            wrmsr (MSR_IA32_MC0_STATUS + 4*i, 0x0, 0x0);
+        }
+    }
+    if (firstbank) /* if cmci enabled, firstbank = 0 */
+        wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
+}
+
+/* p4/p6 family have similar MCA initialization process */
+/* Top-level per-CPU Intel MCA init: read capabilities, register the
+ * #MC vector and extended-MSR callback, initialize banks and optional
+ * features, claim CMCI ownership, and hook the recovery softirq.
+ * Always returns 1 (machine check available). */
+int intel_mcheck_init(struct cpuinfo_x86 *c)
+{
+    mce_cap_init(c);
+    printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+            smp_processor_id());
+
+    /* machine check is available */
+    x86_mce_vector_register(intel_machine_check);
+    x86_mce_callback_register(intel_get_extended_msrs);
+
+    mce_init();
+    mce_intel_feature_init(c);
+    mce_set_owner();
+
+    open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
+    return 1;
+}
+
+/* Guest vMCE# MSRs virtualization ops (rdmsr/wrmsr) */
+/* Emulate a guest write to an MCA MSR.  Per-domain virtual state lives
+ * in d->arch.vmca_msrs, protected by mce_locks.
+ * Returns 1 if the MSR was handled here, 0 if it is not an MCA MSR
+ * (caller falls through to other emulation), -1 on an invalid value. */
+int intel_mce_wrmsr(u32 msr, u32 lo, u32 hi)
+{
+    struct domain *d = current->domain;
+    struct bank_entry *entry = NULL;
+    uint64_t value = (u64)hi << 32 | lo;
+    int ret = 1;
+
+    spin_lock(&mce_locks);
+    switch(msr)
+    {
+    case MSR_IA32_MCG_CTL:
+        /* Architecturally only all-0s or all-1s are meaningful here. */
+        if (value != (u64)~0x0 && value != 0x0) {
+            gdprintk(XENLOG_WARNING, "MCE: value writen to MCG_CTL"
+                     "should be all 0s or 1s\n");
+            ret = -1;
+            break;
+        }
+        if (!d || is_idle_domain(d)) {
+            gdprintk(XENLOG_WARNING, "MCE: wrmsr not in DOM context, skip\n");
+            break;
+        }
+        d->arch.vmca_msrs.mcg_ctl = value;
+        break;
+    case MSR_IA32_MCG_STATUS:
+        if (!d || is_idle_domain(d)) {
+            gdprintk(XENLOG_WARNING, "MCE: wrmsr not in DOM context, skip\n");
+            break;
+        }
+        d->arch.vmca_msrs.mcg_status = value;
+        /* NOTE(review): message says MCG_CTL but this case writes
+         * MCG_STATUS - the log text looks like a copy/paste slip. */
+        gdprintk(XENLOG_DEBUG, "MCE: wrmsr MCG_CTL %"PRIx64"\n", value);
+        break;
+    case MSR_IA32_MC0_CTL2:
+    case MSR_IA32_MC1_CTL2:
+    case MSR_IA32_MC2_CTL2:
+    case MSR_IA32_MC3_CTL2:
+    case MSR_IA32_MC4_CTL2:
+    case MSR_IA32_MC5_CTL2:
+    case MSR_IA32_MC6_CTL2:
+    case MSR_IA32_MC7_CTL2:
+    case MSR_IA32_MC8_CTL2:
+        /* CMCI is hidden from guests (see g_mcg_cap), so CTL2 writes
+         * are ignored with a warning. */
+        gdprintk(XENLOG_WARNING, "We have disabled CMCI capability, "
+                 "Guest should not write this MSR!\n");
+        break;
+    case MSR_IA32_MC0_CTL:
+    case MSR_IA32_MC1_CTL:
+    case MSR_IA32_MC2_CTL:
+    case MSR_IA32_MC3_CTL:
+    case MSR_IA32_MC4_CTL:
+    case MSR_IA32_MC5_CTL:
+    case MSR_IA32_MC6_CTL:
+    case MSR_IA32_MC7_CTL:
+    case MSR_IA32_MC8_CTL:
+        if (value != (u64)~0x0 && value != 0x0) {
+            gdprintk(XENLOG_WARNING, "MCE: value writen to MCi_CTL"
+                     "should be all 0s or 1s\n");
+            ret = -1;
+            break;
+        }
+        if (!d || is_idle_domain(d)) {
+            gdprintk(XENLOG_WARNING, "MCE: wrmsr not in DOM context, skip\n");
+            break;
+        }
+        /* Bank index: MCi MSRs are spaced 4 apart. */
+        d->arch.vmca_msrs.mci_ctl[(msr - MSR_IA32_MC0_CTL)/4] = value;
+        break;
+    case MSR_IA32_MC0_STATUS:
+    case MSR_IA32_MC1_STATUS:
+    case MSR_IA32_MC2_STATUS:
+    case MSR_IA32_MC3_STATUS:
+    case MSR_IA32_MC4_STATUS:
+    case MSR_IA32_MC5_STATUS:
+    case MSR_IA32_MC6_STATUS:
+    case MSR_IA32_MC7_STATUS:
+    case MSR_IA32_MC8_STATUS:
+        if (!d || is_idle_domain(d)) {
+            /* Just skip */
+            gdprintk(XENLOG_WARNING, "mce wrmsr: not in domain context!\n");
+            break;
+        }
+        /* Give the first entry of the list, it corresponds to current
+         * vMCE# injection. When vMCE# is finished processing by the
+         * the guest, this node will be deleted.
+         * Only error bank is written. Non-error bank simply return.
+         */
+        if ( !list_empty(&d->arch.vmca_msrs.impact_header) ) {
+            entry = list_entry(d->arch.vmca_msrs.impact_header.next,
+                               struct bank_entry, list);
+            if ( entry->bank == (msr - MSR_IA32_MC0_STATUS)/4 ) {
+                entry->mci_status = value;
+            }
+            gdprintk(XENLOG_DEBUG, "MCE: wmrsr mci_status in vMCE# context\n");
+        }
+        gdprintk(XENLOG_DEBUG, "MCE: wrmsr mci_status val:%"PRIx64"\n", value);
+        break;
+    default:
+        ret = 0;
+        break;
+    }
+    spin_unlock(&mce_locks);
+    return ret;
+}
+
+/* Emulate a guest read of an MCA MSR.  Returns 1 if handled (result in
+ * *lo/*hi), 0 if the MSR is not an MCA MSR.  MCG_* reads come from the
+ * per-domain vmca_msrs copies; STATUS/ADDR/MISC reads are served from
+ * the head of the impact_header list (the vMCE# currently being
+ * delivered) and any other bank reads back as 0. */
+int intel_mce_rdmsr(u32 msr, u32 *lo, u32 *hi)
+{
+    struct domain *d = current->domain;
+    int ret = 1;
+    struct bank_entry *entry = NULL;
+
+    *lo = *hi = 0x0;
+    spin_lock(&mce_locks);
+    switch(msr)
+    {
+    case MSR_IA32_MCG_STATUS:
+        if (!d || is_idle_domain(d)) {
+            gdprintk(XENLOG_WARNING, "MCE: rdmsr not in domain context!\n");
+            *lo = *hi = 0x0;
+            break;
+        }
+        *lo = (u32)d->arch.vmca_msrs.mcg_status;
+        *hi = (u32)(d->arch.vmca_msrs.mcg_status >> 32);
+        gdprintk(XENLOG_DEBUG, "MCE: rd MCG_STATUS lo %x hi %x\n", *lo, *hi);
+        break;
+    case MSR_IA32_MCG_CAP:
+        if (!d || is_idle_domain(d)) {
+            gdprintk(XENLOG_WARNING, "MCE: rdmsr not in domain context!\n");
+            *lo = *hi = 0x0;
+            break;
+        }
+        *lo = (u32)d->arch.vmca_msrs.mcg_cap;
+        *hi = (u32)(d->arch.vmca_msrs.mcg_cap >> 32);
+        gdprintk(XENLOG_DEBUG, "MCE: rdmsr MCG_CAP lo %x hi %x\n", *lo, *hi);
+        break;
+    case MSR_IA32_MCG_CTL:
+        if (!d || is_idle_domain(d)) {
+            gdprintk(XENLOG_WARNING, "MCE: rdmsr not in domain context!\n");
+            *lo = *hi = 0x0;
+            break;
+        }
+        *lo = (u32)d->arch.vmca_msrs.mcg_ctl;
+        *hi = (u32)(d->arch.vmca_msrs.mcg_ctl >> 32);
+        gdprintk(XENLOG_DEBUG, "MCE: rdmsr MCG_CTL lo %x hi %x\n", *lo, *hi);
+        break;
+    case MSR_IA32_MC0_CTL2:
+    case MSR_IA32_MC1_CTL2:
+    case MSR_IA32_MC2_CTL2:
+    case MSR_IA32_MC3_CTL2:
+    case MSR_IA32_MC4_CTL2:
+    case MSR_IA32_MC5_CTL2:
+    case MSR_IA32_MC6_CTL2:
+    case MSR_IA32_MC7_CTL2:
+    case MSR_IA32_MC8_CTL2:
+        /* CMCI is hidden from guests; CTL2 reads return 0 with a warning. */
+        gdprintk(XENLOG_WARNING, "We have disabled CMCI capability, "
+                 "Guest should not read this MSR!\n");
+        break;
+    case MSR_IA32_MC0_CTL:
+    case MSR_IA32_MC1_CTL:
+    case MSR_IA32_MC2_CTL:
+    case MSR_IA32_MC3_CTL:
+    case MSR_IA32_MC4_CTL:
+    case MSR_IA32_MC5_CTL:
+    case MSR_IA32_MC6_CTL:
+    case MSR_IA32_MC7_CTL:
+    case MSR_IA32_MC8_CTL:
+        if (!d || is_idle_domain(d)) {
+            gdprintk(XENLOG_WARNING, "MCE: rdmsr not in domain context!\n");
+            *lo = *hi = 0x0;
+            break;
+        }
+        /* Bank index: MCi MSRs are spaced 4 apart. */
+        *lo = (u32)d->arch.vmca_msrs.mci_ctl[(msr - MSR_IA32_MC0_CTL)/4];
+        *hi =
+            (u32)(d->arch.vmca_msrs.mci_ctl[(msr - MSR_IA32_MC0_CTL)/4]
+                  >> 32);
+        gdprintk(XENLOG_DEBUG, "MCE: rdmsr MCi_CTL lo %x hi %x\n", *lo, *hi);
+        break;
+    case MSR_IA32_MC0_STATUS:
+    case MSR_IA32_MC1_STATUS:
+    case MSR_IA32_MC2_STATUS:
+    case MSR_IA32_MC3_STATUS:
+    case MSR_IA32_MC4_STATUS:
+    case MSR_IA32_MC5_STATUS:
+    case MSR_IA32_MC6_STATUS:
+    case MSR_IA32_MC7_STATUS:
+    case MSR_IA32_MC8_STATUS:
+        /* Only error bank is read. Non-error bank simply return */
+        *lo = *hi = 0x0;
+        gdprintk(XENLOG_DEBUG, "MCE: rdmsr mci_status\n");
+        if (!d || is_idle_domain(d)) {
+            gdprintk(XENLOG_WARNING, "mce_rdmsr: not in domain context!\n");
+            break;
+        }
+        if (!list_empty(&d->arch.vmca_msrs.impact_header)) {
+            entry = list_entry(d->arch.vmca_msrs.impact_header.next,
+                               struct bank_entry, list);
+            if ( entry->bank == (msr - MSR_IA32_MC0_STATUS)/4 ) {
+                *lo = entry->mci_status;
+                *hi = entry->mci_status >> 32;
+                gdprintk(XENLOG_DEBUG, "MCE: rdmsr MCi_STATUS in vmCE# context "
+                         "lo %x hi %x\n", *lo, *hi);
+            }
+        }
+        break;
+    case MSR_IA32_MC0_ADDR:
+    case MSR_IA32_MC1_ADDR:
+    case MSR_IA32_MC2_ADDR:
+    case MSR_IA32_MC3_ADDR:
+    case MSR_IA32_MC4_ADDR:
+    case MSR_IA32_MC5_ADDR:
+    case MSR_IA32_MC6_ADDR:
+    case MSR_IA32_MC7_ADDR:
+    case MSR_IA32_MC8_ADDR:
+        *lo = *hi = 0x0;
+        if (!d || is_idle_domain(d)) {
+            gdprintk(XENLOG_WARNING, "mce_rdmsr: not in domain context!\n");
+            break;
+        }
+        if (!list_empty(&d->arch.vmca_msrs.impact_header)) {
+            entry = list_entry(d->arch.vmca_msrs.impact_header.next,
+                               struct bank_entry, list);
+            if ( entry->bank == (msr - MSR_IA32_MC0_ADDR)/4 ) {
+                *lo = entry->mci_addr;
+                *hi = entry->mci_addr >> 32;
+                gdprintk(XENLOG_DEBUG, "MCE: rdmsr MCi_ADDR in vMCE# context "
+                         "lo %x hi %x\n", *lo, *hi);
+            }
+        }
+        break;
+    case MSR_IA32_MC0_MISC:
+    case MSR_IA32_MC1_MISC:
+    case MSR_IA32_MC2_MISC:
+    case MSR_IA32_MC3_MISC:
+    case MSR_IA32_MC4_MISC:
+    case MSR_IA32_MC5_MISC:
+    case MSR_IA32_MC6_MISC:
+    case MSR_IA32_MC7_MISC:
+    case MSR_IA32_MC8_MISC:
+        *lo = *hi = 0x0;
+        if (!d || is_idle_domain(d)) {
+            gdprintk(XENLOG_WARNING, "MCE: rdmsr not in domain context!\n");
+            break;
+        }
+        if (!list_empty(&d->arch.vmca_msrs.impact_header)) {
+            entry = list_entry(d->arch.vmca_msrs.impact_header.next,
+                               struct bank_entry, list);
+            if ( entry->bank == (msr - MSR_IA32_MC0_MISC)/4 ) {
+                *lo = entry->mci_misc;
+                *hi = entry->mci_misc >> 32;
+                gdprintk(XENLOG_DEBUG, "MCE: rdmsr MCi_MISC in vMCE# context "
+                         " lo %x hi %x\n", *lo, *hi);
+            }
+        }
+        break;
+    default:
+        ret = 0;
+        break;
+    }
+    spin_unlock(&mce_locks);
+    return ret;
+}
+
+
diff --git a/xen/arch/x86/cpu/mcheck/mctelem.c b/xen/arch/x86/cpu/mcheck/mctelem.c
new file mode 100644 (file)
index 0000000..4111ddc
--- /dev/null
@@ -0,0 +1,443 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+/*
+ * mctelem.c - x86 Machine Check Telemetry Transport
+ */
+
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/config.h>
+#include <xen/smp.h>
+#include <xen/errno.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/cpumask.h>
+#include <xen/event.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+
+/* One telemetry entry; entries are threaded onto the free, committed
+ * and processing lists via mcte_next/mcte_prev (see struct mc_telem_ctl
+ * below for the list discipline). */
+struct mctelem_ent {
+	struct mctelem_ent *mcte_next;	/* next in chronological order */
+	struct mctelem_ent *mcte_prev;	/* previous in chronological order */
+	uint32_t mcte_flags;		/* See MCTE_F_* below */
+	uint32_t mcte_refcnt;		/* Reference count */
+	void *mcte_data;		/* corresponding data payload */
+};
+
+#define        MCTE_F_HOME_URGENT              0x0001U /* free to urgent freelist */
+#define        MCTE_F_HOME_NONURGENT           0x0002U /* free to nonurgent freelist */
+#define        MCTE_F_CLASS_URGENT             0x0004U /* in use - urgent errors */
+#define        MCTE_F_CLASS_NONURGENT          0x0008U /* in use - nonurgent errors */
+#define        MCTE_F_STATE_FREE               0x0010U /* on a freelist */
+#define        MCTE_F_STATE_UNCOMMITTED        0x0020U /* reserved; on no list */
+#define        MCTE_F_STATE_COMMITTED          0x0040U /* on a committed list */
+#define        MCTE_F_STATE_PROCESSING         0x0080U /* on a processing list */
+
+#define        MCTE_F_MASK_HOME        (MCTE_F_HOME_URGENT | MCTE_F_HOME_NONURGENT)
+#define        MCTE_F_MASK_CLASS       (MCTE_F_CLASS_URGENT | MCTE_F_CLASS_NONURGENT)
+#define        MCTE_F_MASK_STATE       (MCTE_F_STATE_FREE | \
+                               MCTE_F_STATE_UNCOMMITTED | \
+                               MCTE_F_STATE_COMMITTED | \
+                               MCTE_F_STATE_PROCESSING)
+
+#define        MCTE_HOME(tep) ((tep)->mcte_flags & MCTE_F_MASK_HOME)
+
+#define        MCTE_CLASS(tep) ((tep)->mcte_flags & MCTE_F_MASK_CLASS)
+#define        MCTE_SET_CLASS(tep, new) do { \
+    (tep)->mcte_flags &= ~MCTE_F_MASK_CLASS; \
+    (tep)->mcte_flags |= MCTE_F_CLASS_##new; } while (0)
+
+#define        MCTE_STATE(tep) ((tep)->mcte_flags & MCTE_F_MASK_STATE)
+#define        MCTE_TRANSITION_STATE(tep, old, new) do { \
+    BUG_ON(MCTE_STATE(tep) != (MCTE_F_STATE_##old)); \
+    (tep)->mcte_flags &= ~MCTE_F_MASK_STATE; \
+    (tep)->mcte_flags |= (MCTE_F_STATE_##new); } while (0)
+
+#define        MC_URGENT_NENT          10
+#define        MC_NONURGENT_NENT       20
+
+#define        MC_NCLASSES             (MC_NONURGENT + 1)
+
+#define        COOKIE2MCTE(c)          ((struct mctelem_ent *)(c))
+#define        MCTE2COOKIE(tep)        ((mctelem_cookie_t)(tep))
+
+static struct mc_telem_ctl {
+       /* Linked lists that thread the array members together.
+        *
+        * The free lists are singly-linked via mcte_next, and we allocate
+        * from them by atomically unlinking an element from the head.
+        * Consumed entries are returned to the head of the free list.
+        * When an entry is reserved off the free list it is not linked
+        * on any list until it is committed or dismissed.
+        *
+        * The committed list grows at the head and we do not maintain a
+        * tail pointer; insertions are performed atomically.  The head
+        * thus has the most-recently committed telemetry, i.e. the
+        * list is in reverse chronological order.  The committed list
+        * is singly-linked via mcte_prev pointers, and mcte_next is NULL.
+        * When we move telemetry from the committed list to the processing
+        * list we atomically unlink the committed list and keep a pointer
+        * to the head of that list;  we then traverse the list following
+        * mcte_prev and fill in mcte_next to doubly-link the list, and then
+        * append the tail of the list onto the processing list.  If we panic
+        * during this manipulation of the committed list we still have
+        * the pointer to its head so we can recover all entries during
+        * the panic flow (albeit in reverse chronological order).
+        *
+        * The processing list is updated in a controlled context, and
+        * we can lock it for updates.  The head of the processing list
+        * always has the oldest telemetry, and we append (as above)
+        * at the tail of the processing list. */
+       struct mctelem_ent *mctc_free[MC_NCLASSES];
+       struct mctelem_ent *mctc_committed[MC_NCLASSES];
+       struct mctelem_ent *mctc_processing_head[MC_NCLASSES];
+       struct mctelem_ent *mctc_processing_tail[MC_NCLASSES];
+       /*
+        * Telemetry array
+        */
+       struct mctelem_ent *mctc_elems;
+} mctctl;
+
+/* Lock protecting all processing lists */
+static DEFINE_SPINLOCK(processing_lock);
+
+/* Atomic compare-and-swap on a pointer-sized slot: if *ptr == old,
+ * store new; returns the previous value of *ptr (== old on success). */
+static void *cmpxchgptr(void *ptr, void *old, void *new)
+{
+	unsigned long *ulp = (unsigned long *)ptr;
+	unsigned long a = (unsigned long)old;
+	unsigned long b = (unsigned long)new;
+
+	return (void *)cmpxchg(ulp, a, b);
+}
+
+/* Free an entry to its native free list; the entry must not be linked on
+ * any list.  The push is lock-free: retry the cmpxchg until we win the
+ * race for the list head.
+ */
+static void mctelem_free(struct mctelem_ent *tep)
+{
+	mctelem_class_t target = MCTE_HOME(tep) == MCTE_F_HOME_URGENT ?
+	    MC_URGENT : MC_NONURGENT;
+	struct mctelem_ent **freelp;
+	struct mctelem_ent *oldhead;
+
+	BUG_ON(tep->mcte_refcnt != 0);
+	BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);
+
+	tep->mcte_prev = NULL;
+	freelp = &mctctl.mctc_free[target];
+	for (;;) {
+		oldhead = *freelp;
+		tep->mcte_next = oldhead;
+		wmb();
+		if (cmpxchgptr(freelp, oldhead, tep) == oldhead)
+			break;
+	}
+}
+
+/* Increment the reference count of an entry that is not linked on to
+ * any list and which only the caller has a pointer to.
+ * (No atomicity needed precisely because of that exclusivity.)
+ */
+static void mctelem_hold(struct mctelem_ent *tep)
+{
+	tep->mcte_refcnt++;
+}
+
+/* Increment the reference count on an entry that is linked at the head of
+ * a processing list.  The caller is responsible for locking the list
+ * (processing_lock).
+ */
+static void mctelem_processing_hold(struct mctelem_ent *tep)
+{
+	int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+	    MC_URGENT : MC_NONURGENT;
+
+	BUG_ON(tep != mctctl.mctc_processing_head[which]);
+	tep->mcte_refcnt++;
+}
+
+/* Decrement the reference count on an entry that is linked at the head of
+ * a processing list.  The caller is responsible for locking the list.
+ * On the last release the entry is unlinked from the processing list
+ * and returned to its free list.
+ */
+static void mctelem_processing_release(struct mctelem_ent *tep)
+{
+	int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+	    MC_URGENT : MC_NONURGENT;
+
+	BUG_ON(tep != mctctl.mctc_processing_head[which]);
+	if (--tep->mcte_refcnt == 0) {
+		MCTE_TRANSITION_STATE(tep, PROCESSING, FREE);
+		mctctl.mctc_processing_head[which] = tep->mcte_next;
+		mctelem_free(tep);
+	}
+}
+
+/* Allocate the telemetry entry array plus a contiguous payload buffer
+ * (reqdatasz rounded up to 16 bytes per entry) and thread the entries
+ * onto the urgent and nonurgent free lists.  Later calls only
+ * sanity-check that the requested payload size is unchanged. */
+void mctelem_init(int reqdatasz)
+{
+	static int called = 0;
+	static int datasz = 0, realdatasz = 0;
+	char *datarr;
+	int i;
+	
+	BUG_ON(MC_URGENT != 0 || MC_NONURGENT != 1 || MC_NCLASSES != 2);
+
+	/* Called from mcheck_init for all processors; initialize for the
+	 * first call only (no race here since the boot cpu completes
+	 * init before others start up). */
+	if (++called == 1) {
+		realdatasz = reqdatasz;
+		datasz = (reqdatasz & ~0xf) + 0x10;	/* 16 byte roundup */
+	} else {
+		BUG_ON(reqdatasz != realdatasz);
+		return;
+	}
+
+	/* NOTE(review): if the element array allocates but the payload
+	 * buffer fails, mctc_elems is freed yet left non-NULL (dangling);
+	 * confirm no later path dereferences it after this failure. */
+	if ((mctctl.mctc_elems = xmalloc_array(struct mctelem_ent,
+	    MC_URGENT_NENT + MC_NONURGENT_NENT)) == NULL ||
+	    (datarr = xmalloc_bytes((MC_URGENT_NENT + MC_NONURGENT_NENT) *
+	    datasz)) == NULL) {
+		if (mctctl.mctc_elems)
+			xfree(mctctl.mctc_elems);
+		printk("Allocations for MCA telemetry failed\n");
+		return;
+	}
+
+	for (i = 0; i < MC_URGENT_NENT + MC_NONURGENT_NENT; i++) {
+		struct mctelem_ent *tep, **tepp;
+
+		tep = mctctl.mctc_elems + i;
+		tep->mcte_flags = MCTE_F_STATE_FREE;
+		tep->mcte_refcnt = 0;
+		tep->mcte_data = datarr + i * datasz;
+
+		if (i < MC_URGENT_NENT) {
+			tepp = &mctctl.mctc_free[MC_URGENT];
+			tep->mcte_flags |= MCTE_F_HOME_URGENT;
+		} else {
+			tepp = &mctctl.mctc_free[MC_NONURGENT];
+			tep->mcte_flags |= MCTE_F_HOME_NONURGENT;
+		}
+
+		tep->mcte_next = *tepp;
+		tep->mcte_prev = NULL;
+		*tepp = tep;
+	}
+}
+
+/* incremented non-atomically when reserve fails */
+static int mctelem_drop_count;
+
+/* Reserve a telemetry entry, or return NULL if none available.
+ * If we return an entry then the caller must subsequently call exactly one of
+ * mctelem_unreserve or mctelem_commit for that entry.
+ * Lock-free pop from the class free list; an urgent request may fall
+ * back to ("raid") the nonurgent free list when urgent is exhausted.
+ */
+mctelem_cookie_t mctelem_reserve(mctelem_class_t which)
+{
+	struct mctelem_ent **freelp;
+	struct mctelem_ent *oldhead, *newhead;
+	mctelem_class_t target = (which == MC_URGENT) ?
+	    MC_URGENT : MC_NONURGENT;
+
+	freelp = &mctctl.mctc_free[target];
+	for (;;) {
+		if ((oldhead = *freelp) == NULL) {
+			if (which == MC_URGENT && target == MC_URGENT) {
+				/* raid the non-urgent freelist */
+				target = MC_NONURGENT;
+				freelp = &mctctl.mctc_free[target];
+				continue;
+			} else {
+				mctelem_drop_count++;
+				return (NULL);
+			}
+		}
+
+		newhead = oldhead->mcte_next;
+		if (cmpxchgptr(freelp, oldhead, newhead) == oldhead) {
+			struct mctelem_ent *tep = oldhead;
+
+			mctelem_hold(tep);
+			MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED);
+			tep->mcte_next = NULL;
+			tep->mcte_prev = NULL;
+			if (which == MC_URGENT)
+				MCTE_SET_CLASS(tep, URGENT);
+			else
+				MCTE_SET_CLASS(tep, NONURGENT);
+			return MCTE2COOKIE(tep);
+		}
+	}
+}
+
+/* Return the data payload buffer associated with a reserved/committed
+ * telemetry cookie. */
+void *mctelem_dataptr(mctelem_cookie_t cookie)
+{
+	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+	return tep->mcte_data;
+}
+
+/* Release a previously reserved entry back to the freelist without
+ * submitting it for logging.  The entry must not be linked on to any
+ * list - that's how mctelem_reserve handed it out.
+ * Drops the reference taken in mctelem_reserve before freeing.
+ */
+void mctelem_dismiss(mctelem_cookie_t cookie)
+{
+	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+	tep->mcte_refcnt--;
+	MCTE_TRANSITION_STATE(tep, UNCOMMITTED, FREE);
+	mctelem_free(tep);
+}
+
+/* Commit an entry with completed telemetry for logging.  The caller must
+ * not reference the entry after this call.  Note that we add entries
+ * at the head of the committed list, so that list therefore has entries
+ * in reverse chronological order.  The push is lock-free via cmpxchg,
+ * mirroring mctelem_free; the committed list is linked via mcte_prev.
+ */
+void mctelem_commit(mctelem_cookie_t cookie)
+{
+	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+	struct mctelem_ent **commlp;
+	struct mctelem_ent *oldhead;
+	mctelem_class_t target = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+	    MC_URGENT : MC_NONURGENT;
+
+	BUG_ON(tep->mcte_next != NULL || tep->mcte_prev != NULL);
+	MCTE_TRANSITION_STATE(tep, UNCOMMITTED, COMMITTED);
+
+	commlp = &mctctl.mctc_committed[target];
+	for (;;) {
+		oldhead = *commlp;
+		tep->mcte_prev = oldhead;
+		wmb();
+		if (cmpxchgptr(commlp, oldhead, tep) == oldhead)
+			break;
+	}
+}
+
+/* Move telemetry from committed list to processing list, reversing the
+ * list into chronological order.  The processing list has been
+ * locked by the caller, and may be non-empty.  We append the
+ * reversed committed list on to the tail of the processing list.
+ * The committed list may grow even while we run, so use atomic
+ * operations to swap NULL to the freelist head.
+ *
+ * Note that "chronological order" means the order in which producers
+ * won additions to the processing list, which may not reflect the
+ * strict chronological order of the associated events if events are
+ * closely spaced in time and contend for the processing list at once.
+ */
+
+static struct mctelem_ent *dangling[MC_NCLASSES];
+
+static void mctelem_append_processing(mctelem_class_t which)
+{
+       mctelem_class_t target = which == MC_URGENT ?
+           MC_URGENT : MC_NONURGENT;
+       struct mctelem_ent **commlp = &mctctl.mctc_committed[target];
+       struct mctelem_ent **proclhp = &mctctl.mctc_processing_head[target];
+       struct mctelem_ent **procltp = &mctctl.mctc_processing_tail[target];
+       struct mctelem_ent *tep, *ltep;
+
+       /* Check for an empty list; no race since we hold the processing lock */
+       if (*commlp == NULL)
+               return;
+
+       /* Atomically unlink the committed list, and keep a pointer to
+        * the list we unlink in a well-known location so it can be
+        * picked up in panic code should we panic between this unlink
+        * and the append to the processing list. */
+       for (;;) {
+               dangling[target] = *commlp;
+               wmb();
+               if (cmpxchgptr(commlp, dangling[target], NULL) ==
+                   dangling[target])
+                       break;
+       }
+
+       if (dangling[target] == NULL)
+               return;
+
+       /* Traverse the list following the previous pointers (reverse
+        * chronological order).  For each entry fill in the next pointer
+        * and transition the element state.  */
+       for (tep = dangling[target], ltep = NULL; tep != NULL;
+           tep = tep->mcte_prev) {
+               MCTE_TRANSITION_STATE(tep, COMMITTED, PROCESSING);
+               tep->mcte_next = ltep;
+               ltep = tep;
+       }
+
+       /* ltep points to the head of a chronologically ordered linked
+        * list of telemetry entries ending at the most recent entry
+        * dangling[target] if mcte_next is followed; tack this on to
+        * the processing list.
+        */
+       if (*proclhp == NULL) {
+               *proclhp = ltep;
+               *procltp = dangling[target];
+       } else {
+               (*procltp)->mcte_next = ltep;
+               ltep->mcte_prev = *procltp;
+               *procltp = dangling[target];
+       }
+       wmb();
+       dangling[target] = NULL;
+       wmb();
+}
+
+mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
+{
+       mctelem_class_t target = (which == MC_URGENT) ?
+           MC_URGENT : MC_NONURGENT;
+       struct mctelem_ent *tep;
+
+       spin_lock(&processing_lock);
+       mctelem_append_processing(target);
+       if ((tep = mctctl.mctc_processing_head[target]) == NULL) {
+               spin_unlock(&processing_lock);
+               return NULL;
+       }
+
+       mctelem_processing_hold(tep);
+       wmb();
+       spin_unlock(&processing_lock);
+       return MCTE2COOKIE(tep);
+}
+
+void mctelem_consume_oldest_end(mctelem_cookie_t cookie)
+{
+       struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+       spin_lock(&processing_lock);
+       mctelem_processing_release(tep);
+       wmb();
+       spin_unlock(&processing_lock);
+}
+
+void mctelem_ack(mctelem_class_t which, mctelem_cookie_t cookie)
+{
+       mctelem_class_t target = (which == MC_URGENT) ?
+           MC_URGENT : MC_NONURGENT;
+       struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+       if (tep == NULL)
+               return;
+
+       spin_lock(&processing_lock);
+       if (tep == mctctl.mctc_processing_head[target])
+               mctelem_processing_release(tep);
+       wmb();
+       spin_unlock(&processing_lock);
+}
diff --git a/xen/arch/x86/cpu/mcheck/mctelem.h b/xen/arch/x86/cpu/mcheck/mctelem.h
new file mode 100644 (file)
index 0000000..e3270f6
--- /dev/null
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#ifndef _MCTELEM_H
+
+#define        _MCTELEM_H
+
+#include <xen/init.h>
+#include <xen/smp.h>
+#include <asm/traps.h>
+
+/* Helper functions used for collecting error telemetry.
+ *
+ * mctelem_init preallocates a number of data areas for use during
+ * machine check data "logout".  Two classes are distinguished -
+ * urgent uses, intended for use from machine check exception handlers,
+ * and non-urgent uses intended for use from error pollers.
+ * Associated with each logout entry of whatever class is a data area
+ * sized per the single argument to mctelem_init.  mctelem_init should be
+ * called from MCA init code before anybody has the chance to change the
+ * machine check vector with mcheck_mca_logout or to use mcheck_mca_logout.
+ *
+ * To reserve an entry of a given class for use in logout, call
+ * mctelem_reserve (or use the common handler functions which do all this
+ * for you).  This returns an opaque cookie, or NULL if no elements are
+ * available.  Elements are reserved with an atomic operation so no deadlock
+ * will occur if, for example, a machine check exception interrupts a
+ * scheduled error poll.  The implementation will raid free non-urgent
+ * entries if all urgent entries are in use when an urgent request is received.
+ * Once an entry is reserved the caller must eventually perform exactly
+ * one of two actions: mctelem_commit or mctelem_dismiss.
+ *
+ * On mctelem_commit the entry is appended to a processing list; mctelem_dismiss
+ * frees the element without processing.  After either call the cookie
+ * must not be referenced again.
+ *
+ * To consume committed telemetry call mctelem_consume_oldest_begin
+ * which will return a cookie referencing the oldest (first committed)
+ * entry of the requested class.  Access the associated data using
+ * mctelem_dataptr and when finished use mctelem_consume_oldest_end - in the
+ * begin .. end bracket you are guaranteed that the entry cannot be freed
+ * even if it is ack'd elsewhere).  Once the ultimate consumer of the
+ * telemetry has processed it to stable storage it should acknowledge
+ * the telemetry quoting the cookie id, at which point we will free
+ * the element from the processing list.
+ */
+
+typedef struct mctelem_cookie *mctelem_cookie_t;
+
+typedef enum mctelem_class {
+       MC_URGENT,
+       MC_NONURGENT
+} mctelem_class_t;
+
+extern void mctelem_init(int);
+extern mctelem_cookie_t mctelem_reserve(mctelem_class_t);
+extern void *mctelem_dataptr(mctelem_cookie_t);
+extern void mctelem_commit(mctelem_cookie_t);
+extern void mctelem_dismiss(mctelem_cookie_t);
+extern mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t);
+extern void mctelem_consume_oldest_end(mctelem_cookie_t);
+extern void mctelem_ack(mctelem_class_t, mctelem_cookie_t);
+
+#endif
index 4984eed757ce62d7f0d9eb11b643a66e51378b0d..167b1cea2a837ed0ede24026b8a3bd2c11973579 100644 (file)
 #include <xen/smp.h>
 #include <xen/timer.h>
 #include <xen/errno.h>
+#include <xen/event.h>
+#include <xen/sched.h>
 #include <asm/processor.h> 
 #include <asm/system.h>
 #include <asm/msr.h>
 
 #include "mce.h"
 
-static int firstbank;
+static cpu_banks_t bankmask;
 static struct timer mce_timer;
 
-#define MCE_PERIOD MILLISECS(15000)
+#define MCE_PERIOD MILLISECS(8000)
+#define MCE_PERIOD_MIN MILLISECS(2000)
+#define MCE_PERIOD_MAX MILLISECS(16000)
+
+static uint64_t period = MCE_PERIOD;
+static int adjust = 0;
+static int variable_period = 1;
 
 static void mce_checkregs (void *info)
 {
-       u32 low, high;
-       int i;
-
-       for (i=firstbank; i<nr_mce_banks; i++) {
-               rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high);
-
-               if (high & (1<<31)) {
-                       printk(KERN_INFO "MCE: The hardware reports a non "
-                               "fatal, correctable incident occurred on "
-                               "CPU %d.\n",
-                               smp_processor_id());
-                       printk (KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
-
-                       /* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
-                       wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
-
-                       /* Serialize */
-                       wmb();
-                       add_taint(TAINT_MACHINE_CHECK);
+       mctelem_cookie_t mctc;
+       struct mca_summary bs;
+       static uint64_t dumpcount = 0;
+
+       mctc = mcheck_mca_logout(MCA_POLLER, bankmask, &bs);
+
+       if (bs.errcnt && mctc != NULL) {
+               adjust++;
+
+               /* If Dom0 enabled the VIRQ_MCA event, then notify it.
+                * Otherwise, if dom0 has had plenty of time to register
+                * the virq handler but still hasn't then dump telemetry
+                * to the Xen console.  The call count may be incremented
+                * on multiple cpus at once and is indicative only - just
+                * a simple-minded attempt to avoid spamming the console
+                * for corrected errors in early startup.
+                */
+
+               if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+                       mctelem_commit(mctc);
+                       send_guest_global_virq(dom0, VIRQ_MCA);
+               } else if (++dumpcount >= 10) {
+                       x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc));
+                       mctelem_dismiss(mctc);
+               } else {
+                       mctelem_dismiss(mctc);
                }
+       } else if (mctc != NULL) {
+               mctelem_dismiss(mctc);
        }
 }
 
 static void mce_work_fn(void *data)
 { 
        on_each_cpu(mce_checkregs, NULL, 1, 1);
-       set_timer(&mce_timer, NOW() + MCE_PERIOD);
+
+       if (variable_period) {
+               if (adjust)
+                       period /= (adjust + 1);
+               else
+                       period *= 2;
+               if (period > MCE_PERIOD_MAX)
+                       period = MCE_PERIOD_MAX;
+               if (period < MCE_PERIOD_MIN)
+                       period = MCE_PERIOD_MIN;
+       }
+
+       set_timer(&mce_timer, NOW() + period);
+       adjust = 0;
 }
 
 static int __init init_nonfatal_mce_checker(void)
@@ -61,12 +91,12 @@ static int __init init_nonfatal_mce_checker(void)
        struct cpuinfo_x86 *c = &boot_cpu_data;
 
        /* Check for MCE support */
-       if (!cpu_has(c, X86_FEATURE_MCE))
+       if (!mce_available(c))
                return -ENODEV;
 
-       /* Check for PPro style MCA */
-       if (!cpu_has(c, X86_FEATURE_MCA))
-               return -ENODEV;
+       memcpy(&bankmask, &mca_allbanks, sizeof (cpu_banks_t));
+       if (mce_firstbank(c) == 1)
+               clear_bit(0, bankmask);
 
        /*
         * Check for non-fatal errors every MCE_RATE s
@@ -74,7 +104,6 @@ static int __init init_nonfatal_mce_checker(void)
        switch (c->x86_vendor) {
        case X86_VENDOR_AMD:
                if (c->x86 == 6) { /* K7 */
-                       firstbank = 1;
                        init_timer(&mce_timer, mce_work_fn, NULL, 0);
                        set_timer(&mce_timer, NOW() + MCE_PERIOD);
                        break;
@@ -85,12 +114,19 @@ static int __init init_nonfatal_mce_checker(void)
                break;
 
        case X86_VENDOR_INTEL:
-               init_timer(&mce_timer, mce_work_fn, NULL, 0);
-               set_timer(&mce_timer, NOW() + MCE_PERIOD);
+               /*
+                * The P5 family is different. P4/P6 and latest CPUs share the
+                * same polling methods.
+                */
+               if ( c->x86 != 5 )
+               {
+                       init_timer(&mce_timer, mce_work_fn, NULL, 0);
+                       set_timer(&mce_timer, NOW() + MCE_PERIOD);
+               }
                break;
        }
 
-       printk(KERN_INFO "MCA: Machine check polling timer started.\n");
+       printk(KERN_INFO "mcheck_poll: Machine check polling timer started.\n");
        return 0;
 }
 __initcall(init_nonfatal_mce_checker);
index ac952af082c206ae3db6a47cc69c27eabff114ce..4106cbcf53b3b845ede8c4841bd35cd9ce9e7df8 100644 (file)
 #include <asm/msr.h>
 
 #include "mce.h"
+#include "x86_mca.h"
 
 /* Machine check handler for Pentium class Intel */
-static fastcall void pentium_machine_check(struct cpu_user_regs * regs, long error_code)
+static void pentium_machine_check(struct cpu_user_regs * regs, long error_code)
 {
        u32 loaddr, hi, lotype;
        rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
@@ -27,19 +28,14 @@ static fastcall void pentium_machine_check(struct cpu_user_regs * regs, long err
 }
 
 /* Set up machine check reporting for processors with Intel style MCE */
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
+int intel_p5_mcheck_init(struct cpuinfo_x86 *c)
 {
        u32 l, h;
        
-       /*Check for MCE support */
-       if( !cpu_has(c, X86_FEATURE_MCE) )
-               return; 
-
        /* Default P5 to off as its often misconnected */
        if(mce_disabled != -1)
-               return;
-       machine_check_vector = pentium_machine_check;
-       wmb();
+               return 0;
+       x86_mce_vector_register(pentium_machine_check);
 
        /* Read registers before enabling */
        rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
@@ -49,4 +45,6 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
        /* Enable MCE */
        set_in_cr4(X86_CR4_MCE);
        printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
+
+       return 1;
 }
index 12b3e6db2444533253925ce1de97a459dc9e61dc..6dede3796f40aa519b1299b2b9c66f78e2b7cffe 100644 (file)
 #include "mce.h"
 
 /* Machine check handler for WinChip C6 */
-static fastcall void winchip_machine_check(struct cpu_user_regs * regs, long error_code)
+static void winchip_machine_check(struct cpu_user_regs * regs, long error_code)
 {
        printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
        add_taint(TAINT_MACHINE_CHECK);
 }
 
 /* Set up machine check reporting on the Winchip C6 series */
-void winchip_mcheck_init(struct cpuinfo_x86 *c)
+int winchip_mcheck_init(struct cpuinfo_x86 *c)
 {
        u32 lo, hi;
-       machine_check_vector = winchip_machine_check;
+
        wmb();
+       x86_mce_vector_register(winchip_machine_check);
        rdmsr(MSR_IDT_FCR1, lo, hi);
        lo|= (1<<2);    /* Enable EIERRINT (int 18 MCE) */
        lo&= ~(1<<4);   /* Enable MCE */
        wrmsr(MSR_IDT_FCR1, lo, hi);
        set_in_cr4(X86_CR4_MCE);
        printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
+       return (1);
 }
index b20808e451d90fcd87e9494d506f78e9c36892a1..ac98744932bf93b2ccbddf3b548f47f79d3e5c29 100644 (file)
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#ifndef X86_MCA_H
+
+#define X86_MCA_H
+
 
 /* The MCA/MCE MSRs should not be used anywhere else.
  * They are cpu family/model specific and are only for use
 /* Bitfield of the MSR_IA32_MCG_CAP register */
 #define MCG_CAP_COUNT           0x00000000000000ffULL
 #define MCG_CTL_P               0x0000000000000100ULL
-/* Bits 9-63 are reserved */
+#define MCG_EXT_P              (1UL<<9)
+#define MCG_EXT_CNT            (16)
+#define MCG_CMCI_P             (1UL<<10)
+/* Other bits are reserved */
 
 /* Bitfield of the MSR_IA32_MCG_STATUS register */
 #define MCG_STATUS_RIPV         0x0000000000000001ULL
 /* reserved bits */
 #define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
 
+/* Bitfield of MSR_K8_HWCR register */
+#define K8_HWCR_MCi_STATUS_WREN                (1ULL << 18)
+
+/*Intel Specific bitfield*/
+#define CMCI_THRESHOLD                 0x2
+
+#include <asm/domain.h>
+typedef DECLARE_BITMAP(cpu_banks_t, MAX_NR_BANKS);
+DECLARE_PER_CPU(cpu_banks_t, mce_banks_owned);
+
+/* Below interfaces are defined for MCA internal processing:
+ * a. pre_handler will be called early in MCA ISR context, mainly for early
+ *    need_reset detection for avoiding log missing. Also, it is used to judge
+ *    impacted DOMAIN if possible.
+ * b. mca_error_handler is actually a (error_action_index,
+ *    recovery_handler pointer) pair. The defined recovery_handler
+ *    performs the actual recovery operations such as page_offline, cpu_offline
+ *    in softIRQ context when the per_bank MCA error matching the corresponding
+ *    mca_code index. If pre_handler can't judge the impacted domain,
+ *    recovery_handler must figure it out.
+*/
+
+/* MCA error has been recovered successfully by the recovery action*/
+#define MCA_RECOVERED (0x1 << 0)
+/* MCA error impact the specified DOMAIN in owner field below */
+#define MCA_OWNER (0x1 << 1)
+/* MCA error can't be recovered and need reset */
+#define MCA_NEED_RESET (0x1 << 2)
+/* MCA error need further actions in softIRQ context for recovery */
+#define MCA_MORE_ACTION (0x1 << 3)
+
+struct mca_handle_result
+{
+    uint32_t result;
+    /* Used one result & MCA_OWNER */
+    domid_t owner;
+    /* Used by mca_error_handler, result & MCA_RECOVERED */
+    struct recovery_action *action;
+};
+
+extern void (*mca_prehandler)( struct cpu_user_regs *regs,
+                        struct mca_handle_result *result);
+
+struct mca_error_handler
+{
+    /* Assume corresponding recovery action could be uniquely
+     * identified by mca_code. Otherwise, we might need to have
+     * a separate function to decode the corresponding actions
+     * for the particular mca error later.
+    */
+    uint16_t mca_code;
+    void (*recovery_handler)( struct mcinfo_bank *bank,
+                    struct mcinfo_global *global,
+                    struct mcinfo_extended *extension,
+                    struct mca_handle_result *result);
+};
+
+/* Global variables */
+extern int mce_disabled;
+extern unsigned int nr_mce_banks;
+
+#endif /* X86_MCA_H */
index 6e1498a0f14462567ef1781e47c8967b220f60c0..8d7a166f651070b645e50e57f1d79003584eb722 100644 (file)
@@ -58,7 +58,9 @@ DEFINE_PER_CPU(u64, efer);
 DEFINE_PER_CPU(unsigned long, cr4);
 
 static void default_idle(void);
+static void default_dead_idle(void);
 void (*pm_idle) (void) = default_idle;
+void (*dead_idle) (void) = default_dead_idle;
 
 static void paravirt_ctxt_switch_from(struct vcpu *v);
 static void paravirt_ctxt_switch_to(struct vcpu *v);
@@ -84,6 +86,12 @@ static void default_idle(void)
         local_irq_enable();
 }
 
+static void default_dead_idle(void)
+{
+    for ( ; ; )
+        halt();
+}
+
 static void play_dead(void)
 {
     /*
@@ -102,8 +110,7 @@ static void play_dead(void)
 
     /* With physical CPU hotplug, we should halt the cpu. */
     local_irq_disable();
-    for ( ; ; )
-        halt();
+    (*dead_idle)();
 }
 
 void idle_loop(void)
@@ -141,49 +148,82 @@ void dump_pageframe_info(struct domain *d)
     }
     else
     {
-        list_for_each_entry ( page, &d->page_list, list )
+        page_list_for_each ( page, &d->page_list )
         {
-            printk("    DomPage %p: caf=%08x, taf=%" PRtype_info "\n",
+            printk("    DomPage %p: caf=%08lx, taf=%" PRtype_info "\n",
                    _p(page_to_mfn(page)),
                    page->count_info, page->u.inuse.type_info);
         }
     }
 
-    list_for_each_entry ( page, &d->xenpage_list, list )
+    if ( is_hvm_domain(d) )
+    {
+        p2m_pod_dump_data(d);
+    }
+
+    page_list_for_each ( page, &d->xenpage_list )
     {
-        printk("    XenPage %p: caf=%08x, taf=%" PRtype_info "\n",
+        printk("    XenPage %p: caf=%08lx, taf=%" PRtype_info "\n",
                _p(page_to_mfn(page)),
                page->count_info, page->u.inuse.type_info);
     }
 }
 
+struct domain *alloc_domain_struct(void)
+{
+    struct domain *d;
+    /*
+     * We pack the MFN of the domain structure into a 32-bit field within
+     * the page_info structure. Hence the MEMF_bits() restriction.
+     */
+    d = alloc_xenheap_pages(
+        get_order_from_bytes(sizeof(*d)), MEMF_bits(32 + PAGE_SHIFT));
+    if ( d != NULL )
+        memset(d, 0, sizeof(*d));
+    return d;
+}
+
+void free_domain_struct(struct domain *d)
+{
+    free_xenheap_pages(d, get_order_from_bytes(sizeof(*d)));
+}
+
 struct vcpu *alloc_vcpu_struct(void)
 {
     struct vcpu *v;
-    if ( (v = xmalloc(struct vcpu)) != NULL )
+    /*
+     * This structure contains embedded PAE PDPTEs, used when an HVM guest
+     * runs on shadow pagetables outside of 64-bit mode. In this case the CPU
+     * may require that the shadow CR3 points below 4GB, and hence the whole
+     * structure must satisfy this restriction. Thus we specify MEMF_bits(32).
+     */
+    v = alloc_xenheap_pages(get_order_from_bytes(sizeof(*v)), MEMF_bits(32));
+    if ( v != NULL )
         memset(v, 0, sizeof(*v));
     return v;
 }
 
 void free_vcpu_struct(struct vcpu *v)
 {
-    xfree(v);
+    free_xenheap_pages(v, get_order_from_bytes(sizeof(*v)));
 }
 
 #ifdef CONFIG_COMPAT
 
 static int setup_compat_l4(struct vcpu *v)
 {
-    struct page_info *pg = alloc_domheap_page(NULL, 0);
+    struct page_info *pg;
     l4_pgentry_t *l4tab;
 
+    pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
     if ( pg == NULL )
         return -ENOMEM;
 
     /* This page needs to look like a pagetable so that it can be shadowed */
     pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
 
-    l4tab = copy_page(page_to_virt(pg), idle_pg_table);
+    l4tab = page_to_virt(pg);
+    copy_page(l4tab, idle_pg_table);
     l4tab[0] = l4e_empty();
     l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
         l4e_from_page(pg, __PAGE_HYPERVISOR);
@@ -309,12 +349,7 @@ int vcpu_initialise(struct vcpu *v)
         if ( is_idle_domain(d) )
         {
             v->arch.schedule_tail = continue_idle_domain;
-            if ( v->vcpu_id )
-                v->arch.cr3 = d->vcpu[0]->arch.cr3;
-            else if ( !*idle_vcpu )
-                v->arch.cr3 = __pa(idle_pg_table);
-            else if ( !(v->arch.cr3 = clone_idle_pagetable(v)) )
-                return -ENOMEM;
+            v->arch.cr3           = __pa(idle_pg_table);
         }
 
         v->arch.guest_context.ctrlreg[4] =
@@ -324,6 +359,8 @@ int vcpu_initialise(struct vcpu *v)
     v->arch.perdomain_ptes =
         d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
 
+    spin_lock_init(&v->arch.shadow_ldt_lock);
+
     return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
 }
 
@@ -336,6 +373,7 @@ void vcpu_destroy(struct vcpu *v)
         hvm_vcpu_destroy(v);
 }
 
+extern uint64_t g_mcg_cap;
 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
 {
 #ifdef __x86_64__
@@ -349,13 +387,15 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
         hvm_funcs.hap_supported &&
         (domcr_flags & DOMCRF_hap);
 
+    d->arch.s3_integrity = !!(domcr_flags & DOMCRF_s3_integrity);
+
     INIT_LIST_HEAD(&d->arch.pdev_list);
 
     d->arch.relmem = RELMEM_not_started;
-    INIT_LIST_HEAD(&d->arch.relmem_list);
+    INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
 
     pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
-    d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
+    d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order, 0);
     if ( d->arch.mm_perdomain_pt == NULL )
         goto fail;
     memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
@@ -403,7 +443,11 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
         if ( d->arch.ioport_caps == NULL )
             goto fail;
 
-        if ( (d->shared_info = alloc_xenheap_page()) == NULL )
+        /*
+         * The shared_info machine address must fit in a 32-bit field within a
+         * 32-bit guest's start_info structure. Hence we specify MEMF_bits(32).
+         */
+        if ( (d->shared_info = alloc_xenheap_pages(0, MEMF_bits(32))) == NULL )
             goto fail;
 
         clear_page(d->shared_info);
@@ -412,6 +456,16 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
 
         if ( (rc = iommu_domain_init(d)) != 0 )
             goto fail;
+
+        /* For Guest vMCE MSRs virtualization */
+        d->arch.vmca_msrs.mcg_status = 0x0;
+        d->arch.vmca_msrs.mcg_cap = g_mcg_cap;
+        d->arch.vmca_msrs.mcg_ctl = (uint64_t)~0x0;
+        d->arch.vmca_msrs.nr_injection = 0;
+        memset(d->arch.vmca_msrs.mci_ctl, 0x1,
+            sizeof(d->arch.vmca_msrs.mci_ctl));
+        INIT_LIST_HEAD(&d->arch.vmca_msrs.impact_header);
+
     }
 
     if ( is_hvm_domain(d) )
@@ -797,7 +851,7 @@ map_vcpu_info(struct vcpu *v, unsigned long mfn, unsigned offset)
      * lost.  The domain will get a spurious event, but it can cope.
      */
     vcpu_info(v, evtchn_upcall_pending) = 1;
-    for ( i = 0; i < BITS_PER_GUEST_LONG(d); i++ )
+    for ( i = 0; i < BITS_PER_EVTCHN_WORD(d); i++ )
         set_bit(i, &vcpu_info(v, evtchn_pending_sel));
 
     return 0;
@@ -1171,14 +1225,18 @@ static void paravirt_ctxt_switch_to(struct vcpu *v)
     }
 }
 
+static inline int need_full_gdt(struct vcpu *v)
+{
+    return (!is_hvm_vcpu(v) && !is_idle_vcpu(v));
+}
+
 static void __context_switch(void)
 {
     struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
-    unsigned int          i, cpu = smp_processor_id();
+    unsigned int          cpu = smp_processor_id();
     struct vcpu          *p = per_cpu(curr_vcpu, cpu);
     struct vcpu          *n = current;
     struct desc_struct   *gdt;
-    struct page_info     *page;
     struct desc_ptr       gdt_desc;
 
     ASSERT(p != n);
@@ -1207,16 +1265,19 @@ static void __context_switch(void)
 
     gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
                                   per_cpu(compat_gdt_table, cpu);
-    page = virt_to_page(gdt);
-    for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+    if ( need_full_gdt(n) )
     {
-        l1e_write(n->domain->arch.mm_perdomain_pt +
-                  (n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
-                  FIRST_RESERVED_GDT_PAGE + i,
-                  l1e_from_page(page + i, __PAGE_HYPERVISOR));
+        struct page_info *page = virt_to_page(gdt);
+        unsigned int i;
+        for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
+            l1e_write(n->domain->arch.mm_perdomain_pt +
+                      (n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+                      FIRST_RESERVED_GDT_PAGE + i,
+                      l1e_from_page(page + i, __PAGE_HYPERVISOR));
     }
 
-    if ( p->vcpu_id != n->vcpu_id )
+    if ( need_full_gdt(p) &&
+         ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) )
     {
         gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
         gdt_desc.base  = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
@@ -1225,8 +1286,10 @@ static void __context_switch(void)
 
     write_ptbase(n);
 
-    if ( p->vcpu_id != n->vcpu_id )
+    if ( need_full_gdt(n) &&
+         ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) )
     {
+        gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
         gdt_desc.base = GDT_VIRT_START(n);
         asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
     }
@@ -1255,11 +1318,11 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
             flush_tlb_mask(next->vcpu_dirty_cpumask);
     }
 
-    local_irq_disable();
-
     if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
         pt_save_timer(prev);
 
+    local_irq_disable();
+
     set_current(next);
 
     if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
@@ -1614,9 +1677,8 @@ int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...)
 #endif
 
 static int relinquish_memory(
-    struct domain *d, struct list_head *list, unsigned long type)
+    struct domain *d, struct page_list_head *list, unsigned long type)
 {
-    struct list_head *ent;
     struct page_info  *page;
     unsigned long     x, y;
     int               ret = 0;
@@ -1624,47 +1686,35 @@ static int relinquish_memory(
     /* Use a recursive lock, as we may enter 'free_domheap_page'. */
     spin_lock_recursive(&d->page_alloc_lock);
 
-    ent = list->next;
-    while ( ent != list )
+    while ( (page = page_list_remove_head(list)) )
     {
-        page = list_entry(ent, struct page_info, list);
-
         /* Grab a reference to the page so it won't disappear from under us. */
         if ( unlikely(!get_page(page, d)) )
         {
             /* Couldn't get a reference -- someone is freeing this page. */
-            ent = ent->next;
-            list_move_tail(&page->list, &d->arch.relmem_list);
+            page_list_add_tail(page, &d->arch.relmem_list);
             continue;
         }
 
         if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
-            put_page_and_type(page);
+            ret = put_page_and_type_preemptible(page, 1);
+        switch ( ret )
+        {
+        case 0:
+            break;
+        case -EAGAIN:
+        case -EINTR:
+            page_list_add(page, list);
+            set_bit(_PGT_pinned, &page->u.inuse.type_info);
+            put_page(page);
+            goto out;
+        default:
+            BUG();
+        }
 
         if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
             put_page(page);
 
-#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
-        /*
-         * Forcibly drop reference counts of page tables above top most (which
-         * were skipped to prevent long latencies due to deep recursion - see
-         * the special treatment in free_lX_table()).
-         */
-        y = page->u.inuse.type_info;
-        if ( (type < PGT_root_page_table) &&
-             unlikely(((y + PGT_type_mask) &
-                       (PGT_type_mask|PGT_validated)) == type) )
-        {
-            BUG_ON((y & PGT_count_mask) >=
-                   (page->count_info & PGC_count_mask));
-            while ( y & PGT_count_mask )
-            {
-                put_page_and_type(page);
-                y = page->u.inuse.type_info;
-            }
-        }
-#endif
-
         /*
          * Forcibly invalidate top-most, still valid page tables at this point
          * to break circular 'linear page table' references as well as clean up
@@ -1685,15 +1735,39 @@ static int relinquish_memory(
                         x & ~(PGT_validated|PGT_partial));
             if ( likely(y == x) )
             {
-                if ( free_page_type(page, x, 0) != 0 )
+                /* No need for atomic update of type_info here: noone else updates it. */
+                switch ( ret = free_page_type(page, x, 1) )
+                {
+                case 0:
+                    break;
+                case -EINTR:
+                    page_list_add(page, list);
+                    page->u.inuse.type_info |= PGT_validated;
+                    if ( x & PGT_partial )
+                        put_page(page);
+                    put_page(page);
+                    ret = -EAGAIN;
+                    goto out;
+                case -EAGAIN:
+                    page_list_add(page, list);
+                    page->u.inuse.type_info |= PGT_partial;
+                    if ( x & PGT_partial )
+                        put_page(page);
+                    goto out;
+                default:
                     BUG();
+                }
+                if ( x & PGT_partial )
+                {
+                    page->u.inuse.type_info--;
+                    put_page(page);
+                }
                 break;
             }
         }
 
-        /* Follow the list chain and /then/ potentially free the page. */
-        ent = ent->next;
-        list_move_tail(&page->list, &d->arch.relmem_list);
+        /* Put the page on the list and /then/ potentially free it. */
+        page_list_add_tail(page, &d->arch.relmem_list);
         put_page(page);
 
         if ( hypercall_preempt_check() )
@@ -1703,7 +1777,12 @@ static int relinquish_memory(
         }
     }
 
-    list_splice_init(&d->arch.relmem_list, list);
+    /* list is empty at this point. */
+    if ( !page_list_empty(&d->arch.relmem_list) )
+    {
+        *list = d->arch.relmem_list;
+        INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
+    }
 
  out:
     spin_unlock_recursive(&d->page_alloc_lock);
@@ -1794,6 +1873,13 @@ int domain_relinquish_resources(struct domain *d)
             unmap_vcpu_info(v);
         }
 
+        if ( d->arch.pirq_eoi_map != NULL )
+        {
+            unmap_domain_page_global(d->arch.pirq_eoi_map);
+            put_page_and_type(mfn_to_page(d->arch.pirq_eoi_map_mfn));
+            d->arch.pirq_eoi_map = NULL;
+        }
+
         d->arch.relmem = RELMEM_xen;
         /* fallthrough */
 
@@ -1831,20 +1917,12 @@ int domain_relinquish_resources(struct domain *d)
         /* fallthrough */
 
     case RELMEM_done:
-#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
-        ret = relinquish_memory(d, &d->page_list, PGT_l1_page_table);
-        if ( ret )
-            return ret;
-#endif
         break;
 
     default:
         BUG();
     }
 
-    /* Free page used by xen oprofile buffer. */
-    free_xenoprof_pages(d);
-
     if ( is_hvm_domain(d) )
         hvm_domain_relinquish_resources(d);
 
@@ -1892,6 +1970,54 @@ void domain_cpuid(
     *eax = *ebx = *ecx = *edx = 0;
 }
 
+void vcpu_kick(struct vcpu *v)
+{
+    /*
+     * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
+     * pending flag. These values may fluctuate (after all, we hold no
+     * locks) but the key insight is that each change will cause
+     * evtchn_upcall_pending to be polled.
+     * 
+     * NB2. We save the running flag across the unblock to avoid a needless
+     * IPI for domains that we IPI'd to unblock.
+     */
+    bool_t running = v->is_running;
+    vcpu_unblock(v);
+    if ( running && (in_irq() || (v != current)) )
+        cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
+}
+
+void vcpu_mark_events_pending(struct vcpu *v)
+{
+    int already_pending = test_and_set_bit(
+        0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
+
+    if ( already_pending )
+        return;
+
+    if ( is_hvm_vcpu(v) )
+        hvm_assert_evtchn_irq(v);
+    else
+        vcpu_kick(v);
+}
+
+static void vcpu_kick_softirq(void)
+{
+    /*
+     * Nothing to do here: we merely prevent notifiers from racing with checks
+     * executed on return to guest context with interrupts enabled. See, for
+     * example, xxx_intr_assist() executed on return to HVM guest context.
+     */
+}
+
+static int __init init_vcpu_kick_softirq(void)
+{
+    open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
+    return 0;
+}
+__initcall(init_vcpu_kick_softirq);
+
+
 /*
  * Local variables:
  * mode: C
index 5bd7714ba067c8e3d051a2b62c1d4f89980e0ac7..6cd4f042871b85a5041e3dcf48873f0d0a9a767f 100644 (file)
@@ -19,6 +19,7 @@
 #include <xen/iocap.h>
 #include <xen/bitops.h>
 #include <xen/compat.h>
+#include <xen/libelf.h>
 #include <asm/regs.h>
 #include <asm/system.h>
 #include <asm/io.h>
@@ -30,7 +31,9 @@
 #include <asm/e820.h>
 
 #include <public/version.h>
-#include <public/libelf.h>
+
+int __init bzimage_parse(
+    char *output, char **image_start, unsigned long *image_len);
 
 extern unsigned long initial_images_nrpages(void);
 extern void discard_initial_images(void);
@@ -196,7 +199,8 @@ static void __init process_dom0_ioports_disable(void)
 
 int __init construct_dom0(
     struct domain *d,
-    unsigned long _image_start, unsigned long image_len, 
+    unsigned long _image_base,
+    unsigned long _image_start, unsigned long image_len,
     unsigned long _initrd_start, unsigned long initrd_len,
     char *cmdline)
 {
@@ -213,9 +217,11 @@ int __init construct_dom0(
     struct vcpu *v = d->vcpu[0];
     unsigned long long value;
 #if defined(__i386__)
+    char *image_base   = (char *)_image_base;   /* use lowmem mappings */
     char *image_start  = (char *)_image_start;  /* use lowmem mappings */
     char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */
 #elif defined(__x86_64__)
+    char *image_base   = __va(_image_base);
     char *image_start  = __va(_image_start);
     char *initrd_start = __va(_initrd_start);
 #endif
@@ -262,6 +268,9 @@ int __init construct_dom0(
 
     nr_pages = compute_dom0_nr_pages();
 
+    if ( (rc = bzimage_parse(image_base, &image_start, &image_len)) != 0 )
+        return rc;
+
     if ( (rc = elf_init(&elf, image_start, image_len)) != 0 )
         return rc;
 #ifdef VERBOSE
@@ -341,6 +350,12 @@ int __init construct_dom0(
 #endif
     }
 
+    if ( (parms.p2m_base != UNSET_ADDR) && elf_32bit(&elf) )
+    {
+        printk(XENLOG_WARNING "P2M table base ignored\n");
+        parms.p2m_base = UNSET_ADDR;
+    }
+
     domain_set_alloc_bitsize(d);
 
     /*
@@ -359,6 +374,8 @@ int __init construct_dom0(
     vphysmap_end     = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ?
                                                      sizeof(unsigned long) :
                                                      sizeof(unsigned int)));
+    if ( parms.p2m_base != UNSET_ADDR )
+        vphysmap_end = vphysmap_start;
     vstartinfo_start = round_pgup(vphysmap_end);
     vstartinfo_end   = (vstartinfo_start +
                         sizeof(struct start_info) +
@@ -400,6 +417,11 @@ int __init construct_dom0(
     /* Ensure that our low-memory 1:1 mapping covers the allocation. */
     page = alloc_domheap_pages(d, order, MEMF_bits(30));
 #else
+    if ( parms.p2m_base != UNSET_ADDR )
+    {
+        vphysmap_start = parms.p2m_base;
+        vphysmap_end   = vphysmap_start + nr_pages * sizeof(unsigned long);
+    }
     page = alloc_domheap_pages(d, order, 0);
 #endif
     if ( page == NULL )
@@ -430,14 +452,6 @@ int __init construct_dom0(
            _p(v_start), _p(v_end));
     printk(" ENTRY ADDRESS: %p\n", _p(parms.virt_entry));
 
-    if ( ((v_end - v_start)>>PAGE_SHIFT) > nr_pages )
-    {
-        printk("Initial guest OS requires too much space\n"
-               "(%luMB is greater than %luMB limit)\n",
-               (v_end-v_start)>>20, nr_pages>>(20-PAGE_SHIFT));
-        return -ENOMEM;
-    }
-
     mpt_alloc = (vpt_start - v_start) +
         (unsigned long)pfn_to_paddr(alloc_spfn);
 
@@ -455,8 +469,9 @@ int __init construct_dom0(
     /* WARNING: The new domain must have its 'processor' field filled in! */
     l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
     l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
-    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
-    for (i = 0; i < 4; i++) {
+    for (i = 0; i < L3_PAGETABLE_ENTRIES; i++) {
+        copy_page(l2tab + i * L2_PAGETABLE_ENTRIES,
+                  idle_pg_table_l2 + i * L2_PAGETABLE_ENTRIES);
         l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
         l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
             l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
@@ -700,13 +715,12 @@ int __init construct_dom0(
         (void)alloc_vcpu(d, i, i % num_online_cpus());
 
     /* Set up CR3 value for write_ptbase */
-    if ( paging_mode_enabled(v->domain) )
+    if ( paging_mode_enabled(d) )
         paging_update_paging_modes(v);
     else
         update_cr3(v);
 
-    /* Install the new page tables. */
-    local_irq_disable();
+    /* We run on dom0's page tables for the final part of the build process. */
     write_ptbase(v);
 
     /* Copy the OS image and free temporary buffer. */
@@ -719,11 +733,11 @@ int __init construct_dom0(
              (parms.virt_hypercall >= v_end) )
         {
             write_ptbase(current);
-            local_irq_enable();
             printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
             return -1;
         }
-        hypercall_page_initialise(d, (void *)(unsigned long)parms.virt_hypercall);
+        hypercall_page_initialise(
+            d, (void *)(unsigned long)parms.virt_hypercall);
     }
 
     /* Copy the initial ramdisk. */
@@ -748,8 +762,109 @@ int __init construct_dom0(
     snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s",
              elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : "");
 
+    count = d->tot_pages;
+#ifdef __x86_64__
+    /* Set up the phys->machine table if not part of the initial mapping. */
+    if ( parms.p2m_base != UNSET_ADDR )
+    {
+        unsigned long va = vphysmap_start;
+
+        if ( v_start <= vphysmap_end && vphysmap_start <= v_end )
+            panic("DOM0 P->M table overlaps initial mapping");
+
+        while ( va < vphysmap_end )
+        {
+            if ( d->tot_pages + ((round_pgup(vphysmap_end) - va)
+                                 >> PAGE_SHIFT) + 3 > nr_pages )
+                panic("Dom0 allocation too small for initial P->M table.\n");
+
+            l4tab = l4start + l4_table_offset(va);
+            if ( !l4e_get_intpte(*l4tab) )
+            {
+                page = alloc_domheap_page(d, 0);
+                if ( !page )
+                    break;
+                /* No mapping, PGC_allocated + page-table page. */
+                page->count_info = PGC_allocated | 2;
+                page->u.inuse.type_info =
+                    PGT_l3_page_table | PGT_validated | 1;
+                clear_page(page_to_virt(page));
+                *l4tab = l4e_from_page(page, L4_PROT);
+            }
+            l3tab = page_to_virt(l4e_get_page(*l4tab));
+            l3tab += l3_table_offset(va);
+            if ( !l3e_get_intpte(*l3tab) )
+            {
+                if ( cpu_has_page1gb &&
+                     !(va & ((1UL << L3_PAGETABLE_SHIFT) - 1)) &&
+                     vphysmap_end >= va + (1UL << L3_PAGETABLE_SHIFT) &&
+                     (page = alloc_domheap_pages(d,
+                                                 L3_PAGETABLE_SHIFT -
+                                                     PAGE_SHIFT,
+                                                 0)) != NULL )
+                {
+                    *l3tab = l3e_from_page(page,
+                                           L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
+                    va += 1UL << L3_PAGETABLE_SHIFT;
+                    continue;
+                }
+                if ( (page = alloc_domheap_page(d, 0)) == NULL )
+                    break;
+                else
+                {
+                    /* No mapping, PGC_allocated + page-table page. */
+                    page->count_info = PGC_allocated | 2;
+                    page->u.inuse.type_info =
+                        PGT_l2_page_table | PGT_validated | 1;
+                    clear_page(page_to_virt(page));
+                    *l3tab = l3e_from_page(page, L3_PROT);
+                }
+            }
+            l2tab = page_to_virt(l3e_get_page(*l3tab));
+            l2tab += l2_table_offset(va);
+            if ( !l2e_get_intpte(*l2tab) )
+            {
+                if ( !(va & ((1UL << L2_PAGETABLE_SHIFT) - 1)) &&
+                     vphysmap_end >= va + (1UL << L2_PAGETABLE_SHIFT) &&
+                     (page = alloc_domheap_pages(d,
+                                                 L2_PAGETABLE_SHIFT -
+                                                     PAGE_SHIFT,
+                                                 0)) != NULL )
+                {
+                    *l2tab = l2e_from_page(page,
+                                           L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
+                    va += 1UL << L2_PAGETABLE_SHIFT;
+                    continue;
+                }
+                if ( (page = alloc_domheap_page(d, 0)) == NULL )
+                    break;
+                else
+                {
+                    /* No mapping, PGC_allocated + page-table page. */
+                    page->count_info = PGC_allocated | 2;
+                    page->u.inuse.type_info =
+                        PGT_l1_page_table | PGT_validated | 1;
+                    clear_page(page_to_virt(page));
+                    *l2tab = l2e_from_page(page, L2_PROT);
+                }
+            }
+            l1tab = page_to_virt(l2e_get_page(*l2tab));
+            l1tab += l1_table_offset(va);
+            BUG_ON(l1e_get_intpte(*l1tab));
+            page = alloc_domheap_page(d, 0);
+            if ( !page )
+                break;
+            *l1tab = l1e_from_page(page, L1_PROT|_PAGE_DIRTY);
+            va += PAGE_SIZE;
+            va &= PAGE_MASK;
+        }
+        if ( !page )
+            panic("Not enough RAM for DOM0 P->M table.\n");
+    }
+#endif
+
     /* Write the phys->machine and machine->phys table entries. */
-    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
+    for ( pfn = 0; pfn < count; pfn++ )
     {
         mfn = pfn + alloc_spfn;
 #ifndef NDEBUG
@@ -763,6 +878,26 @@ int __init construct_dom0(
             ((unsigned int *)vphysmap_start)[pfn] = mfn;
         set_gpfn_from_mfn(mfn, pfn);
     }
+    si->first_p2m_pfn = pfn;
+    si->nr_p2m_frames = d->tot_pages - count;
+    page_list_for_each ( page, &d->page_list )
+    {
+        mfn = page_to_mfn(page);
+        if ( get_gpfn_from_mfn(mfn) >= count )
+        {
+            BUG_ON(is_pv_32bit_domain(d));
+            if ( !page->u.inuse.type_info &&
+                 !get_page_and_type(page, d, PGT_writable_page) )
+                BUG();
+            ((unsigned long *)vphysmap_start)[pfn] = mfn;
+            set_gpfn_from_mfn(mfn, pfn);
+            ++pfn;
+#ifndef NDEBUG
+            ++alloc_epfn;
+#endif
+        }
+    }
+    BUG_ON(pfn != d->tot_pages);
     while ( pfn < nr_pages )
     {
         if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
@@ -804,9 +939,8 @@ int __init construct_dom0(
         xlat_start_info(si, XLAT_start_info_console_dom0);
 #endif
 
-    /* Reinstate the caller's page tables. */
+    /* Return to idle domain's page tables. */
     write_ptbase(current);
-    local_irq_enable();
 
 #if defined(__i386__)
     /* Destroy low mappings - they were only for our convenience. */
index a145583137fd43f21e3abf1b785e4eff24b37b92..7f4d7dbbead7a17c95b5f99384e356c1a2662f70 100644 (file)
@@ -240,7 +240,7 @@ long arch_do_domctl(
         struct domain *d = rcu_lock_domain_by_id(domctl->domain);
         unsigned long max_pfns = domctl->u.getmemlist.max_pfns;
         uint64_t mfn;
-        struct list_head *list_ent;
+        struct page_info *page;
 
         ret = -EINVAL;
         if ( d != NULL )
@@ -259,19 +259,19 @@ long arch_do_domctl(
                 goto getmemlist_out;
             }
 
-            ret = 0;
-            list_ent = d->page_list.next;
-            for ( i = 0; (i < max_pfns) && (list_ent != &d->page_list); i++ )
+            ret = i = 0;
+            page_list_for_each(page, &d->page_list)
             {
-                mfn = page_to_mfn(list_entry(
-                    list_ent, struct page_info, list));
+                if ( i >= max_pfns )
+                    break;
+                mfn = page_to_mfn(page);
                 if ( copy_to_guest_offset(domctl->u.getmemlist.buffer,
                                           i, &mfn, 1) )
                 {
                     ret = -EFAULT;
                     break;
                 }
-                list_ent = mfn_to_page(mfn)->list.next;
+                ++i;
             }
             
             spin_unlock(&d->page_alloc_lock);
@@ -326,13 +326,9 @@ long arch_do_domctl(
 
     case XEN_DOMCTL_sethvmcontext:
     { 
-        struct hvm_domain_context c;
-        struct domain             *d;
+        struct hvm_domain_context c = { .size = domctl->u.hvmcontext.size };
+        struct domain *d;
 
-        c.cur = 0;
-        c.size = domctl->u.hvmcontext.size;
-        c.data = NULL;
-        
         ret = -ESRCH;
         if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
             break;
@@ -367,8 +363,8 @@ long arch_do_domctl(
 
     case XEN_DOMCTL_gethvmcontext:
     { 
-        struct hvm_domain_context c;
-        struct domain             *d;
+        struct hvm_domain_context c = { 0 };
+        struct domain *d;
 
         ret = -ESRCH;
         if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
@@ -382,9 +378,7 @@ long arch_do_domctl(
         if ( !is_hvm_domain(d) ) 
             goto gethvmcontext_out;
 
-        c.cur = 0;
         c.size = hvm_save_size(d);
-        c.data = NULL;
 
         if ( guest_handle_is_null(domctl->u.hvmcontext.buffer) )
         {
@@ -423,6 +417,34 @@ long arch_do_domctl(
     }
     break;
 
+    case XEN_DOMCTL_gethvmcontext_partial:
+    { 
+        struct domain *d;
+
+        ret = -ESRCH;
+        if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
+            break;
+
+        ret = xsm_hvmcontext(d, domctl->cmd);
+        if ( ret )
+            goto gethvmcontext_partial_out;
+
+        ret = -EINVAL;
+        if ( !is_hvm_domain(d) ) 
+            goto gethvmcontext_partial_out;
+
+        domain_pause(d);
+        ret = hvm_save_one(d, domctl->u.hvmcontext_partial.type,
+                           domctl->u.hvmcontext_partial.instance,
+                           domctl->u.hvmcontext_partial.buffer);
+        domain_unpause(d);
+
+    gethvmcontext_partial_out:
+        rcu_unlock_domain(d);
+    }
+    break;
+
+
     case XEN_DOMCTL_set_address_size:
     {
         struct domain *d;
@@ -472,7 +494,8 @@ long arch_do_domctl(
             break;
         }
 
-        domctl->u.address_size.size = BITS_PER_GUEST_LONG(d);
+        domctl->u.address_size.size =
+            is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG;
 
         ret = 0;
         rcu_unlock_domain(d);
@@ -671,14 +694,6 @@ long arch_do_domctl(
         }
 
         ret = -EINVAL;
-        if ( device_assigned(bus, devfn) )
-        {
-            gdprintk(XENLOG_ERR, "XEN_DOMCTL_assign_device: "
-                     "%x:%x:%x already assigned, or non-existent\n",
-                     bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
-            put_domain(d);
-            break;
-        }
 
         ret = assign_device(d, bus, devfn);
         if ( ret )
@@ -721,15 +736,10 @@ long arch_do_domctl(
             put_domain(d);
             break;
         }
-
-        if ( !device_assigned(bus, devfn) )
-        {
-            put_domain(d);
-            break;
-        }
-
         ret = 0;
-        deassign_device(d, bus, devfn);
+        spin_lock(&pcidevs_lock);
+        ret = deassign_device(d, bus, devfn);
+        spin_unlock(&pcidevs_lock);
         gdprintk(XENLOG_INFO, "XEN_DOMCTL_deassign_device: bdf = %x:%x:%x\n",
             bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
 
@@ -754,7 +764,11 @@ long arch_do_domctl(
 
         ret = -ESRCH;
         if ( iommu_enabled )
+        {
+            spin_lock(&pcidevs_lock);
             ret = pt_irq_create_bind_vtd(d, bind);
+            spin_unlock(&pcidevs_lock);
+        }
         if ( ret < 0 )
             gdprintk(XENLOG_ERR, "pt_irq_create_bind failed!\n");
 
@@ -773,7 +787,11 @@ long arch_do_domctl(
             break;
         bind = &(domctl->u.bind_pt_irq);
         if ( iommu_enabled )
+        {
+            spin_lock(&pcidevs_lock);
             ret = pt_irq_destroy_bind_vtd(d, bind);
+            spin_unlock(&pcidevs_lock);
+        }
         if ( ret < 0 )
             gdprintk(XENLOG_ERR, "pt_irq_destroy_bind failed!\n");
         rcu_unlock_domain(d);
@@ -1043,6 +1061,32 @@ long arch_do_domctl(
     }
     break;
 
+    case XEN_DOMCTL_debug_op:
+    {
+        struct domain *d;
+        struct vcpu *v;
+
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(domctl->domain);
+        if ( d == NULL )
+            break;
+
+        ret = -EINVAL;
+        if ( (domctl->u.debug_op.vcpu >= MAX_VIRT_CPUS) ||
+             ((v = d->vcpu[domctl->u.debug_op.vcpu]) == NULL) )
+            goto debug_op_out;
+
+        ret = -EINVAL;
+        if ( !is_hvm_domain(d))
+            goto debug_op_out;
+
+        ret = hvm_debug_op(v, domctl->u.debug_op.op);
+
+    debug_op_out:
+        rcu_unlock_domain(d);
+    }
+    break;
+
     default:
         ret = -ENOSYS;
         break;
@@ -1074,11 +1118,24 @@ void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
 
     if ( is_hvm_vcpu(v) )
     {
+        struct segment_register sreg;
         memset(c.nat->ctrlreg, 0, sizeof(c.nat->ctrlreg));
         c.nat->ctrlreg[0] = v->arch.hvm_vcpu.guest_cr[0];
         c.nat->ctrlreg[2] = v->arch.hvm_vcpu.guest_cr[2];
         c.nat->ctrlreg[3] = v->arch.hvm_vcpu.guest_cr[3];
         c.nat->ctrlreg[4] = v->arch.hvm_vcpu.guest_cr[4];
+        hvm_get_segment_register(v, x86_seg_cs, &sreg);
+        c.nat->user_regs.cs = sreg.sel;
+        hvm_get_segment_register(v, x86_seg_ss, &sreg);
+        c.nat->user_regs.ss = sreg.sel;
+        hvm_get_segment_register(v, x86_seg_ds, &sreg);
+        c.nat->user_regs.ds = sreg.sel;
+        hvm_get_segment_register(v, x86_seg_es, &sreg);
+        c.nat->user_regs.es = sreg.sel;
+        hvm_get_segment_register(v, x86_seg_fs, &sreg);
+        c.nat->user_regs.fs = sreg.sel;
+        hvm_get_segment_register(v, x86_seg_gs, &sreg);
+        c.nat->user_regs.gs = sreg.sel;
     }
     else
     {
index 8b79c5ea176e97f29e5020998f967120790de050..125c8ff5f4a9951a20d07b3d8cf21d37c78e7c5c 100644 (file)
@@ -1,10 +1,10 @@
 #include <xen/config.h>
 #include <xen/init.h>
 #include <xen/lib.h>
+#include <xen/mm.h>
 #include <xen/compat.h>
 #include <xen/dmi.h>
 #include <asm/e820.h>
-#include <asm/mm.h>
 #include <asm/page.h>
 
 /* opt_mem: Limit of physical RAM. Any RAM beyond this point is ignored. */
@@ -391,8 +391,9 @@ static void __init machine_specific_memory_setup(
     reserve_dmi_region();
 }
 
-/* Reserve RAM area (@s,@e) in the specified e820 map. */
-int __init reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e)
+int __init e820_change_range_type(
+    struct e820map *e820, uint64_t s, uint64_t e,
+    uint32_t orig_type, uint32_t new_type)
 {
     uint64_t rs = 0, re = 0;
     int i;
@@ -406,55 +407,79 @@ int __init reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e)
             break;
     }
 
-    if ( (i == e820->nr_map) || (e820->map[i].type != E820_RAM) )
+    if ( (i == e820->nr_map) || (e820->map[i].type != orig_type) )
         return 0;
 
     if ( (s == rs) && (e == re) )
     {
-        /* Complete excision. */
-        memmove(&e820->map[i], &e820->map[i+1],
-                (e820->nr_map-i-1) * sizeof(e820->map[0]));
-        e820->nr_map--;
-    }
-    else if ( s == rs )
-    {
-        /* Truncate start. */
-        e820->map[i].addr += e - s;
-        e820->map[i].size -= e - s;
+        e820->map[i].type = new_type;
     }
-    else if ( e == re )
+    else if ( (s == rs) || (e == re) )
     {
-        /* Truncate end. */
-        e820->map[i].size -= e - s;
-    }
-    else if ( e820->nr_map < ARRAY_SIZE(e820->map) )
-    {
-        /* Split in two. */
+        if ( (e820->nr_map + 1) > ARRAY_SIZE(e820->map) )
+            goto overflow;
+
         memmove(&e820->map[i+1], &e820->map[i],
                 (e820->nr_map-i) * sizeof(e820->map[0]));
         e820->nr_map++;
-        e820->map[i].size = s - rs;
-        i++;
-        e820->map[i].addr = e;
-        e820->map[i].size = re - e;
-    }
-    else
-    {
-        /* e820map is at maximum size. We have to leak some space. */
-        if ( (s - rs) > (re - e) )
+
+        if ( s == rs )
         {
-            printk("e820 overflow: leaking RAM %"PRIx64"-%"PRIx64"\n", e, re);
-            e820->map[i].size = s - rs;
+            e820->map[i].size = e - s;
+            e820->map[i].type = new_type;
+            e820->map[i+1].addr = e;
+            e820->map[i+1].size = re - e;
         }
         else
         {
-            printk("e820 overflow: leaking RAM %"PRIx64"-%"PRIx64"\n", rs, s);
-            e820->map[i].addr = e;
-            e820->map[i].size = re - e;
+            e820->map[i].size = s - rs;
+            e820->map[i+1].addr = s;
+            e820->map[i+1].size = e - s;
+            e820->map[i+1].type = new_type;
         }
     }
+    else if ( e820->nr_map+1 < ARRAY_SIZE(e820->map) )
+    {
+        if ( (e820->nr_map + 2) > ARRAY_SIZE(e820->map) )
+            goto overflow;
+
+        memmove(&e820->map[i+2], &e820->map[i],
+                (e820->nr_map-i) * sizeof(e820->map[0]));
+        e820->nr_map += 2;
+
+        e820->map[i].size = s - rs;
+        e820->map[i+1].addr = s;
+        e820->map[i+1].size = e - s;
+        e820->map[i+1].type = new_type;
+        e820->map[i+2].addr = e;
+        e820->map[i+2].size = re - e;
+    }
+
+    /* Finally, look for any opportunities to merge adjacent e820 entries. */
+    for ( i = 0; i < (e820->nr_map - 1); i++ )
+    {
+        if ( (e820->map[i].type != e820->map[i+1].type) ||
+             ((e820->map[i].addr + e820->map[i].size) != e820->map[i+1].addr) )
+            continue;
+        e820->map[i].size += e820->map[i+1].size;
+        memmove(&e820->map[i+1], &e820->map[i+2],
+                (e820->nr_map-i-2) * sizeof(e820->map[0]));
+        e820->nr_map--;
+        i--;
+    }
 
     return 1;
+
+ overflow:
+    printk("Overflow in e820 while reserving region %"PRIx64"-%"PRIx64"\n",
+           s, e);
+    return 0;
+}
+
+/* Set E820_RAM area (@s,@e) as RESERVED in specified e820 map. */
+int __init reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e)
+{
+    return e820_change_range_type(e820, s, e, E820_RAM, E820_RESERVED);
 }
 
 unsigned long __init init_e820(
index 5a660245b663c34e5cb27ced65a4ba5fbdb568ea..97d09793057c91ae02437d2bb4f7d0624ac5e5d5 100644 (file)
 #include <xen/timer.h>
 #include <xen/smp.h>
 #include <xen/softirq.h>
+#include <xen/irq.h>
 #include <asm/fixmap.h>
 #include <asm/div64.h>
 #include <asm/hpet.h>
-
-#define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1))
+#include <asm/msi.h>
+#include <mach_apic.h>
 
 #define MAX_DELTA_NS MILLISECS(10*1000)
 #define MIN_DELTA_NS MICROSECS(20)
 
+#define MAX_HPET_NUM 32
+
+#define HPET_EVT_USED_BIT   2
+#define HPET_EVT_USED       (1 << HPET_EVT_USED_BIT)
+
 struct hpet_event_channel
 {
     unsigned long mult;
@@ -27,11 +33,31 @@ struct hpet_event_channel
     cpumask_t     cpumask;
     spinlock_t    lock;
     void          (*event_handler)(struct hpet_event_channel *);
-};
-static struct hpet_event_channel hpet_event;
+
+    unsigned int idx;   /* physical channel idx */
+    int cpu;            /* msi target */
+    unsigned int vector;/* msi vector */
+    unsigned int flags; /* HPET_EVT_x */
+} __cacheline_aligned;
+static struct hpet_event_channel legacy_hpet_event;
+static struct hpet_event_channel hpet_events[MAX_HPET_NUM];
+static unsigned int num_hpets_used; /* msi hpet channels used for broadcast */
+
+DEFINE_PER_CPU(struct hpet_event_channel *, cpu_bc_channel);
+
+static int vector_channel[NR_IRQS] = {[0 ... NR_IRQS-1] = -1};
+
+#define vector_to_channel(vector)   vector_channel[vector]
 
 unsigned long hpet_address;
 
+void msi_compose_msg(struct pci_dev *pdev, int vector, struct msi_msg *msg);
+
+/* force_hpet_broadcast: if true, force using hpet_broadcast to fix lapic stop
+   issue for deep C state with pit disabled */
+int force_hpet_broadcast;
+boolean_param("hpetbroadcast", force_hpet_broadcast);
+
 /*
  * Calculate a multiplication factor for scaled math, which is used to convert
  * nanoseconds based values to clock ticks:
@@ -65,7 +91,7 @@ static inline unsigned long ns2ticks(unsigned long nsec, int shift,
     return (unsigned long) tmp;
 }
 
-static int hpet_legacy_next_event(unsigned long delta)
+static int hpet_next_event(unsigned long delta, int timer)
 {
     uint32_t cnt, cmp;
     unsigned long flags;
@@ -73,7 +99,7 @@ static int hpet_legacy_next_event(unsigned long delta)
     local_irq_save(flags);
     cnt = hpet_read32(HPET_COUNTER);
     cmp = cnt + delta;
-    hpet_write32(cmp, HPET_T0_CMP);
+    hpet_write32(cmp, HPET_Tn_CMP(timer));
     cmp = hpet_read32(HPET_COUNTER);
     local_irq_restore(flags);
 
@@ -103,7 +129,7 @@ static int reprogram_hpet_evt_channel(
     if ( expire == STIME_MAX )
     {
         /* We assume it will take a long time for the timer to wrap. */
-        hpet_write32(0, HPET_T0_CMP);
+        hpet_write32(0, HPET_Tn_CMP(ch->idx));
         return 0;
     }
 
@@ -111,11 +137,11 @@ static int reprogram_hpet_evt_channel(
     delta = max_t(int64_t, delta, MIN_DELTA_NS);
     delta = ns2ticks(delta, ch->shift, ch->mult);
 
-    ret = hpet_legacy_next_event(delta);
+    ret = hpet_next_event(delta, ch->idx);
     while ( ret && force )
     {
         delta += delta;
-        ret = hpet_legacy_next_event(delta);
+        ret = hpet_next_event(delta, ch->idx);
     }
 
     return ret;
@@ -146,7 +172,7 @@ static void handle_hpet_broadcast(struct hpet_event_channel *ch)
     s_time_t now, next_event;
     int cpu;
 
-    spin_lock(&ch->lock);
+    spin_lock_irq(&ch->lock);
 
 again:
     ch->next_event = STIME_MAX;
@@ -171,20 +197,338 @@ again:
         if ( reprogram_hpet_evt_channel(ch, next_event, now, 0) )
             goto again;
     }
-    spin_unlock(&ch->lock);
+    spin_unlock_irq(&ch->lock);
+}
+
+static void hpet_interrupt_handler(int vector, void *data,
+        struct cpu_user_regs *regs)
+{
+    struct hpet_event_channel *ch = (struct hpet_event_channel *)data;
+    if ( !ch->event_handler )
+    {
+        printk(XENLOG_WARNING "Spurious HPET timer interrupt on HPET timer %d\n", ch->idx);
+        return;
+    }
+
+    ch->event_handler(ch);
+}
+
+static void hpet_msi_unmask(unsigned int vector)
+{
+    unsigned long cfg;
+    int ch_idx = vector_to_channel(vector);
+    struct hpet_event_channel *ch;
+
+    BUG_ON(ch_idx < 0);
+    ch = &hpet_events[ch_idx];
+
+    cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
+    cfg |= HPET_TN_FSB;
+    hpet_write32(cfg, HPET_Tn_CFG(ch->idx));
+}
+
+static void hpet_msi_mask(unsigned int vector)
+{
+    unsigned long cfg;
+    int ch_idx = vector_to_channel(vector);
+    struct hpet_event_channel *ch;
+
+    BUG_ON(ch_idx < 0);
+    ch = &hpet_events[ch_idx];
+
+    cfg = hpet_read32(HPET_Tn_CFG(ch->idx));
+    cfg &= ~HPET_TN_FSB;
+    hpet_write32(cfg, HPET_Tn_CFG(ch->idx));
+}
+
+static void hpet_msi_write(unsigned int vector, struct msi_msg *msg)
+{
+    int ch_idx = vector_to_channel(vector);
+    struct hpet_event_channel *ch;
+
+    BUG_ON(ch_idx < 0);
+    ch = &hpet_events[ch_idx];
+
+    hpet_write32(msg->data, HPET_Tn_ROUTE(ch->idx));
+    hpet_write32(msg->address_lo, HPET_Tn_ROUTE(ch->idx) + 4);
+}
+
+static void hpet_msi_read(unsigned int vector, struct msi_msg *msg)
+{
+    int ch_idx = vector_to_channel(vector);
+    struct hpet_event_channel *ch;
+
+    BUG_ON(ch_idx < 0);
+    ch = &hpet_events[ch_idx];
+
+    msg->data = hpet_read32(HPET_Tn_ROUTE(ch->idx));
+    msg->address_lo = hpet_read32(HPET_Tn_ROUTE(ch->idx) + 4);
+    msg->address_hi = 0;
+}
+
+static unsigned int hpet_msi_startup(unsigned int vector)
+{
+    hpet_msi_unmask(vector);
+    return 0;
+}
+
+static void hpet_msi_shutdown(unsigned int vector)
+{
+    hpet_msi_mask(vector);
+}
+
+static void hpet_msi_ack(unsigned int vector)
+{
+    ack_APIC_irq();
+}
+
+static void hpet_msi_end(unsigned int vector)
+{
 }
 
+static void hpet_msi_set_affinity(unsigned int vector, cpumask_t mask)
+{
+    struct msi_msg msg;
+    unsigned int dest;
+    cpumask_t tmp;
+
+    cpus_and(tmp, mask, cpu_online_map);
+    if ( cpus_empty(tmp) )
+        mask = TARGET_CPUS;
+
+    dest = cpu_mask_to_apicid(mask);
+
+    hpet_msi_read(vector, &msg);
+
+    msg.data &= ~MSI_DATA_VECTOR_MASK;
+    msg.data |= MSI_DATA_VECTOR(vector);
+    msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+    msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+    hpet_msi_write(vector, &msg);
+    irq_desc[vector].affinity = mask;
+}
+
+/*
+ * IRQ Chip for MSI HPET Devices,
+ */
+static struct hw_interrupt_type hpet_msi_type = {
+    .typename   = "HPET-MSI",
+    .startup    = hpet_msi_startup,
+    .shutdown   = hpet_msi_shutdown,
+    .enable        = hpet_msi_unmask,
+    .disable    = hpet_msi_mask,
+    .ack        = hpet_msi_ack,
+    .end        = hpet_msi_end,
+    .set_affinity   = hpet_msi_set_affinity,
+};
+
+static int hpet_setup_msi_irq(unsigned int vector)
+{
+    int ret;
+    struct msi_msg msg;
+    struct hpet_event_channel *ch = &hpet_events[vector_to_channel(vector)];
+
+    irq_desc[vector].handler = &hpet_msi_type;
+    ret = request_irq_vector(vector, hpet_interrupt_handler,
+                      0, "HPET", ch);
+    if ( ret < 0 )
+        return ret;
+
+    msi_compose_msg(NULL, vector, &msg);
+    hpet_msi_write(vector, &msg);
+
+    return 0;
+}
+
+static int hpet_assign_irq(struct hpet_event_channel *ch)
+{
+    unsigned int vector;
+
+    vector = assign_irq_vector(AUTO_ASSIGN_IRQ);
+    if ( !vector )
+        return -EINVAL;
+
+    irq_vector[vector] = vector;
+    vector_irq[vector] = vector;
+    vector_channel[vector] = ch - &hpet_events[0];
+
+    if ( hpet_setup_msi_irq(vector) )
+    {
+        irq_vector[vector] = 0;
+        vector_irq[vector] = FREE_TO_ASSIGN_IRQ;
+        vector_channel[vector] = -1;
+        return -EINVAL;
+    }
+
+    ch->vector = vector;
+    return 0;
+}
+
+static int hpet_fsb_cap_lookup(void)
+{
+    unsigned int id;
+    unsigned int num_chs, num_chs_used;
+    int i;
+
+    id = hpet_read32(HPET_ID);
+
+    num_chs = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
+    num_chs++; /* Value read out starts from 0 */
+
+    num_chs_used = 0;
+    for ( i = 0; i < num_chs; i++ )
+    {
+        struct hpet_event_channel *ch = &hpet_events[num_chs_used];
+        unsigned long cfg = hpet_read32(HPET_Tn_CFG(i));
+
+        /* Only consider HPET timer with MSI support */
+        if ( !(cfg & HPET_TN_FSB_CAP) )
+            continue;
+
+        ch->flags = 0;
+        ch->idx = i;
+
+        if ( hpet_assign_irq(ch) )
+            continue;
+
+        /* set default irq affinity */
+        ch->cpu = num_chs_used;
+        per_cpu(cpu_bc_channel, ch->cpu) = ch;
+        irq_desc[ch->vector].handler->
+            set_affinity(ch->vector, cpumask_of_cpu(ch->cpu));
+
+        num_chs_used++;
+
+        if ( num_chs_used == num_possible_cpus() )
+            break;
+    }
+
+    printk(XENLOG_INFO
+           "HPET: %d timers in total, %d timers will be used for broadcast\n",
+           num_chs, num_chs_used);
+
+    return num_chs_used;
+}
+
+static int next_channel;
+static spinlock_t next_lock = SPIN_LOCK_UNLOCKED;
+
+static struct hpet_event_channel *hpet_get_channel(int cpu)
+{
+    int i;
+    int next;
+    struct hpet_event_channel *ch;
+
+    spin_lock(&next_lock);
+    next = next_channel = (next_channel + 1) % num_hpets_used;
+    spin_unlock(&next_lock);
+
+    /* try unused channel first */
+    for ( i = next; i < next + num_hpets_used; i++ )
+    {
+        ch = &hpet_events[i % num_hpets_used];
+        if ( !test_and_set_bit(HPET_EVT_USED_BIT, &ch->flags) )
+        {
+            ch->cpu = cpu;
+            return ch;
+        }
+    }
+
+    /* share an in-use channel */
+    ch = &hpet_events[next];
+    if ( !test_and_set_bit(HPET_EVT_USED_BIT, &ch->flags) )
+        ch->cpu = cpu;
+
+    return ch;
+}
+
+static void hpet_attach_channel_share(int cpu, struct hpet_event_channel *ch)
+{
+    per_cpu(cpu_bc_channel, cpu) = ch;
+
+    /* try to be the channel owner again while holding the lock */
+    if ( !test_and_set_bit(HPET_EVT_USED_BIT, &ch->flags) )
+        ch->cpu = cpu;
+
+    if ( ch->cpu != cpu )
+        return;
+
+    /* set irq affinity */
+    irq_desc[ch->vector].handler->
+        set_affinity(ch->vector, cpumask_of_cpu(ch->cpu));
+}
+
+static void hpet_detach_channel_share(int cpu)
+{
+    struct hpet_event_channel *ch = per_cpu(cpu_bc_channel, cpu);
+
+    per_cpu(cpu_bc_channel, cpu) = NULL;
+
+    if ( cpu != ch->cpu )
+        return;
+
+    if ( cpus_empty(ch->cpumask) )
+    {
+        ch->cpu = -1;
+        clear_bit(HPET_EVT_USED_BIT, &ch->flags);
+        return;
+    }
+
+    ch->cpu = first_cpu(ch->cpumask);
+    /* set irq affinity */
+    irq_desc[ch->vector].handler->
+        set_affinity(ch->vector, cpumask_of_cpu(ch->cpu));
+}
+
+static void (*hpet_attach_channel)(int cpu, struct hpet_event_channel *ch);
+static void (*hpet_detach_channel)(int cpu);
+
 void hpet_broadcast_init(void)
 {
     u64 hpet_rate;
     u32 hpet_id, cfg;
+    int i;
 
     hpet_rate = hpet_setup();
     if ( hpet_rate == 0 )
         return;
 
+    num_hpets_used = hpet_fsb_cap_lookup();
+    if ( num_hpets_used > 0 )
+    {
+        /* Stop HPET legacy interrupts */
+        cfg = hpet_read32(HPET_CFG);
+        cfg &= ~HPET_CFG_LEGACY;
+        hpet_write32(cfg, HPET_CFG);
+
+        for ( i = 0; i < num_hpets_used; i++ )
+        {
+            /* set HPET Tn as oneshot */
+            cfg = hpet_read32(HPET_Tn_CFG(hpet_events[i].idx));
+            cfg &= ~HPET_TN_PERIODIC;
+            cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+            hpet_write32(cfg, HPET_Tn_CFG(hpet_events[i].idx));
+  
+            hpet_events[i].mult = div_sc((unsigned long)hpet_rate,
+                                         1000000000ul, 32);
+            hpet_events[i].shift = 32;
+            hpet_events[i].next_event = STIME_MAX;
+            hpet_events[i].event_handler = handle_hpet_broadcast;
+            spin_lock_init(&hpet_events[i].lock);
+        }
+
+        if ( num_hpets_used < num_possible_cpus() )
+        {
+            hpet_attach_channel = hpet_attach_channel_share;
+            hpet_detach_channel = hpet_detach_channel_share;
+        }
+
+        return;
+    }
+
     hpet_id = hpet_read32(HPET_ID);
-    if ( !(hpet_id & HPET_ID_LEGSUP) )
+    if ( !(hpet_id & HPET_ID_LEGSUP) || !force_hpet_broadcast )
         return;
 
     /* Start HPET legacy interrupts */
@@ -202,22 +546,36 @@ void hpet_broadcast_init(void)
      * The period is a femto seconds value. We need to calculate the scaled
      * math multiplication factor for nanosecond to hpet tick conversion.
      */
-    hpet_event.mult = div_sc((unsigned long)hpet_rate, 1000000000ul, 32);
-    hpet_event.shift = 32;
-    hpet_event.next_event = STIME_MAX;
-    hpet_event.event_handler = handle_hpet_broadcast;
-    spin_lock_init(&hpet_event.lock);
+    legacy_hpet_event.mult = div_sc((unsigned long)hpet_rate, 1000000000ul, 32);
+    legacy_hpet_event.shift = 32;
+    legacy_hpet_event.next_event = STIME_MAX;
+    legacy_hpet_event.event_handler = handle_hpet_broadcast;
+    legacy_hpet_event.idx = 0;
+    legacy_hpet_event.flags = 0;
+    spin_lock_init(&legacy_hpet_event.lock);
+
+    for_each_cpu(i)
+        per_cpu(cpu_bc_channel, i) = &legacy_hpet_event;
 }
 
 void hpet_broadcast_enter(void)
 {
-    struct hpet_event_channel *ch = &hpet_event;
+    int cpu = smp_processor_id();
+    struct hpet_event_channel *ch = per_cpu(cpu_bc_channel, cpu);
 
+    if ( !ch )
+        ch = hpet_get_channel(cpu);
+    BUG_ON( !ch );
+
+    ASSERT(!local_irq_is_enabled());
     spin_lock(&ch->lock);
 
+    if ( hpet_attach_channel )
+        hpet_attach_channel(cpu, ch);
+
     disable_APIC_timer();
 
-    cpu_set(smp_processor_id(), ch->cpumask);
+    cpu_set(cpu, ch->cpumask);
 
     /* reprogram if current cpu expire time is nearer */
     if ( this_cpu(timer_deadline) < ch->next_event )
@@ -228,8 +586,10 @@ void hpet_broadcast_enter(void)
 
 void hpet_broadcast_exit(void)
 {
-    struct hpet_event_channel *ch = &hpet_event;
     int cpu = smp_processor_id();
+    struct hpet_event_channel *ch = per_cpu(cpu_bc_channel, cpu);
+
+    BUG_ON( !ch );
 
     spin_lock_irq(&ch->lock);
 
@@ -247,32 +607,36 @@ void hpet_broadcast_exit(void)
             reprogram_hpet_evt_channel(ch, STIME_MAX, 0, 0);
     }
 
+    if ( hpet_detach_channel )
+        hpet_detach_channel(cpu);
+
     spin_unlock_irq(&ch->lock);
 }
 
 int hpet_broadcast_is_available(void)
 {
-    return (hpet_event.event_handler == handle_hpet_broadcast);
+    return (legacy_hpet_event.event_handler == handle_hpet_broadcast
+            || num_hpets_used > 0);
 }
 
 int hpet_legacy_irq_tick(void)
 {
-    if ( !hpet_event.event_handler )
+    if ( !legacy_hpet_event.event_handler )
         return 0;
-    hpet_event.event_handler(&hpet_event);
+    legacy_hpet_event.event_handler(&legacy_hpet_event);
     return 1;
 }
 
 u64 hpet_setup(void)
 {
     static u64 hpet_rate;
-    static int initialised;
+    static u32 system_reset_latch;
     u32 hpet_id, hpet_period, cfg;
     int i;
 
-    if ( initialised )
+    if ( system_reset_latch == system_reset_counter )
         return hpet_rate;
-    initialised = 1;
+    system_reset_latch = system_reset_counter;
 
     if ( hpet_address == 0 )
         return 0;
@@ -280,9 +644,9 @@ u64 hpet_setup(void)
     set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
 
     hpet_id = hpet_read32(HPET_ID);
-    if ( hpet_id == 0 )
+    if ( (hpet_id & HPET_ID_REV) == 0 )
     {
-        printk("BAD HPET vendor id.\n");
+        printk("BAD HPET revision id.\n");
         return 0;
     }
 
@@ -300,9 +664,9 @@ u64 hpet_setup(void)
 
     for ( i = 0; i <= ((hpet_id >> 8) & 31); i++ )
     {
-        cfg = hpet_read32(HPET_T0_CFG + i*0x20);
+        cfg = hpet_read32(HPET_Tn_CFG(i));
         cfg &= ~HPET_TN_ENABLE;
-        hpet_write32(cfg & ~HPET_TN_ENABLE, HPET_T0_CFG);
+        hpet_write32(cfg, HPET_Tn_CFG(i));
     }
 
     cfg = hpet_read32(HPET_CFG);
index ae16db0dfcf96e61968ebe3eeb9665c7c8d00df9..6fbce84e9012b0f9523b950c16326d92db92cfc3 100644 (file)
 #include <xen/lib.h>
 #include <xen/sched.h>
 #include <xen/paging.h>
+#include <xen/trace.h>
 #include <asm/event.h>
 #include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
 
+#define HVMTRACE_IO_ASSIST_WRITE 0x200
+static void hvmtrace_io_assist(int is_mmio, ioreq_t *p)
+{
+    unsigned int size, event;
+    unsigned char buffer[12];
+
+    if ( likely(!tb_init_done) )
+        return;
+
+    event = is_mmio ? TRC_HVM_MMIO_ASSIST : TRC_HVM_IO_ASSIST;
+    if ( !p->dir )
+        event |= HVMTRACE_IO_ASSIST_WRITE;
+
+    *(uint64_t *)buffer = p->addr;
+    size = (p->addr != (u32)p->addr) ? 8 : 4;
+    if ( size == 8 )
+        event |= TRC_64_FLAG;
+
+    if ( !p->data_is_ptr )
+    {
+        *(uint32_t *)&buffer[size] = p->data;
+        size += 4;
+    }
+
+    trace_var(event, 0/*!cycles*/, size, buffer);
+}
+
 static int hvmemul_do_io(
     int is_mmio, paddr_t addr, unsigned long *reps, int size,
     paddr_t ram_gpa, int dir, int df, void *p_data)
@@ -111,6 +139,8 @@ static int hvmemul_do_io(
     p->data = value;
     p->io_count++;
 
+    hvmtrace_io_assist(is_mmio, p);
+
     if ( is_mmio )
     {
         rc = hvm_mmio_intercept(p);
@@ -763,7 +793,7 @@ static int hvmemul_read_msr(
     if ( (rc = hvm_msr_read_intercept(&_regs)) != 0 )
         return rc;
 
-    *val = ((uint64_t)(uint32_t)_regs.edx << 32) || (uint32_t)_regs.eax;
+    *val = ((uint64_t)(uint32_t)_regs.edx << 32) | (uint32_t)_regs.eax;
     return X86EMUL_OKAY;
 }
 
index e8a83949f1e3d711c9b92f49e3690f0d6b01569f..42c5c58fab41dd2d7fcb6785440d5da188a287e4 100644 (file)
@@ -76,6 +76,7 @@
         ~0ULL : (tick) * (h)->hpet_to_ns_scale) >> 10))
 
 #define timer_config(h, n)       (h->hpet.timers[n].config)
+#define timer_enabled(h, n)      (timer_config(h, n) & HPET_TN_ENABLE)
 #define timer_is_periodic(h, n)  (timer_config(h, n) & HPET_TN_PERIODIC)
 #define timer_is_32bit(h, n)     (timer_config(h, n) & HPET_TN_32BIT)
 #define hpet_enabled(h)          (h->hpet.config & HPET_CFG_ENABLE)
     ((timer_config(h, n) & HPET_TN_INT_ROUTE_CAP_MASK) \
         >> HPET_TN_INT_ROUTE_CAP_SHIFT)
 
-#define hpet_time_after(a, b)   ((int32_t)(b) - (int32_t)(a) < 0)
-#define hpet_time_after64(a, b) ((int64_t)(b) - (int64_t)(a) < 0)
+static inline uint64_t hpet_read_maincounter(HPETState *h)
+{
+    ASSERT(spin_is_locked(&h->lock));
+
+    if ( hpet_enabled(h) )
+        return guest_time_hpet(h->vcpu) + h->mc_offset;
+    else 
+        return h->hpet.mc64;
+}
 
+static uint64_t hpet_get_comparator(HPETState *h, unsigned int tn)
+{
+    uint64_t comparator;
+    uint64_t elapsed;
+
+    comparator = h->hpet.comparator64[tn];
+    if ( timer_is_periodic(h, tn) )
+    {
+        /* update comparator by number of periods elapsed since last update */
+        uint64_t period = h->hpet.period[tn];
+        if (period)
+        {
+            elapsed = hpet_read_maincounter(h) + period - 1 - comparator;
+            comparator += (elapsed / period) * period;
+            h->hpet.comparator64[tn] = comparator;
+        }
+    }
+    
+    /* truncate if timer is in 32 bit mode */
+    if ( timer_is_32bit(h, tn) )
+        comparator = (uint32_t)comparator;
+    h->hpet.timers[tn].cmp = comparator;
+    return comparator;
+}
 static inline uint64_t hpet_read64(HPETState *h, unsigned long addr)
 {
     addr &= ~7;
@@ -104,7 +136,7 @@ static inline uint64_t hpet_read64(HPETState *h, unsigned long addr)
     case HPET_STATUS:
         return h->hpet.isr;
     case HPET_COUNTER:
-        return h->hpet.mc64;
+        return hpet_read_maincounter(h);
     case HPET_T0_CFG:
     case HPET_T1_CFG:
     case HPET_T2_CFG:
@@ -112,7 +144,7 @@ static inline uint64_t hpet_read64(HPETState *h, unsigned long addr)
     case HPET_T0_CMP:
     case HPET_T1_CMP:
     case HPET_T2_CMP:
-        return h->hpet.timers[(addr - HPET_T0_CMP) >> 5].cmp;
+        return hpet_get_comparator(h, (addr - HPET_T0_CMP) >> 5);
     case HPET_T0_ROUTE:
     case HPET_T1_ROUTE:
     case HPET_T2_ROUTE:
@@ -140,16 +172,6 @@ static inline int hpet_check_access_length(
     return 0;
 }
 
-static inline uint64_t hpet_read_maincounter(HPETState *h)
-{
-    ASSERT(spin_is_locked(&h->lock));
-
-    if ( hpet_enabled(h) )
-        return guest_time_hpet(h->vcpu) + h->mc_offset;
-    else 
-        return h->hpet.mc64;
-}
-
 static int hpet_read(
     struct vcpu *v, unsigned long addr, unsigned long length,
     unsigned long *pval)
@@ -169,8 +191,6 @@ static int hpet_read(
     spin_lock(&h->lock);
 
     val = hpet_read64(h, addr);
-    if ( (addr & ~7) == HPET_COUNTER )
-        val = hpet_read_maincounter(h);
 
     result = val;
     if ( length != 8 )
@@ -187,7 +207,10 @@ static void hpet_stop_timer(HPETState *h, unsigned int tn)
 {
     ASSERT(tn < HPET_TIMER_NUM);
     ASSERT(spin_is_locked(&h->lock));
-    stop_timer(&h->timers[tn]);
+    destroy_periodic_time(&h->pt[tn]);
+    /* read the comparator to get it updated so a read while stopped will
+     * return the expected value. */
+    hpet_get_comparator(h, tn);
 }
 
 /* the number of HPET tick that stands for
@@ -197,6 +220,8 @@ static void hpet_stop_timer(HPETState *h, unsigned int tn)
 static void hpet_set_timer(HPETState *h, unsigned int tn)
 {
     uint64_t tn_cmp, cur_tick, diff;
+    unsigned int irq;
+    unsigned int oneshot;
 
     ASSERT(tn < HPET_TIMER_NUM);
     ASSERT(spin_is_locked(&h->lock));
@@ -209,7 +234,10 @@ static void hpet_set_timer(HPETState *h, unsigned int tn)
         pit_stop_channel0_irq(pit);
     }
 
-    tn_cmp   = h->hpet.timers[tn].cmp;
+    if ( !timer_enabled(h, tn) )
+        return;
+
+    tn_cmp   = hpet_get_comparator(h, tn);
     cur_tick = hpet_read_maincounter(h);
     if ( timer_is_32bit(h, tn) )
     {
@@ -229,7 +257,25 @@ static void hpet_set_timer(HPETState *h, unsigned int tn)
         diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN))
             ? (uint32_t)diff : 0;
 
-    set_timer(&h->timers[tn], NOW() + hpet_tick_to_ns(h, diff));
+    if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
+        /* if LegacyReplacementRoute bit is set, HPET specification requires
+           timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC,
+           timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */
+        irq = (tn == 0) ? 0 : 8;
+    else
+        irq = timer_int_route(h, tn);
+
+    /*
+     * diff is the time from now when the timer should fire, for a periodic 
+     * timer we also need the period which may be different because time may
+     * have elapsed between the time the comparator was written and the timer
+     * being enabled (now).
+     */
+    oneshot = !timer_is_periodic(h, tn);
+    create_periodic_time(h->vcpu, &h->pt[tn],
+                         hpet_tick_to_ns(h, diff),
+                         oneshot ? 0 : hpet_tick_to_ns(h, h->hpet.period[tn]),
+                         irq, NULL, NULL);
 }
 
 static inline uint64_t hpet_fixup_reg(
@@ -248,6 +294,13 @@ static int hpet_write(
     uint64_t old_val, new_val;
     int tn, i;
 
+    /* Accumulate a bit mask of timers whose state is changed by this write. */
+    unsigned long start_timers = 0;
+    unsigned long stop_timers  = 0;
+#define set_stop_timer(n)    (__set_bit((n), &stop_timers))
+#define set_start_timer(n)   (__set_bit((n), &start_timers))
+#define set_restart_timer(n) (set_stop_timer(n),set_start_timer(n))
+
     addr &= HPET_MMAP_SIZE-1;
 
     if ( hpet_check_access_length(addr, length) != 0 )
@@ -256,9 +309,6 @@ static int hpet_write(
     spin_lock(&h->lock);
 
     old_val = hpet_read64(h, addr);
-    if ( (addr & ~7) == HPET_COUNTER )
-        old_val = hpet_read_maincounter(h);
-
     new_val = val;
     if ( length != 8 )
         new_val = hpet_fixup_reg(
@@ -275,22 +325,35 @@ static int hpet_write(
             /* Enable main counter and interrupt generation. */
             h->mc_offset = h->hpet.mc64 - guest_time_hpet(h->vcpu);
             for ( i = 0; i < HPET_TIMER_NUM; i++ )
-                hpet_set_timer(h, i); 
+            {
+                h->hpet.comparator64[i] =
+                            h->hpet.timers[i].config & HPET_TN_32BIT ?
+                                          (uint32_t)h->hpet.timers[i].cmp :
+                                                    h->hpet.timers[i].cmp;
+                if ( timer_enabled(h, i) )
+                    set_start_timer(i);
+            }
         }
         else if ( (old_val & HPET_CFG_ENABLE) && !(new_val & HPET_CFG_ENABLE) )
         {
             /* Halt main counter and disable interrupt generation. */
             h->hpet.mc64 = h->mc_offset + guest_time_hpet(h->vcpu);
             for ( i = 0; i < HPET_TIMER_NUM; i++ )
-                hpet_stop_timer(h, i);
+                if ( timer_enabled(h, i) )
+                    set_stop_timer(i);
         }
         break;
 
     case HPET_COUNTER:
+        h->hpet.mc64 = new_val;
         if ( hpet_enabled(h) )
+        {
             gdprintk(XENLOG_WARNING, 
                      "HPET: writing main counter but it's not halted!\n");
-        h->hpet.mc64 = new_val;
+            for ( i = 0; i < HPET_TIMER_NUM; i++ )
+                if ( timer_enabled(h, i) )
+                    set_restart_timer(i);
+        }
         break;
 
     case HPET_T0_CFG:
@@ -313,7 +376,28 @@ static int hpet_write(
             h->hpet.timers[tn].cmp = (uint32_t)h->hpet.timers[tn].cmp;
             h->hpet.period[tn] = (uint32_t)h->hpet.period[tn];
         }
-
+        if ( hpet_enabled(h) )
+        {
+            if ( new_val & HPET_TN_ENABLE )
+            {
+                if ( (new_val ^ old_val) & HPET_TN_PERIODIC )
+                    /* timer is enabled but switching mode to/from periodic/
+                     * one-shot, stop and restart the vpt timer to get it in
+                     * the right mode. */
+                    set_restart_timer(tn);
+                else if ( (new_val & HPET_TN_32BIT) &&
+                         !(old_val & HPET_TN_32BIT) )
+                    /* switching from 64 bit to 32 bit mode could cause timer
+                     * next fire time, or period, to change. */
+                    set_restart_timer(tn);
+                else if ( !(old_val & HPET_TN_ENABLE) )
+                    /* transition from timer disabled to timer enabled. */
+                    set_start_timer(tn);
+            }
+            else if ( old_val & HPET_TN_ENABLE )
+                /* transition from timer enabled to timer disabled. */
+                set_stop_timer(tn);
+        }
         break;
 
     case HPET_T0_CMP:
@@ -322,24 +406,32 @@ static int hpet_write(
         tn = (addr - HPET_T0_CMP) >> 5;
         if ( timer_is_32bit(h, tn) )
             new_val = (uint32_t)new_val;
-        if ( !timer_is_periodic(h, tn) ||
-             (h->hpet.timers[tn].config & HPET_TN_SETVAL) )
-            h->hpet.timers[tn].cmp = new_val;
-        else
+        h->hpet.timers[tn].cmp = new_val;
+        if ( h->hpet.timers[tn].config & HPET_TN_SETVAL )
+            /*
+             * When SETVAL is one, software is able to "directly set a periodic
+             * timer's accumulator."  That is, set the comparator without
+             * adjusting the period.  Much the same as just setting the
+             * comparator on an enabled one-shot timer.
+             * 
+             * This configuration bit clears when the comparator is written.
+             */
+            h->hpet.timers[tn].config &= ~HPET_TN_SETVAL;
+        else if ( timer_is_periodic(h, tn) )
         {
             /*
              * Clamp period to reasonable min/max values:
-             *  - minimum is 900us, same as timers controlled by vpt.c
+             *  - minimum is 100us, same as timers controlled by vpt.c
              *  - maximum is to prevent overflow in time_after() calculations
              */
-            if ( hpet_tick_to_ns(h, new_val) < MICROSECS(900) )
-                new_val = (MICROSECS(900) << 10) / h->hpet_to_ns_scale;
+            if ( hpet_tick_to_ns(h, new_val) < MICROSECS(100) )
+                new_val = (MICROSECS(100) << 10) / h->hpet_to_ns_scale;
             new_val &= (timer_is_32bit(h, tn) ? ~0u : ~0ull) >> 1;
             h->hpet.period[tn] = new_val;
         }
-        h->hpet.timers[tn].config &= ~HPET_TN_SETVAL;
-        if ( hpet_enabled(h) )
-            hpet_set_timer(h, tn);
+        h->hpet.comparator64[tn] = new_val;
+        if ( hpet_enabled(h) && timer_enabled(h, tn) )
+            set_restart_timer(tn);
         break;
 
     case HPET_T0_ROUTE:
@@ -354,6 +446,25 @@ static int hpet_write(
         break;
     }
 
+    /* stop/start timers whose state was changed by this write. */
+    while (stop_timers)
+    {
+        i = find_first_set_bit(stop_timers);
+        __clear_bit(i, &stop_timers);
+        hpet_stop_timer(h, i);
+    }
+
+    while (start_timers)
+    {
+        i = find_first_set_bit(start_timers);
+        __clear_bit(i, &start_timers);
+        hpet_set_timer(h, i);
+    }
+
+#undef set_stop_timer
+#undef set_start_timer
+#undef set_restart_timer
+
     spin_unlock(&h->lock);
 
  out:
@@ -373,86 +484,6 @@ struct hvm_mmio_handler hpet_mmio_handler = {
     .write_handler = hpet_write
 };
 
-static void hpet_route_interrupt(HPETState *h, unsigned int tn)
-{
-    unsigned int tn_int_route = timer_int_route(h, tn);
-    struct domain *d = h->vcpu->domain;
-
-    ASSERT(spin_is_locked(&h->lock));
-
-    if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
-    {
-        /* if LegacyReplacementRoute bit is set, HPET specification requires
-           timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC,
-           timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */
-        int isa_irq = (tn == 0) ? 0 : 8;
-        hvm_isa_irq_deassert(d, isa_irq);
-        hvm_isa_irq_assert(d, isa_irq);
-        return;
-    }
-
-    if ( !(timer_int_route_cap(h, tn) & (1U << tn_int_route)) )
-    {
-        gdprintk(XENLOG_ERR,
-                 "HPET: timer%u: invalid interrupt route config\n", tn);
-        domain_crash(d);
-        return;
-    }
-
-    /* We support only edge-triggered interrupt. */
-    spin_lock(&d->arch.hvm_domain.irq_lock);
-    vioapic_irq_positive_edge(d, tn_int_route);
-    spin_unlock(&d->arch.hvm_domain.irq_lock);
-}
-
-static void hpet_timer_fn(void *opaque)
-{
-    struct HPET_timer_fn_info *htfi = opaque;
-    HPETState *h = htfi->hs;
-    unsigned int tn = htfi->tn;
-
-    spin_lock(&h->lock);
-
-    if ( !hpet_enabled(h) )
-    {
-        spin_unlock(&h->lock);
-        return;
-    }
-
-    if ( timer_config(h, tn) & HPET_TN_ENABLE )
-        hpet_route_interrupt(h, tn);
-
-    if ( timer_is_periodic(h, tn) && (h->hpet.period[tn] != 0) )
-    {
-        uint64_t mc = hpet_read_maincounter(h), period = h->hpet.period[tn];
-        if ( timer_is_32bit(h, tn) )
-        {
-            while ( hpet_time_after(mc, h->hpet.timers[tn].cmp) )
-                h->hpet.timers[tn].cmp = (uint32_t)(
-                    h->hpet.timers[tn].cmp + period);
-        }
-        else
-        {
-            while ( hpet_time_after64(mc, h->hpet.timers[tn].cmp) )
-                h->hpet.timers[tn].cmp += period;
-        }
-        set_timer(&h->timers[tn], NOW() + hpet_tick_to_ns(h, period));
-    }
-
-    spin_unlock(&h->lock);
-}
-
-void hpet_migrate_timers(struct vcpu *v)
-{
-    struct HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet;
-    int i;
-
-    if ( v != h->vcpu )
-        return;
-
-    for ( i = 0; i < HPET_TIMER_NUM; i++ )
-        migrate_timer(&h->timers[i], v->processor);
-}
 
 static int hpet_save(struct domain *d, hvm_domain_context_t *h)
 {
@@ -477,18 +508,20 @@ static int hpet_save(struct domain *d, hvm_domain_context_t *h)
         C(isr);
         C(mc64);
         C(timers[0].config);
-        C(timers[0].cmp);
         C(timers[0].fsb);
         C(timers[1].config);
-        C(timers[1].cmp);
         C(timers[1].fsb);
         C(timers[2].config);
-        C(timers[2].cmp);
         C(timers[2].fsb);
         C(period[0]);
         C(period[1]);
         C(period[2]);
 #undef C
+        /* save the 64 bit comparator in the 64 bit timer[n].cmp field
+         * regardless of whether or not the timer is in 32 bit mode. */
+        rec->timers[0].cmp = hp->hpet.comparator64[0];
+        rec->timers[1].cmp = hp->hpet.comparator64[1];
+        rec->timers[2].cmp = hp->hpet.comparator64[2];
     }
 
     spin_unlock(&hp->lock);
@@ -500,6 +533,7 @@ static int hpet_load(struct domain *d, hvm_domain_context_t *h)
 {
     HPETState *hp = &d->arch.hvm_domain.pl_time.vhpet;
     struct hvm_hw_hpet *rec;
+    uint64_t cmp;
     int i;
 
     spin_lock(&hp->lock);
@@ -515,32 +549,38 @@ static int hpet_load(struct domain *d, hvm_domain_context_t *h)
     h->cur += HVM_SAVE_LENGTH(HPET);
 
 #define C(x) hp->hpet.x = rec->x
-        C(capability);
-        C(config);
-        C(isr);
-        C(mc64);
-        C(timers[0].config);
-        C(timers[0].cmp);
-        C(timers[0].fsb);
-        C(timers[1].config);
-        C(timers[1].cmp);
-        C(timers[1].fsb);
-        C(timers[2].config);
-        C(timers[2].cmp);
-        C(timers[2].fsb);
-        C(period[0]);
-        C(period[1]);
-        C(period[2]);
+    C(capability);
+    C(config);
+    C(isr);
+    C(mc64);
+    /* The following define will generate a compiler error if HPET_TIMER_NUM
+     * changes. This indicates an incompatibility with previous saved state. */
+#define HPET_TIMER_NUM 3
+    for ( i = 0; i < HPET_TIMER_NUM; i++ )
+    {
+        C(timers[i].config);
+        C(timers[i].fsb);
+        C(period[i]);
+        /* restore the hidden 64 bit comparator and truncate the timer's
+         * visible comparator field if in 32 bit mode. */
+        cmp = rec->timers[i].cmp;
+        hp->hpet.comparator64[i] = cmp;
+        if ( timer_is_32bit(hp, i) )
+            cmp = (uint32_t)cmp;
+        hp->hpet.timers[i].cmp = cmp;
+    }
 #undef C
     
     /* Recalculate the offset between the main counter and guest time */
     hp->mc_offset = hp->hpet.mc64 - guest_time_hpet(hp->vcpu);
-                
-    /* Restart the timers */
-    for ( i = 0; i < HPET_TIMER_NUM; i++ )
-        if ( hpet_enabled(hp) )
-            hpet_set_timer(hp, i);
 
+    /* restart all timers */
+
+    if ( hpet_enabled(hp) )
+        for ( i = 0; i < HPET_TIMER_NUM; i++ )
+            if ( timer_enabled(hp, i) )
+                hpet_set_timer(hp, i);
     spin_unlock(&hp->lock);
 
     return 0;
@@ -575,10 +615,7 @@ void hpet_init(struct vcpu *v)
         h->hpet.timers[i].config = 
             HPET_TN_INT_ROUTE_CAP | HPET_TN_SIZE_CAP | HPET_TN_PERIODIC_CAP;
         h->hpet.timers[i].cmp = ~0ULL;
-        h->timer_fn_info[i].hs = h;
-        h->timer_fn_info[i].tn = i;
-        init_timer(&h->timers[i], hpet_timer_fn, &h->timer_fn_info[i],
-                   v->processor);
+        h->pt[i].source = PTSRC_isa;
     }
 }
 
@@ -587,8 +624,14 @@ void hpet_deinit(struct domain *d)
     int i;
     HPETState *h = &d->arch.hvm_domain.pl_time.vhpet;
 
-    for ( i = 0; i < HPET_TIMER_NUM; i++ )
-        kill_timer(&h->timers[i]);
+    spin_lock(&h->lock);
+
+    if ( hpet_enabled(h) )
+        for ( i = 0; i < HPET_TIMER_NUM; i++ )
+            if ( timer_enabled(h, i) )
+                hpet_stop_timer(h, i);
+
+    spin_unlock(&h->lock);
 }
 
 void hpet_reset(struct domain *d)
index 64116f2bf36b49fda87d9e323c9bb4dc1dc7e161..9a6d6cd5a1d49ad940b3e495a85239c76484513c 100644 (file)
@@ -20,6 +20,7 @@
  */
 
 #include <xen/config.h>
+#include <xen/ctype.h>
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/trace.h>
@@ -73,11 +74,12 @@ void hvm_enable(struct hvm_function_table *fns)
     printk("HVM: %s enabled\n", fns->name);
 
     /*
-     * Allow direct access to the PC debug port (it is often used for I/O
-     * delays, but the vmexits simply slow things down).
+     * Allow direct access to the PC debug ports 0x80 and 0xed (they are
+     * often used for I/O delays, but the vmexits simply slow things down).
      */
     memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
     __clear_bit(0x80, hvm_io_bitmap);
+    __clear_bit(0xed, hvm_io_bitmap);
 
     hvm_funcs   = *fns;
     hvm_enabled = 1;
@@ -163,7 +165,6 @@ u64 hvm_get_guest_tsc(struct vcpu *v)
 void hvm_migrate_timers(struct vcpu *v)
 {
     rtc_migrate_timers(v);
-    hpet_migrate_timers(v);
     pt_migrate(v);
 }
 
@@ -274,6 +275,10 @@ static int hvm_print_line(
 
     BUG_ON(bytes != 1);
 
+    /* Accept only printable characters, newline, and horizontal tab. */
+    if ( !isprint(c) && (c != '\n') && (c != '\t') )
+        return X86EMUL_OKAY;
+
     spin_lock(&hd->pbuf_lock);
     hd->pbuf[hd->pbuf_idx++] = c;
     if ( (hd->pbuf_idx == (sizeof(hd->pbuf) - 2)) || (c == '\n') )
@@ -304,6 +309,9 @@ int hvm_domain_initialise(struct domain *d)
     spin_lock_init(&d->arch.hvm_domain.irq_lock);
     spin_lock_init(&d->arch.hvm_domain.uc_lock);
 
+    INIT_LIST_HEAD(&d->arch.hvm_domain.msixtbl_list);
+    spin_lock_init(&d->arch.hvm_domain.msixtbl_list_lock);
+
     hvm_init_guest_time(d);
 
     d->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] = 1;
@@ -344,11 +352,15 @@ int hvm_domain_initialise(struct domain *d)
     return rc;
 }
 
+extern void msixtbl_pt_cleanup(struct domain *d);
+
 void hvm_domain_relinquish_resources(struct domain *d)
 {
     hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq);
     hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq);
 
+    msixtbl_pt_cleanup(d);
+
     /* Stop all asynchronous timer actions. */
     rtc_deinit(d);
     if ( d->vcpu[0] != NULL )
@@ -538,6 +550,22 @@ static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
         return -EINVAL;
     }
 
+    /* Older Xen versions used to save the segment arbytes directly 
+     * from the VMCS on Intel hosts.  Detect this and rearrange them
+     * into the struct segment_register format. */
+#define UNFOLD_ARBYTES(_r)                          \
+    if ( (_r & 0xf000) && !(_r & 0x0f00) )          \
+        _r = ((_r & 0xff) | ((_r >> 4) & 0xf00))
+    UNFOLD_ARBYTES(ctxt.cs_arbytes);
+    UNFOLD_ARBYTES(ctxt.ds_arbytes);
+    UNFOLD_ARBYTES(ctxt.es_arbytes);
+    UNFOLD_ARBYTES(ctxt.fs_arbytes);
+    UNFOLD_ARBYTES(ctxt.gs_arbytes);
+    UNFOLD_ARBYTES(ctxt.ss_arbytes);
+    UNFOLD_ARBYTES(ctxt.tr_arbytes);
+    UNFOLD_ARBYTES(ctxt.ldtr_arbytes);
+#undef UNFOLD_ARBYTES
+
     /* Architecture-specific vmcs/vmcb bits */
     if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 )
         return -EINVAL;
@@ -1504,7 +1532,15 @@ static enum hvm_copy_result __hvm_copy(
 
         if ( flags & HVMCOPY_to_guest )
         {
-            if ( p2mt != p2m_ram_ro )
+            if ( p2mt == p2m_ram_ro )
+            {
+                static unsigned long lastpage;
+                if ( xchg(&lastpage, gfn) != gfn )
+                    gdprintk(XENLOG_DEBUG, "guest attempted write to read-only"
+                             " memory page. gfn=%#lx, mfn=%#lx\n",
+                             gfn, mfn);
+            }
+            else
             {
                 memcpy(p, buf, count);
                 paging_mark_dirty(curr->domain, mfn);
@@ -1741,6 +1777,15 @@ int hvm_msr_read_intercept(struct cpu_user_regs *regs)
         msr_content = var_range_base[index];
         break;
 
+    case MSR_K8_ENABLE_C1E:
+         /* There's no point in letting the guest see C-States.
+          * Further, this AMD-only register may be accessed if this HVM guest
+          * has been migrated to an Intel host. This fixes a guest crash
+          * in this case.
+          */
+         msr_content = 0;
+         break;
+
     default:
         return hvm_funcs.msr_read_intercept(regs);
     }
@@ -1885,6 +1930,25 @@ static long hvm_memory_op(int cmd, XEN_GUEST_HANDLE(void) arg)
     return rc;
 }
 
+static long hvm_vcpu_op(
+    int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
+{
+    long rc;
+
+    switch ( cmd )
+    {
+    case VCPUOP_register_runstate_memory_area:
+    case VCPUOP_get_runstate_info:
+        rc = do_vcpu_op(cmd, vcpuid, arg);
+        break;
+    default:
+        rc = -ENOSYS;
+        break;
+    }
+
+    return rc;
+}
+
 typedef unsigned long hvm_hypercall_t(
     unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 
@@ -1896,6 +1960,7 @@ typedef unsigned long hvm_hypercall_t(
 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
     [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
     [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
+    [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op,
     HYPERCALL(xen_version),
     HYPERCALL(event_channel_op),
     HYPERCALL(sched_op),
@@ -1912,9 +1977,29 @@ static long hvm_memory_op_compat32(int cmd, XEN_GUEST_HANDLE(void) arg)
     return rc;
 }
 
+static long hvm_vcpu_op_compat32(
+    int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
+{
+    long rc;
+
+    switch ( cmd )
+    {
+    case VCPUOP_register_runstate_memory_area:
+    case VCPUOP_get_runstate_info:
+        rc = compat_vcpu_op(cmd, vcpuid, arg);
+        break;
+    default:
+        rc = -ENOSYS;
+        break;
+    }
+
+    return rc;
+}
+
 static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
     [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
     [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
+    [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op,
     HYPERCALL(xen_version),
     HYPERCALL(event_channel_op),
     HYPERCALL(sched_op),
@@ -1924,6 +2009,7 @@ static hvm_hypercall_t *hvm_hypercall64_table[NR_hypercalls] = {
 static hvm_hypercall_t *hvm_hypercall32_table[NR_hypercalls] = {
     [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op_compat32,
     [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
+    [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op_compat32,
     HYPERCALL(xen_version),
     HYPERCALL(event_channel_op),
     HYPERCALL(sched_op),
@@ -2082,7 +2168,7 @@ static int hvmop_set_pci_intx_level(
 
 void hvm_vcpu_reset_state(struct vcpu *v, uint16_t cs, uint16_t ip)
 {
-    struct domain *d = current->domain;
+    struct domain *d = v->domain;
     struct vcpu_guest_context *ctxt;
     struct segment_register reg;
 
@@ -2660,6 +2746,32 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
     return rc;
 }
 
+int hvm_debug_op(struct vcpu *v, int32_t op)
+{
+    int rc;
+
+    switch ( op )
+    {
+        case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON:
+        case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF:
+            rc = -ENOSYS;
+            if ( !cpu_has_monitor_trap_flag )
+                break;
+            rc = 0;
+            vcpu_pause(v);
+            v->arch.hvm_vcpu.single_step =
+                (op == XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON);
+            vcpu_unpause(v); /* guest will latch new state */
+            break;
+        default:
+            rc = -ENOSYS;
+            break;
+    }
+
+    return rc;
+}
+
+
 /*
  * Local variables:
  * mode: C
index 6fa955cad6c1ac51cb33096784e9ea10e993edc7..2babc2e93d78f7d298c62c4fe4dbbb5785e095df 100644 (file)
@@ -213,13 +213,13 @@ static void pit_load_count(PITState *pit, int channel, int val)
     case 2:
     case 3:
         /* Periodic timer. */
-        create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired, 
+        create_periodic_time(v, &pit->pt0, period, period, 0, pit_time_fired, 
                              &pit->count_load_time[channel]);
         break;
     case 1:
     case 4:
         /* One-shot timer. */
-        create_periodic_time(v, &pit->pt0, period, 0, 1, pit_time_fired,
+        create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired,
                              &pit->count_load_time[channel]);
         break;
     default:
index 0e110e00dcbe6c833635f51415bec269b7baa4af..107b87d2f700ecf2162f56a042c026aedacde6da 100644 (file)
 extern struct hvm_mmio_handler hpet_mmio_handler;
 extern struct hvm_mmio_handler vlapic_mmio_handler;
 extern struct hvm_mmio_handler vioapic_mmio_handler;
+extern struct hvm_mmio_handler msixtbl_mmio_handler;
 
-#define HVM_MMIO_HANDLER_NR 3
+#define HVM_MMIO_HANDLER_NR 4
 
 static struct hvm_mmio_handler *hvm_mmio_handlers[HVM_MMIO_HANDLER_NR] =
 {
     &hpet_mmio_handler,
     &vlapic_mmio_handler,
-    &vioapic_mmio_handler
+    &vioapic_mmio_handler,
+    &msixtbl_mmio_handler
 };
 
 static int hvm_mmio_access(struct vcpu *v,
index 77f31a6f7f3e62ca2eb01c4e835a1af23422d96c..e1448d99ee99bab6afc1fc32e05f835ed1e28e65 100644 (file)
@@ -351,11 +351,18 @@ static uint8_t page_pat_type(uint64_t pat_cr, uint32_t pte_flags)
 static uint8_t effective_mm_type(struct mtrr_state *m,
                                  uint64_t pat,
                                  paddr_t gpa,
-                                 uint32_t pte_flags)
+                                 uint32_t pte_flags,
+                                 uint8_t gmtrr_mtype)
 {
     uint8_t mtrr_mtype, pat_value, effective;
-
-    mtrr_mtype = get_mtrr_type(m, gpa);
+   
+    /* if get_pat_flags() gives a dedicated MTRR type,
+     * just use it
+     */ 
+    if ( gmtrr_mtype == NO_HARDCODE_MEM_TYPE )
+        mtrr_mtype = get_mtrr_type(m, gpa);
+    else
+        mtrr_mtype = gmtrr_mtype;
 
     pat_value = page_pat_type(pat, pte_flags);
 
@@ -367,7 +374,8 @@ static uint8_t effective_mm_type(struct mtrr_state *m,
 uint32_t get_pat_flags(struct vcpu *v,
                        uint32_t gl1e_flags,
                        paddr_t gpaddr,
-                       paddr_t spaddr)
+                       paddr_t spaddr,
+                       uint8_t gmtrr_mtype)
 {
     uint8_t guest_eff_mm_type;
     uint8_t shadow_mtrr_type;
@@ -378,7 +386,8 @@ uint32_t get_pat_flags(struct vcpu *v,
     /* 1. Get the effective memory type of guest physical address,
      * with the pair of guest MTRR and PAT
      */
-    guest_eff_mm_type = effective_mm_type(g, pat, gpaddr, gl1e_flags);
+    guest_eff_mm_type = effective_mm_type(g, pat, gpaddr, 
+                                          gl1e_flags, gmtrr_mtype);
     /* 2. Get the memory type of host physical address, with MTRR */
     shadow_mtrr_type = get_mtrr_type(&mtrr_state, spaddr);
 
@@ -392,12 +401,16 @@ uint32_t get_pat_flags(struct vcpu *v,
      */
     if ( pat_entry_value == INVALID_MEM_TYPE )
     {
-        gdprintk(XENLOG_WARNING,
-                 "Conflict occurs for a given guest l1e flags:%x "
-                 "at %"PRIx64" (the effective mm type:%d), "
-                 "because the host mtrr type is:%d\n",
-                 gl1e_flags, (uint64_t)gpaddr, guest_eff_mm_type,
-                 shadow_mtrr_type);
+        struct domain *d = v->domain;
+        p2m_type_t p2mt;
+        gfn_to_mfn(d, paddr_to_pfn(gpaddr), &p2mt);
+        if (p2m_is_ram(p2mt))
+            gdprintk(XENLOG_WARNING,
+                    "Conflict occurs for a given guest l1e flags:%x "
+                    "at %"PRIx64" (the effective mm type:%d), "
+                    "because the host mtrr type is:%d\n",
+                    gl1e_flags, (uint64_t)gpaddr, guest_eff_mm_type,
+                    shadow_mtrr_type);
         pat_entry_value = PAT_TYPE_UNCACHABLE;
     }
     /* 4. Get the pte flags */
@@ -698,12 +711,15 @@ HVM_REGISTER_SAVE_RESTORE(MTRR, hvm_save_mtrr_msr, hvm_load_mtrr_msr,
                           1, HVMSR_PER_VCPU);
 
 uint8_t epte_get_entry_emt(
-    struct domain *d, unsigned long gfn, unsigned long mfn)
+    struct domain *d, unsigned long gfn, 
+    unsigned long mfn, uint8_t *igmt, int direct_mmio)
 {
     uint8_t gmtrr_mtype, hmtrr_mtype;
     uint32_t type;
     struct vcpu *v = current;
 
+    *igmt = 0;
+
     if ( (current->domain != d) && ((v = d->vcpu[0]) == NULL) )
         return MTRR_TYPE_WRBACK;
 
@@ -719,6 +735,21 @@ uint8_t epte_get_entry_emt(
     if ( hvm_get_mem_pinned_cacheattr(d, gfn, &type) )
         return type;
 
+    if ( !iommu_enabled )
+    {
+        *igmt = 1;
+        return MTRR_TYPE_WRBACK;
+    }
+
+    if ( direct_mmio )
+        return MTRR_TYPE_UNCACHABLE;
+
+    if ( iommu_snoop )
+    {
+        *igmt = 1;
+        return MTRR_TYPE_WRBACK;
+    }
+
     gmtrr_mtype = get_mtrr_type(&v->arch.hvm_vcpu.mtrr, (gfn << PAGE_SHIFT));
     hmtrr_mtype = get_mtrr_type(&mtrr_state, (mfn << PAGE_SHIFT));
     return ((gmtrr_mtype <= hmtrr_mtype) ? gmtrr_mtype : hmtrr_mtype);
index 6193a9ad81adac88815dc689be1c2d1ebfab0629..9ab69033b05a55350e6dbab8b92a6656004caef1 100644 (file)
@@ -59,8 +59,8 @@ static void rtc_timer_update(RTCState *s)
 
         period = 1 << (period_code - 1); /* period in 32 Khz cycles */
         period = DIV_ROUND((period * 1000000000ULL), 32768); /* period in ns */
-        create_periodic_time(v, &s->pt, period, RTC_IRQ,
-                             0, rtc_periodic_cb, s);
+        create_periodic_time(v, &s->pt, period, period, RTC_IRQ,
+                             rtc_periodic_cb, s);
     }
     else
     {
@@ -160,6 +160,7 @@ static inline int from_bcd(RTCState *s, int a)
 static void rtc_set_time(RTCState *s)
 {
     struct tm *tm = &s->current_tm;
+    struct domain *d = vrtc_domain(s);
     unsigned long before, after; /* XXX s_time_t */
       
     ASSERT(spin_is_locked(&s->lock));
@@ -180,6 +181,12 @@ static void rtc_set_time(RTCState *s)
 
     after = mktime(tm->tm_year, tm->tm_mon, tm->tm_mday,
                    tm->tm_hour, tm->tm_min, tm->tm_sec);
+
+    /* We use the guest's setting of the RTC to define the local-time 
+     * offset for this domain. */
+    d->time_offset_seconds += (after - before);
+    update_domain_wallclock_time(d);
+    /* Also tell qemu-dm about it so it will be remembered for next boot. */
     send_timeoffset_req(after - before);
 }
 
index d09ce8dade2ea6fa061194edd6fc64433a951777..28b723fa0ddcbfb5ef85b31ede522e67c1dae77c 100644 (file)
@@ -68,24 +68,25 @@ static unsigned long svm_nextrip_insn_length(struct vcpu *v)
     if ( !cpu_has_svm_nrips || (vmcb->nextrip <= vmcb->rip) )
         return 0;
 
+#ifndef NDEBUG
     switch ( vmcb->exitcode )
     {
     case VMEXIT_CR0_READ... VMEXIT_DR15_WRITE:
         /* faults due to instruction intercepts */
         /* (exitcodes 84-95) are reserved */
     case VMEXIT_IDTR_READ ... VMEXIT_TR_WRITE:
-    case VMEXIT_RDTSC ... VMEXIT_SWINT:
-    case VMEXIT_INVD ... VMEXIT_INVLPGA:
+    case VMEXIT_RDTSC ... VMEXIT_MSR:
     case VMEXIT_VMRUN ...  VMEXIT_MWAIT_CONDITIONAL:
-    case VMEXIT_IOIO:
         /* ...and the rest of the #VMEXITs */
     case VMEXIT_CR0_SEL_WRITE:
-    case VMEXIT_MSR:
     case VMEXIT_EXCEPTION_BP:
-        return vmcb->nextrip - vmcb->rip;
+        break;
+    default:
+        BUG();
     }
-  
-    return 0;
+#endif
+
+    return vmcb->nextrip - vmcb->rip;
 }
 
 /* First byte: Length. Following bytes: Opcode bytes. */
index 17bf00155b6805e340df2bc4c52c32b1ab56e73d..30af14cb1796fb3537e07f216d9a2df6677d7da0 100644 (file)
@@ -57,6 +57,8 @@
 #endif
 
 ENTRY(svm_asm_do_resume)
+        call svm_intr_assist
+
         get_current(bx)
         CLGI
 
@@ -67,7 +69,6 @@ ENTRY(svm_asm_do_resume)
         jnz  .Lsvm_process_softirqs
 
         call svm_asid_handle_vmrun
-        call svm_intr_assist
 
         cmpb $0,addr_of(tb_init_done)
         jnz  .Lsvm_trace
index ff95cb64c1653f848b4c405d40bfb0923bb6c578..b6499070c0c6d20d1c582633dbc2ca665e1c49d0 100644 (file)
@@ -80,7 +80,8 @@ static void enable_intr_window(struct vcpu *v, struct hvm_intack intack)
 
     ASSERT(intack.source != hvm_intsrc_none);
 
-    HVMTRACE_2D(INJ_VIRQ, 0x0, /*fake=*/ 1);
+    HVMTRACE_3D(INTR_WINDOW, intack.vector, intack.source,
+                vmcb->eventinj.fields.v?vmcb->eventinj.fields.vector:-1);
 
     /*
      * Create a dummy virtual interrupt to intercept as soon as the
@@ -100,61 +101,6 @@ static void enable_intr_window(struct vcpu *v, struct hvm_intack intack)
     vmcb->general1_intercepts |= GENERAL1_INTERCEPT_VINTR;
 }
 
-extern int vmsi_deliver(struct domain *d, int pirq);
-static int hvm_pci_msi_assert(struct domain *d, int pirq)
-{
-    return vmsi_deliver(d, pirq);
-}
-
-static void svm_dirq_assist(struct vcpu *v)
-{
-    unsigned int irq;
-    uint32_t device, intx;
-    struct domain *d = v->domain;
-    struct hvm_irq_dpci *hvm_irq_dpci = d->arch.hvm_domain.irq.dpci;
-    struct dev_intx_gsi_link *digl;
-
-    if ( !iommu_enabled || (v->vcpu_id != 0) || (hvm_irq_dpci == NULL) )
-        return;
-
-    for ( irq = find_first_bit(hvm_irq_dpci->dirq_mask, NR_IRQS);
-          irq < NR_IRQS;
-          irq = find_next_bit(hvm_irq_dpci->dirq_mask, NR_IRQS, irq + 1) )
-    {
-        if ( !test_and_clear_bit(irq, &hvm_irq_dpci->dirq_mask) )
-            continue;
-
-        spin_lock(&d->event_lock);
-        if ( test_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[irq].flags) )
-        {
-            hvm_pci_msi_assert(d, irq);
-            spin_unlock(&d->event_lock);
-            continue;
-        }
-
-        stop_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, irq)]);
-
-        list_for_each_entry ( digl, &hvm_irq_dpci->mirq[irq].digl_list, list )
-        {
-            device = digl->device;
-            intx = digl->intx;
-            hvm_pci_intx_assert(d, device, intx);
-            hvm_irq_dpci->mirq[irq].pending++;
-        }
-
-        /*
-         * Set a timer to see if the guest can finish the interrupt or not. For
-         * example, the guest OS may unmask the PIC during boot, before the
-         * guest driver is loaded. hvm_pci_intx_assert() may succeed, but the
-         * guest will never deal with the irq, then the physical interrupt line
-         * will never be deasserted.
-         */
-        set_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, irq)],
-                  NOW() + PT_IRQ_TIME_OUT);
-        spin_unlock(&d->event_lock);
-    }
-}
-
 asmlinkage void svm_intr_assist(void) 
 {
     struct vcpu *v = current;
@@ -163,7 +109,7 @@ asmlinkage void svm_intr_assist(void)
 
     /* Crank the handle on interrupt state. */
     pt_update_irq(v);
-    svm_dirq_assist(v);
+    hvm_dirq_assist(v);
 
     do {
         intack = hvm_vcpu_has_pending_irq(v);
index c635f5204aeef649f9c4156565bf994a06edada6..bd320fbe1fed60d68dd1638ca463842f644055da 100644 (file)
@@ -488,28 +488,40 @@ static void svm_get_segment_register(struct vcpu *v, enum x86_segment seg,
     {
     case x86_seg_cs:
         memcpy(reg, &vmcb->cs, sizeof(*reg));
+        reg->attr.fields.g = reg->limit > 0xFFFFF;
         break;
     case x86_seg_ds:
         memcpy(reg, &vmcb->ds, sizeof(*reg));
+        if ( reg->attr.fields.type != 0 )
+            reg->attr.fields.type |= 0x1;
         break;
     case x86_seg_es:
         memcpy(reg, &vmcb->es, sizeof(*reg));
+        if ( reg->attr.fields.type != 0 )
+            reg->attr.fields.type |= 0x1;
         break;
     case x86_seg_fs:
         svm_sync_vmcb(v);
         memcpy(reg, &vmcb->fs, sizeof(*reg));
+        if ( reg->attr.fields.type != 0 )
+            reg->attr.fields.type |= 0x1;
         break;
     case x86_seg_gs:
         svm_sync_vmcb(v);
         memcpy(reg, &vmcb->gs, sizeof(*reg));
+        if ( reg->attr.fields.type != 0 )
+            reg->attr.fields.type |= 0x1;
         break;
     case x86_seg_ss:
         memcpy(reg, &vmcb->ss, sizeof(*reg));
         reg->attr.fields.dpl = vmcb->cpl;
+        if ( reg->attr.fields.type == 0 )
+            reg->attr.fields.db = 0;
         break;
     case x86_seg_tr:
         svm_sync_vmcb(v);
         memcpy(reg, &vmcb->tr, sizeof(*reg));
+        reg->attr.fields.type |= 0x2;
         break;
     case x86_seg_gdtr:
         memcpy(reg, &vmcb->gdtr, sizeof(*reg));
@@ -739,6 +751,23 @@ static void svm_inject_exception(
     struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb;
     eventinj_t event = vmcb->eventinj;
 
+    switch ( trapnr )
+    {
+    case TRAP_debug:
+        if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF )
+        {
+            __restore_debug_registers(curr);
+            vmcb->dr6 |= 0x4000;
+        }
+    case TRAP_int3:
+        if ( curr->domain->debugger_attached )
+        {
+            /* Debug/Int3: Trap to debugger. */
+            domain_pause_for_debugger();
+            return;
+        }
+    }
+
     if ( unlikely(event.fields.v) &&
          (event.fields.type == X86_EVENTTYPE_HW_EXCEPTION) )
     {
@@ -765,13 +794,6 @@ static void svm_inject_exception(
     {
         HVMTRACE_2D(INJ_EXC, trapnr, errcode);
     }
-
-    if ( (trapnr == TRAP_debug) &&
-         (guest_cpu_user_regs()->eflags & X86_EFLAGS_TF) )
-    {
-        __restore_debug_registers(curr);
-        vmcb->dr6 |= 0x4000;
-    }
 }
 
 static int svm_event_pending(struct vcpu *v)
@@ -878,7 +900,7 @@ static void svm_do_nested_pgfault(paddr_t gpa, struct cpu_user_regs *regs)
      * If this GFN is emulated MMIO or marked as read-only, pass the fault
      * to the mmio handler.
      */
-    mfn = gfn_to_mfn_current(gfn, &p2mt);
+    mfn = gfn_to_mfn_type_current(gfn, &p2mt, p2m_guest);
     if ( (p2mt == p2m_mmio_dm) || (p2mt == p2m_ram_ro) )
     {
         if ( !handle_mmio() )
index 73894f881c286eaaf497e430687a1649b7e4767d..080c05c762c1d8fdbe3cf606fc494d2ac289a4b6 100644 (file)
@@ -138,7 +138,7 @@ static int construct_vmcb(struct vcpu *v)
                             CR_INTERCEPT_CR8_WRITE);
 
     /* I/O and MSR permission bitmaps. */
-    arch_svm->msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE));
+    arch_svm->msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0);
     if ( arch_svm->msrpm == NULL )
         return -ENOMEM;
     memset(arch_svm->msrpm, 0xff, MSRPM_SIZE);
index 7250de3a7d3961fc3f9dddc3543db40af8a85bf4..7f63699ab27e1893869dd2a8ea02c8bb6eb0c32b 100644 (file)
@@ -344,8 +344,8 @@ static void vioapic_deliver(struct hvm_hw_vioapic *vioapic, int irq)
         }
         else
 #endif
-            target = apic_round_robin(vioapic_domain(vioapic),
-                                      vector, deliver_bitmask);
+            target = apic_lowest_prio(vioapic_domain(vioapic),
+                                      deliver_bitmask);
         if ( target != NULL )
         {
             ioapic_inj_irq(vioapic, target, vector, trig_mode, delivery_mode);
index b6389e6c47d998a6d720df22e17de21c75c66c99..a18ea9e3d14bfa0fe728d9b62e46a88da2a6ac81 100644 (file)
@@ -37,6 +37,7 @@
 
 /* Viridian CPUID 4000004, Implementation Recommendations. */
 #define CPUID4A_MSR_BASED_APIC  (1 << 3)
+#define CPUID4A_RELAX_TIMER_INT (1 << 5)
 
 int cpuid_viridian_leaves(unsigned int leaf, unsigned int *eax,
                           unsigned int *ebx, unsigned int *ecx,
@@ -84,7 +85,8 @@ int cpuid_viridian_leaves(unsigned int leaf, unsigned int *eax,
         if ( (d->arch.hvm_domain.viridian.guest_os_id.raw == 0) ||
              (d->arch.hvm_domain.viridian.guest_os_id.fields.os < 4) )
             break;
-        *eax = CPUID4A_MSR_BASED_APIC;
+        *eax = (CPUID4A_MSR_BASED_APIC |
+                CPUID4A_RELAX_TIMER_INT);
         *ebx = 2047; /* long spin count */
         break;
     }
index 2da4b7fb734b154c34186687b01d13a3f10513cb..68e9b27632c01830ee89729cfa0f458b9b0b7f43 100644 (file)
@@ -377,26 +377,30 @@ static int vlapic_accept_irq(struct vcpu *v, int delivery_mode,
 }
 
 /* This function is used by both ioapic and lapic.The bitmap is for vcpu_id. */
-struct vlapic *apic_round_robin(
-    struct domain *d, uint8_t vector, uint32_t bitmap)
+struct vlapic *apic_lowest_prio(struct domain *d, uint32_t bitmap)
 {
-    int next, old;
-    struct vlapic *target = NULL;
+    int old = d->arch.hvm_domain.irq.round_robin_prev_vcpu;
+    uint32_t ppr, target_ppr = UINT_MAX;
+    struct vlapic *vlapic, *target = NULL;
+    struct vcpu *v;
 
-    old = next = d->arch.hvm_domain.irq.round_robin_prev_vcpu;
+    if ( unlikely((v = d->vcpu[old]) == NULL) )
+        return NULL;
 
     do {
-        if ( ++next == MAX_VIRT_CPUS ) 
-            next = 0;
-        if ( (d->vcpu[next] == NULL) || !test_bit(next, &bitmap) )
-            continue;
-        target = vcpu_vlapic(d->vcpu[next]);
-        if ( vlapic_enabled(target) )
-            break;
-        target = NULL;
-    } while ( next != old );
+        v = v->next_in_list ? : d->vcpu[0];
+        vlapic = vcpu_vlapic(v);
+        if ( test_bit(v->vcpu_id, &bitmap) && vlapic_enabled(vlapic) &&
+             ((ppr = vlapic_get_ppr(vlapic)) < target_ppr) )
+        {
+            target = vlapic;
+            target_ppr = ppr;
+        }
+    } while ( v->vcpu_id != old );
 
-    d->arch.hvm_domain.irq.round_robin_prev_vcpu = next;
+    if ( target != NULL )
+        d->arch.hvm_domain.irq.round_robin_prev_vcpu =
+            vlapic_vcpu(target)->vcpu_id;
 
     return target;
 }
@@ -456,7 +460,7 @@ int vlapic_ipi(
 
     if ( delivery_mode == APIC_DM_LOWEST )
     {
-        target = apic_round_robin(vlapic_domain(v), vector, lpr_map);
+        target = apic_lowest_prio(vlapic_domain(v), lpr_map);
         if ( target != NULL )
             rc = vlapic_accept_irq(vlapic_vcpu(target), delivery_mode,
                                    vector, level, trig_mode);
@@ -701,8 +705,9 @@ static int vlapic_write(struct vcpu *v, unsigned long address,
                             (uint32_t)val * vlapic->hw.timer_divisor;
 
         vlapic_set_reg(vlapic, APIC_TMICT, val);
-        create_periodic_time(current, &vlapic->pt, period, vlapic->pt.irq,
-                             !vlapic_lvtt_period(vlapic), vlapic_pt_cb,
+        create_periodic_time(current, &vlapic->pt, period, 
+                             vlapic_lvtt_period(vlapic) ? period : 0,
+                             vlapic->pt.irq, vlapic_pt_cb,
                              &vlapic->timer_last_update);
         vlapic->timer_last_update = vlapic->pt.last_plt_gtime;
 
@@ -861,8 +866,9 @@ static void lapic_rearm(struct vlapic *s)
     period = ((uint64_t)APIC_BUS_CYCLE_NS *
               (uint32_t)tmict * s->hw.timer_divisor);
     s->pt.irq = vlapic_get_reg(s, APIC_LVTT) & APIC_VECTOR_MASK;
-    create_periodic_time(vlapic_vcpu(s), &s->pt, period, s->pt.irq,
-                         !vlapic_lvtt_period(s), vlapic_pt_cb,
+    create_periodic_time(vlapic_vcpu(s), &s->pt, period,
+                         vlapic_lvtt_period(s) ? period : 0,
+                         s->pt.irq, vlapic_pt_cb,
                          &s->timer_last_update);
     s->timer_last_update = s->pt.last_plt_gtime;
 }
index 6eefb61bfa0da052bab036b56e3b650bf3dea907..37c1e5c14f8544d67eb5d1895dc75659273c7c3d 100644 (file)
@@ -134,7 +134,7 @@ int vmsi_deliver(struct domain *d, int pirq)
                 "vector=%x trig_mode=%x\n",
                 dest, dest_mode, delivery_mode, vector, trig_mode);
 
-    if ( !test_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags) )
+    if ( !( hvm_irq_dpci->mirq[pirq].flags & HVM_IRQ_DPCI_GUEST_MSI ) )
     {
         gdprintk(XENLOG_WARNING, "pirq %x not msi \n", pirq);
         return 0;
@@ -152,7 +152,7 @@ int vmsi_deliver(struct domain *d, int pirq)
     {
     case dest_LowestPrio:
     {
-        target = apic_round_robin(d, vector, deliver_bitmask);
+        target = apic_lowest_prio(d, deliver_bitmask);
         if ( target != NULL )
             vmsi_inj_irq(d, target, vector, trig_mode, delivery_mode);
         else
@@ -193,3 +193,296 @@ int vmsi_deliver(struct domain *d, int pirq)
     return 1;
 }
 
/* MSI-X mask bit hypervisor interception */
struct msixtbl_entry
{
    struct list_head list;      /* link on d->arch.hvm_domain.msixtbl_list */
    atomic_t refcnt;    /* how many bind_pt_irq called for the device */

    /* TODO: resolve the potential race by destruction of pdev */
    struct pci_dev *pdev;
    unsigned long gtable;       /* gpa of msix table */
    unsigned long table_len;    /* guest table length in bytes */
    /* Per-table-entry "dirty" bitmap: a bit is set by msixtbl_write()
     * when a non-vector-control dword of that entry is written. */
    unsigned long table_flags[MAX_MSIX_TABLE_ENTRIES / BITS_PER_LONG + 1];

    struct rcu_head rcu;        /* deferred free via free_msixtbl_entry() */
};
+
+static struct msixtbl_entry *msixtbl_find_entry(
+    struct vcpu *v, unsigned long addr)
+{
+    struct msixtbl_entry *entry;
+    struct domain *d = v->domain;
+
+    list_for_each_entry( entry, &d->arch.hvm_domain.msixtbl_list, list )
+        if ( addr >= entry->gtable &&
+             addr < entry->gtable + entry->table_len )
+            return entry;
+
+    return NULL;
+}
+
+static void __iomem *msixtbl_addr_to_virt(
+    struct msixtbl_entry *entry, unsigned long addr)
+{
+    int idx, nr_page;
+
+    if ( !entry )
+        return NULL;
+
+    nr_page = (addr >> PAGE_SHIFT) -
+              (entry->gtable >> PAGE_SHIFT);
+
+    if ( !entry->pdev )
+        return NULL;
+
+    idx = entry->pdev->msix_table_idx[nr_page];
+    if ( !idx )
+        return NULL;
+
+    return (void *)(fix_to_virt(idx) +
+                    (addr & ((1UL << PAGE_SHIFT) - 1)));
+}
+
+static int msixtbl_read(
+    struct vcpu *v, unsigned long address,
+    unsigned long len, unsigned long *pval)
+{
+    unsigned long offset;
+    struct msixtbl_entry *entry;
+    void *virt;
+    int r = X86EMUL_UNHANDLEABLE;
+
+    rcu_read_lock();
+
+    if ( len != 4 )
+        goto out;
+
+    offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
+    if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET)
+        goto out;
+
+    entry = msixtbl_find_entry(v, address);
+    virt = msixtbl_addr_to_virt(entry, address);
+    if ( !virt )
+        goto out;
+
+    *pval = readl(virt);
+    r = X86EMUL_OKAY;
+
+out:
+    rcu_read_unlock();
+    return r;
+}
+
+static int msixtbl_write(struct vcpu *v, unsigned long address,
+                        unsigned long len, unsigned long val)
+{
+    unsigned long offset;
+    struct msixtbl_entry *entry;
+    void *virt;
+    int nr_entry;
+    int r = X86EMUL_UNHANDLEABLE;
+
+    rcu_read_lock();
+
+    if ( len != 4 )
+        goto out;
+
+    entry = msixtbl_find_entry(v, address);
+    nr_entry = (address - entry->gtable) % PCI_MSIX_ENTRY_SIZE;
+
+    offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
+    if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET)
+    {
+        set_bit(nr_entry, &entry->table_flags);
+        goto out;
+    }
+
+    /* exit to device model if address/data has been modified */
+    if ( test_and_clear_bit(nr_entry, &entry->table_flags) )
+        goto out;
+
+    virt = msixtbl_addr_to_virt(entry, address);
+    if ( !virt )
+        goto out;
+
+    writel(val, virt);
+    r = X86EMUL_OKAY;
+
+out:
+    rcu_read_unlock();
+    return r;
+}
+
+static int msixtbl_range(struct vcpu *v, unsigned long addr)
+{
+    struct msixtbl_entry *entry;
+    void *virt;
+
+    rcu_read_lock();
+
+    entry = msixtbl_find_entry(v, addr);
+    virt = msixtbl_addr_to_virt(entry, addr);
+
+    rcu_read_unlock();
+
+    return !!virt;
+}
+
/* MMIO intercept ops for guest accesses falling within a registered
 * MSI-X table range (range test done by msixtbl_range()). */
struct hvm_mmio_handler msixtbl_mmio_handler = {
    .check_handler = msixtbl_range,
    .read_handler = msixtbl_read,
    .write_handler = msixtbl_write
};
+
+static void add_msixtbl_entry(struct domain *d,
+                              struct pci_dev *pdev,
+                              uint64_t gtable,
+                              struct msixtbl_entry *entry)
+{
+    u32 len;
+
+    memset(entry, 0, sizeof(struct msixtbl_entry));
+        
+    INIT_LIST_HEAD(&entry->list);
+    INIT_RCU_HEAD(&entry->rcu);
+    atomic_set(&entry->refcnt, 0);
+
+    len = pci_msix_get_table_len(pdev);
+    entry->table_len = len;
+    entry->pdev = pdev;
+    entry->gtable = (unsigned long) gtable;
+
+    list_add_rcu(&entry->list, &d->arch.hvm_domain.msixtbl_list);
+}
+
+static void free_msixtbl_entry(struct rcu_head *rcu)
+{
+    struct msixtbl_entry *entry;
+
+    entry = container_of (rcu, struct msixtbl_entry, rcu);
+
+    xfree(entry);
+}
+
/* Unlink an entry from the domain's MSI-X list and schedule its free
 * for after the current RCU grace period. */
static void del_msixtbl_entry(struct msixtbl_entry *entry)
{
    list_del_rcu(&entry->list);
    call_rcu(&entry->rcu, free_msixtbl_entry);
}
+
/*
 * Begin intercepting the MSI-X table of the device behind @pirq for
 * domain @d; @gtable is the guest physical address of the table.
 * Takes a reference on an existing per-device entry or installs a new
 * one.  Returns 0 on success, -ENOMEM on allocation failure, -EINVAL
 * if the irq cannot be resolved to an MSI device.  Caller must hold
 * pcidevs_lock.
 */
int msixtbl_pt_register(struct domain *d, int pirq, uint64_t gtable)
{
    irq_desc_t *irq_desc;
    struct msi_desc *msi_desc;
    struct pci_dev *pdev;
    struct msixtbl_entry *entry, *new_entry;
    int r = -EINVAL;

    ASSERT(spin_is_locked(&pcidevs_lock));

    /*
     * xmalloc() with irq_disabled causes the failure of check_lock() 
     * for xenpool->lock. So we allocate an entry beforehand.
     */
    new_entry = xmalloc(struct msixtbl_entry);
    if ( !new_entry )
        return -ENOMEM;

    irq_desc = domain_spin_lock_irq_desc(d, pirq, NULL);
    if ( !irq_desc )
    {
        xfree(new_entry);
        return r;
    }

    if ( irq_desc->handler != &pci_msi_type )
        goto out;

    msi_desc = irq_desc->msi_desc;
    if ( !msi_desc )
        goto out;

    pdev = msi_desc->dev;

    spin_lock(&d->arch.hvm_domain.msixtbl_list_lock);

    /* One entry per device: reuse it if this pdev is already tracked. */
    list_for_each_entry( entry, &d->arch.hvm_domain.msixtbl_list, list )
        if ( pdev == entry->pdev )
            goto found;

    entry = new_entry;
    new_entry = NULL;   /* ownership passed to the list */
    add_msixtbl_entry(d, pdev, gtable, entry);

found:
    atomic_inc(&entry->refcnt);
    spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock);
    r = 0;

out:
    spin_unlock_irq(&irq_desc->lock);
    xfree(new_entry);   /* NULL here whenever ownership moved to the list */
    return r;
}
+
+void msixtbl_pt_unregister(struct domain *d, int pirq)
+{
+    irq_desc_t *irq_desc;
+    struct msi_desc *msi_desc;
+    struct pci_dev *pdev;
+    struct msixtbl_entry *entry;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    irq_desc = domain_spin_lock_irq_desc(d, pirq, NULL);
+    if ( !irq_desc )
+        return;
+
+    if ( irq_desc->handler != &pci_msi_type )
+        goto out;
+
+    msi_desc = irq_desc->msi_desc;
+    if ( !msi_desc )
+        goto out;
+
+    pdev = msi_desc->dev;
+
+    spin_lock(&d->arch.hvm_domain.msixtbl_list_lock);
+
+    list_for_each_entry( entry, &d->arch.hvm_domain.msixtbl_list, list )
+        if ( pdev == entry->pdev )
+            goto found;
+
+    spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock);
+
+
+out:
+    spin_unlock_irq(&irq_desc->lock);
+    return;
+
+found:
+    if ( !atomic_dec_and_test(&entry->refcnt) )
+        del_msixtbl_entry(entry);
+
+    spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock);
+    spin_unlock_irq(&irq_desc->lock);
+}
+
+void msixtbl_pt_cleanup(struct domain *d, int pirq)
+{
+    struct msixtbl_entry *entry, *temp;
+    unsigned long flags;
+
+    /* msixtbl_list_lock must be acquired with irq_disabled for check_lock() */
+    local_irq_save(flags); 
+    spin_lock(&d->arch.hvm_domain.msixtbl_list_lock);
+
+    list_for_each_entry_safe( entry, temp,
+                              &d->arch.hvm_domain.msixtbl_list, list )
+        del_msixtbl_entry(entry);
+
+    spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock);
+    local_irq_restore(flags);
+}
index c595cb2c27ea7c2396e51263622bd2e08c5873a9..8720efcceecba993c7416285e5d08e0d9dc5c9fa 100644 (file)
@@ -122,6 +122,8 @@ vmx_asm_vmexit_handler:
 
 .globl vmx_asm_do_vmentry
 vmx_asm_do_vmentry:
+        call vmx_intr_assist
+
         get_current(bx)
         cli
 
@@ -131,11 +133,15 @@ vmx_asm_do_vmentry:
         cmpl $0,(r(dx),r(ax),1)
         jnz  .Lvmx_process_softirqs
 
-        call vmx_intr_assist
-
-        testb $0xff,VCPU_vmx_emul(r(bx))
-        jnz  .Lvmx_goto_realmode
+        testb $0xff,VCPU_vmx_emulate(r(bx))
+        jnz .Lvmx_goto_emulator
+        testb $0xff,VCPU_vmx_realmode(r(bx))
+        jz .Lvmx_not_realmode
+        cmpw $0,VCPU_vm86_seg_mask(r(bx))
+        jnz .Lvmx_goto_emulator
+        call_with_regs(vmx_enter_realmode) 
 
+.Lvmx_not_realmode:
         mov  VCPU_hvm_guest_cr2(r(bx)),r(ax)
         mov  r(ax),%cr2
         call vmx_trace_vmentry
@@ -179,15 +185,17 @@ vmx_asm_do_vmentry:
 
 /*.Lvmx_resume:*/
         VMRESUME
+        sti
         call vm_resume_fail
         ud2
 
 .Lvmx_launch:
         VMLAUNCH
+        sti
         call vm_launch_fail
         ud2
 
-.Lvmx_goto_realmode:
+.Lvmx_goto_emulator:
         sti
         call_with_regs(vmx_realmode)
         jmp  vmx_asm_do_vmentry
index a4d42350181d6135b4c074ba1a6ac38df45c7be7..afd9d419190c6aacf8f792e64586cfae54ac37af 100644 (file)
@@ -74,6 +74,13 @@ static void enable_intr_window(struct vcpu *v, struct hvm_intack intack)
 
     ASSERT(intack.source != hvm_intsrc_none);
 
+    if ( unlikely(tb_init_done) )
+    {
+        unsigned int intr = __vmread(VM_ENTRY_INTR_INFO);
+        HVMTRACE_3D(INTR_WINDOW, intack.vector, intack.source,
+                    (intr & INTR_INFO_VALID_MASK) ? intr & 0xff : -1);
+    }
+
     if ( (intack.source == hvm_intsrc_nmi) && cpu_has_vmx_vnmi )
     {
         /*
@@ -103,61 +110,6 @@ static void enable_intr_window(struct vcpu *v, struct hvm_intack intack)
     }
 }
 
-extern int vmsi_deliver(struct domain *d, int pirq);
-static int hvm_pci_msi_assert(struct domain *d, int pirq)
-{
-    return vmsi_deliver(d, pirq);
-}
-
-static void vmx_dirq_assist(struct vcpu *v)
-{
-    unsigned int irq;
-    uint32_t device, intx;
-    struct domain *d = v->domain;
-    struct hvm_irq_dpci *hvm_irq_dpci = d->arch.hvm_domain.irq.dpci;
-    struct dev_intx_gsi_link *digl;
-
-    if ( !iommu_enabled || (v->vcpu_id != 0) || (hvm_irq_dpci == NULL) )
-        return;
-
-    for ( irq = find_first_bit(hvm_irq_dpci->dirq_mask, NR_IRQS);
-          irq < NR_IRQS;
-          irq = find_next_bit(hvm_irq_dpci->dirq_mask, NR_IRQS, irq + 1) )
-    {
-        if ( !test_and_clear_bit(irq, &hvm_irq_dpci->dirq_mask) )
-            continue;
-
-        spin_lock(&d->event_lock);
-        if ( test_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[irq].flags) )
-        {
-            hvm_pci_msi_assert(d, irq);
-            spin_unlock(&d->event_lock);
-            continue;
-        }
-
-        stop_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, irq)]);
-
-        list_for_each_entry ( digl, &hvm_irq_dpci->mirq[irq].digl_list, list )
-        {
-            device = digl->device;
-            intx = digl->intx;
-            hvm_pci_intx_assert(d, device, intx);
-            hvm_irq_dpci->mirq[irq].pending++;
-        }
-
-        /*
-         * Set a timer to see if the guest can finish the interrupt or not. For
-         * example, the guest OS may unmask the PIC during boot, before the
-         * guest driver is loaded. hvm_pci_intx_assert() may succeed, but the
-         * guest will never deal with the irq, then the physical interrupt line
-         * will never be deasserted.
-         */
-        set_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, irq)],
-                  NOW() + PT_IRQ_TIME_OUT);
-        spin_unlock(&d->event_lock);
-    }
-}
-
 asmlinkage void vmx_intr_assist(void)
 {
     struct hvm_intack intack;
@@ -165,9 +117,17 @@ asmlinkage void vmx_intr_assist(void)
     unsigned int tpr_threshold = 0;
     enum hvm_intblk intblk;
 
+    /* Block event injection when single step with MTF. */
+    if ( unlikely(v->arch.hvm_vcpu.single_step) )
+    {
+        v->arch.hvm_vmx.exec_control |= CPU_BASED_MONITOR_TRAP_FLAG;
+        __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+        return;
+    }
+
     /* Crank the handle on interrupt state. */
     pt_update_irq(v);
-    vmx_dirq_assist(v);
+    hvm_dirq_assist(v);
 
     do {
         intack = hvm_vcpu_has_pending_irq(v);
@@ -195,12 +155,12 @@ asmlinkage void vmx_intr_assist(void)
 
     if ( intack.source == hvm_intsrc_nmi )
     {
-        vmx_inject_nmi(v);
+        vmx_inject_nmi();
     }
     else
     {
         HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0);
-        vmx_inject_extint(v, intack.vector);
+        vmx_inject_extint(intack.vector);
         pt_intr_post(v, intack);
     }
 
index 5d13f4e60b693676136dbd23929f51c6f31cdd77..b3574c20b40a3653e16ee9c9996670db16324dd8 100644 (file)
@@ -69,7 +69,8 @@ static void realmode_deliver_exception(
     frame[1] = csr->sel;
     frame[2] = regs->eflags & ~X86_EFLAGS_RF;
 
-    if ( hvmemul_ctxt->ctxt.addr_size == 32 )
+    /* We can't test hvmemul_ctxt->ctxt.sp_size: it may not be initialised. */
+    if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.db )
     {
         regs->esp -= 6;
         pstk = regs->esp;
@@ -102,31 +103,13 @@ static void realmode_deliver_exception(
 static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
 {
     struct vcpu *curr = current;
-    unsigned long seg_reg_dirty;
     uint32_t intr_info;
     int rc;
 
-    seg_reg_dirty = hvmemul_ctxt->seg_reg_dirty;
-    hvmemul_ctxt->seg_reg_dirty = 0;
+    perfc_incr(realmode_emulations);
 
     rc = hvm_emulate_one(hvmemul_ctxt);
 
-    if ( test_bit(x86_seg_cs, &hvmemul_ctxt->seg_reg_dirty) )
-    {
-        curr->arch.hvm_vmx.vmxemul &= ~VMXEMUL_BAD_CS;
-        if ( hvmemul_get_seg_reg(x86_seg_cs, hvmemul_ctxt)->sel & 3 )
-            curr->arch.hvm_vmx.vmxemul |= VMXEMUL_BAD_CS;
-    }
-
-    if ( test_bit(x86_seg_ss, &hvmemul_ctxt->seg_reg_dirty) )
-    {
-        curr->arch.hvm_vmx.vmxemul &= ~VMXEMUL_BAD_SS;
-        if ( hvmemul_get_seg_reg(x86_seg_ss, hvmemul_ctxt)->sel & 3 )
-            curr->arch.hvm_vmx.vmxemul |= VMXEMUL_BAD_SS;
-    }
-
-    hvmemul_ctxt->seg_reg_dirty |= seg_reg_dirty;
-
     if ( rc == X86EMUL_UNHANDLEABLE )
     {
         gdprintk(XENLOG_ERR, "Failed to emulate insn.\n");
@@ -148,17 +131,25 @@ static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
             hvmemul_ctxt->exn_insn_len = 0;
         }
 
-        if ( curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE )
+        if ( unlikely(curr->domain->debugger_attached) &&
+             ((hvmemul_ctxt->exn_vector == TRAP_debug) ||
+              (hvmemul_ctxt->exn_vector == TRAP_int3)) )
+        {
+            domain_pause_for_debugger();
+        }
+        else if ( curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE )
         {
             gdprintk(XENLOG_ERR, "Exception %02x in protected mode.\n",
                      hvmemul_ctxt->exn_vector);
             goto fail;
         }
-
-        realmode_deliver_exception(
-            hvmemul_ctxt->exn_vector,
-            hvmemul_ctxt->exn_insn_len,
-            hvmemul_ctxt);
+        else
+        {
+            realmode_deliver_exception(
+                hvmemul_ctxt->exn_vector,
+                hvmemul_ctxt->exn_insn_len,
+                hvmemul_ctxt);
+        }
     }
 
     return;
@@ -201,7 +192,8 @@ void vmx_realmode(struct cpu_user_regs *regs)
         intr_info = 0;
     }
 
-    while ( curr->arch.hvm_vmx.vmxemul &&
+    curr->arch.hvm_vmx.vmx_emulate = 1;
+    while ( curr->arch.hvm_vmx.vmx_emulate &&
             !softirq_pending(smp_processor_id()) &&
             (curr->arch.hvm_vcpu.io_state == HVMIO_none) )
     {
@@ -211,13 +203,27 @@ void vmx_realmode(struct cpu_user_regs *regs)
          * in real mode, because we don't emulate protected-mode IDT vectoring.
          */
         if ( unlikely(!(++emulations & 15)) &&
-             !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) &&
+             curr->arch.hvm_vmx.vmx_realmode && 
              hvm_local_events_need_delivery(curr) )
             break;
+
         realmode_emulate_one(&hvmemul_ctxt);
+
+        /* Stop emulating unless our segment state is not safe */
+        if ( curr->arch.hvm_vmx.vmx_realmode )
+            curr->arch.hvm_vmx.vmx_emulate = 
+                (curr->arch.hvm_vmx.vm86_segment_mask != 0);
+        else
+            curr->arch.hvm_vmx.vmx_emulate = 
+                 ((hvmemul_ctxt.seg_reg[x86_seg_cs].sel & 3)
+                  || (hvmemul_ctxt.seg_reg[x86_seg_ss].sel & 3));
     }
 
-    if ( !curr->arch.hvm_vmx.vmxemul )
+    /* Need to emulate next time if we've started an IO operation */
+    if ( curr->arch.hvm_vcpu.io_state != HVMIO_none )
+        curr->arch.hvm_vmx.vmx_emulate = 1;
+
+    if ( !curr->arch.hvm_vmx.vmx_emulate && !curr->arch.hvm_vmx.vmx_realmode )
     {
         /*
          * Cannot enter protected mode with bogus selector RPLs and DPLs.
index 8fdeb40008c0860abe600f9030c9270cbf4bbce7..c86b55e6f73c34cf32e2b77f54388dc3075fd36d 100644 (file)
@@ -55,6 +55,25 @@ static DEFINE_PER_CPU(struct list_head, active_vmcs_list);
 
 static u32 vmcs_revision_id __read_mostly;
 
+static void __init vmx_display_features(void)
+{
+    int printed = 0;
+
+    printk("VMX: Supported advanced features:\n");
+
+#define P(p,s) if ( p ) { printk(" - %s\n", s); printed = 1; }
+    P(cpu_has_vmx_virtualize_apic_accesses, "APIC MMIO access virtualisation");
+    P(cpu_has_vmx_tpr_shadow, "APIC TPR shadow");
+    P(cpu_has_vmx_ept, "Extended Page Tables (EPT)");
+    P(cpu_has_vmx_vpid, "Virtual-Processor Identifiers (VPID)");
+    P(cpu_has_vmx_vnmi, "Virtual NMI");
+    P(cpu_has_vmx_msr_bitmap, "MSR direct-access bitmap");
+#undef P
+
+    if ( !printed )
+        printk(" - none\n");
+}
+
 static u32 adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr)
 {
     u32 vmx_msr_low, vmx_msr_high, ctl = ctl_min | ctl_opt;
@@ -99,6 +118,7 @@ static void vmx_init_vmcs_config(void)
            (opt_softtsc ? CPU_BASED_RDTSC_EXITING : 0));
     opt = (CPU_BASED_ACTIVATE_MSR_BITMAP |
            CPU_BASED_TPR_SHADOW |
+           CPU_BASED_MONITOR_TRAP_FLAG |
            CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
     _vmx_cpu_based_exec_control = adjust_vmx_controls(
         min, opt, MSR_IA32_VMX_PROCBASED_CTLS);
@@ -147,14 +167,15 @@ static void vmx_init_vmcs_config(void)
 #endif
 
     min = VM_EXIT_ACK_INTR_ON_EXIT;
-    opt = 0;
+    opt = VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT;
 #ifdef __x86_64__
     min |= VM_EXIT_IA32E_MODE;
 #endif
     _vmx_vmexit_control = adjust_vmx_controls(
         min, opt, MSR_IA32_VMX_EXIT_CTLS);
 
-    min = opt = 0;
+    min = 0;
+    opt = VM_ENTRY_LOAD_GUEST_PAT;
     _vmx_vmentry_control = adjust_vmx_controls(
         min, opt, MSR_IA32_VMX_ENTRY_CTLS);
 
@@ -168,6 +189,7 @@ static void vmx_init_vmcs_config(void)
         vmx_vmexit_control         = _vmx_vmexit_control;
         vmx_vmentry_control        = _vmx_vmentry_control;
         cpu_has_vmx_ins_outs_instr_info = !!(vmx_basic_msr_high & (1U<<22));
+        vmx_display_features();
     }
     else
     {
@@ -444,6 +466,8 @@ static void vmx_set_host_env(struct vcpu *v)
 {
     unsigned int cpu = smp_processor_id();
 
+    __vmwrite(HOST_GDTR_BASE,
+              (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY));
     __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);
 
     __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3);
@@ -496,8 +520,6 @@ static int construct_vmcs(struct vcpu *v)
 
     /* VMCS controls. */
     __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
-    __vmwrite(VM_EXIT_CONTROLS, vmx_vmexit_control);
-    __vmwrite(VM_ENTRY_CONTROLS, vmx_vmentry_control);
 
     v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
     v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
@@ -511,9 +533,18 @@ static int construct_vmcs(struct vcpu *v)
     else
     {
         v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+        vmx_vmexit_control &= ~(VM_EXIT_SAVE_GUEST_PAT |
+                                VM_EXIT_LOAD_HOST_PAT);
+        vmx_vmentry_control &= ~VM_ENTRY_LOAD_GUEST_PAT;
     }
 
+    /* Do not enable Monitor Trap Flag unless start single step debug */
+    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
+
     __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+    __vmwrite(VM_EXIT_CONTROLS, vmx_vmexit_control);
+    __vmwrite(VM_ENTRY_CONTROLS, vmx_vmentry_control);
+
     if ( cpu_has_vmx_secondary_exec_control )
         __vmwrite(SECONDARY_VM_EXEC_CONTROL,
                   v->arch.hvm_vmx.secondary_exec_control);
@@ -535,15 +566,14 @@ static int construct_vmcs(struct vcpu *v)
         vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS);
         vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP);
         vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP);
+        if ( cpu_has_vmx_pat && paging_mode_hap(d) )
+            vmx_disable_intercept_for_msr(v, MSR_IA32_CR_PAT);
     }
 
     /* I/O access bitmap. */
     __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0));
     __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE));
 
-    /* Host GDTR base. */
-    __vmwrite(HOST_GDTR_BASE, GDT_VIRT_START(v));
-
     /* Host data selectors. */
     __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
     __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
@@ -669,6 +699,21 @@ static int construct_vmcs(struct vcpu *v)
         __vmwrite(VIRTUAL_PROCESSOR_ID, v->arch.hvm_vmx.vpid);
     }
 
+    if ( cpu_has_vmx_pat && paging_mode_hap(d) )
+    {
+        u64 host_pat, guest_pat;
+
+        rdmsrl(MSR_IA32_CR_PAT, host_pat);
+        guest_pat = 0x7040600070406ULL;
+
+        __vmwrite(HOST_PAT, host_pat);
+        __vmwrite(GUEST_PAT, guest_pat);
+#ifdef __i386__
+        __vmwrite(HOST_PAT_HIGH, host_pat >> 32);
+        __vmwrite(GUEST_PAT_HIGH, guest_pat >> 32);
+#endif
+    }
+
     vmx_vmcs_exit(v);
 
     paging_update_paging_modes(v); /* will update HOST & GUEST_CR3 as reqd */
@@ -868,7 +913,11 @@ void vmx_do_resume(struct vcpu *v)
     if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
     {
         unsigned long intercepts = __vmread(EXCEPTION_BITMAP);
-        unsigned long mask = (1U << TRAP_debug) | (1U << TRAP_int3);
+        unsigned long mask = 1u << TRAP_int3;
+
+        if ( !cpu_has_monitor_trap_flag )
+            mask |= 1u << TRAP_debug;
+
         v->arch.hvm_vcpu.debug_state_latch = debug_state;
         if ( debug_state )
             intercepts |= mask;
@@ -881,15 +930,6 @@ void vmx_do_resume(struct vcpu *v)
     reset_stack_and_jump(vmx_asm_do_vmentry);
 }
 
-static void vmx_dump_sel(char *name, enum x86_segment seg)
-{
-    struct segment_register sreg;
-    hvm_get_segment_register(current, seg, &sreg);
-    printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016llx\n", 
-           name, sreg.sel, sreg.attr.bytes, sreg.limit,
-           (unsigned long long)sreg.base);
-}
-
 static unsigned long vmr(unsigned long field)
 {
     int rc;
@@ -898,6 +938,28 @@ static unsigned long vmr(unsigned long field)
     return rc ? 0 : val;
 }
 
+static void vmx_dump_sel(char *name, uint32_t selector)
+{
+    uint32_t sel, attr, limit;
+    uint64_t base;
+    sel = vmr(selector);
+    attr = vmr(selector + (GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR));
+    limit = vmr(selector + (GUEST_ES_LIMIT - GUEST_ES_SELECTOR));
+    base = vmr(selector + (GUEST_ES_BASE - GUEST_ES_SELECTOR));
+    printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016"PRIx64"\n",
+           name, sel, attr, limit, base);
+}
+
+static void vmx_dump_sel2(char *name, uint32_t lim)
+{
+    uint32_t limit;
+    uint64_t base;
+    limit = vmr(lim);
+    base = vmr(lim + (GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
+    printk("%s:                           limit=0x%08x, base=0x%016"PRIx64"\n",
+           name, limit, base);
+}
+
 void vmcs_dump_vcpu(struct vcpu *v)
 {
     struct cpu_user_regs *regs = &v->arch.guest_context.user_regs;
@@ -939,16 +1001,18 @@ void vmcs_dump_vcpu(struct vcpu *v)
            (unsigned long long)vmr(GUEST_SYSENTER_ESP),
            (int)vmr(GUEST_SYSENTER_CS),
            (unsigned long long)vmr(GUEST_SYSENTER_EIP));
-    vmx_dump_sel("CS", x86_seg_cs);
-    vmx_dump_sel("DS", x86_seg_ds);
-    vmx_dump_sel("SS", x86_seg_ss);
-    vmx_dump_sel("ES", x86_seg_es);
-    vmx_dump_sel("FS", x86_seg_fs);
-    vmx_dump_sel("GS", x86_seg_gs);
-    vmx_dump_sel("GDTR", x86_seg_gdtr);
-    vmx_dump_sel("LDTR", x86_seg_ldtr);
-    vmx_dump_sel("IDTR", x86_seg_idtr);
-    vmx_dump_sel("TR", x86_seg_tr);
+    vmx_dump_sel("CS", GUEST_CS_SELECTOR);
+    vmx_dump_sel("DS", GUEST_DS_SELECTOR);
+    vmx_dump_sel("SS", GUEST_SS_SELECTOR);
+    vmx_dump_sel("ES", GUEST_ES_SELECTOR);
+    vmx_dump_sel("FS", GUEST_FS_SELECTOR);
+    vmx_dump_sel("GS", GUEST_GS_SELECTOR);
+    vmx_dump_sel2("GDTR", GUEST_GDTR_LIMIT);
+    vmx_dump_sel("LDTR", GUEST_LDTR_SELECTOR);
+    vmx_dump_sel2("IDTR", GUEST_IDTR_LIMIT);
+    vmx_dump_sel("TR", GUEST_TR_SELECTOR);
+    printk("Guest PAT = 0x%08x%08x\n",
+           (uint32_t)vmr(GUEST_PAT_HIGH), (uint32_t)vmr(GUEST_PAT));
     x  = (unsigned long long)vmr(TSC_OFFSET_HIGH) << 32;
     x |= (uint32_t)vmr(TSC_OFFSET);
     printk("TSC Offset = %016llx\n", x);
@@ -987,6 +1051,8 @@ void vmcs_dump_vcpu(struct vcpu *v)
            (unsigned long long)vmr(HOST_SYSENTER_ESP),
            (int)vmr(HOST_SYSENTER_CS),
            (unsigned long long)vmr(HOST_SYSENTER_EIP));
+    printk("Host PAT = 0x%08x%08x\n",
+           (uint32_t)vmr(HOST_PAT_HIGH), (uint32_t)vmr(HOST_PAT));
 
     printk("*** Control State ***\n");
     printk("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
index 407b677899d360cc5ac4cebbab11517991e37899..3374ebb6571d2042b5e76349d2c672b551b83964 100644 (file)
@@ -49,6 +49,7 @@
 #include <asm/hvm/vpt.h>
 #include <public/hvm/save.h>
 #include <asm/hvm/trace.h>
+#include <asm/xenoprof.h>
 
 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
 
@@ -132,6 +133,7 @@ static void vmx_vcpu_destroy(struct vcpu *v)
 {
     vmx_destroy_vmcs(v);
     vpmu_destroy(v);
+    passive_domain_destroy(v);
 }
 
 #ifdef __x86_64__
@@ -185,7 +187,7 @@ static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
     check_long_mode:
         if ( !(hvm_long_mode_enabled(v)) )
         {
-            vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
+            vmx_inject_hw_exception(TRAP_gp_fault, 0);
             return HNDL_exception_raised;
         }
         break;
@@ -282,7 +284,7 @@ static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs)
  uncanonical_address:
     HVM_DBG_LOG(DBG_LEVEL_0, "Not cano address of msr write %x", ecx);
  gp_fault:
-    vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
+    vmx_inject_hw_exception(TRAP_gp_fault, 0);
  exception_raised:
     return HNDL_exception_raised;
 }
@@ -304,9 +306,6 @@ static void vmx_restore_host_msrs(void)
         wrmsrl(msr_index[i], host_msr_state->msrs[i]);
         clear_bit(i, &host_msr_state->flags);
     }
-
-    if ( cpu_has_nx && !(read_efer() & EFER_NX) )
-        write_efer(read_efer() | EFER_NX);
 }
 
 static void vmx_save_guest_msrs(struct vcpu *v)
@@ -340,39 +339,23 @@ static void vmx_restore_guest_msrs(struct vcpu *v)
         clear_bit(i, &guest_flags);
     }
 
-    if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & (EFER_NX | EFER_SCE) )
+    if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_SCE )
     {
         HVM_DBG_LOG(DBG_LEVEL_2,
                     "restore guest's EFER with value %lx",
                     v->arch.hvm_vcpu.guest_efer);
-        write_efer((read_efer() & ~(EFER_NX | EFER_SCE)) |
-                   (v->arch.hvm_vcpu.guest_efer & (EFER_NX | EFER_SCE)));
+        write_efer((read_efer() & ~EFER_SCE) |
+                   (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
     }
 }
 
 #else  /* __i386__ */
 
 #define vmx_save_host_msrs()        ((void)0)
-
-static void vmx_restore_host_msrs(void)
-{
-    if ( cpu_has_nx && !(read_efer() & EFER_NX) )
-        write_efer(read_efer() | EFER_NX);
-}
+#define vmx_restore_host_msrs()     ((void)0)
 
 #define vmx_save_guest_msrs(v)      ((void)0)
-
-static void vmx_restore_guest_msrs(struct vcpu *v)
-{
-    if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_NX )
-    {
-        HVM_DBG_LOG(DBG_LEVEL_2,
-                    "restore guest's EFER with value %lx",
-                    v->arch.hvm_vcpu.guest_efer);
-        write_efer((read_efer() & ~EFER_NX) |
-                   (v->arch.hvm_vcpu.guest_efer & EFER_NX));
-    }
-}
+#define vmx_restore_guest_msrs(v)   ((void)0)
 
 static enum handler_return long_mode_do_msr_read(struct cpu_user_regs *regs)
 {
@@ -702,6 +685,26 @@ static void vmx_ctxt_switch_to(struct vcpu *v)
     vpmu_load(v);
 }
 
+
+/* SDM volume 3b section 22.3.1.2: we can only enter virtual 8086 mode
+ * if all of CS, SS, DS, ES, FS and GS are 16bit ring-3 data segments.
+ * The guest thinks it's got ring-0 segments, so we need to fudge
+ * things.  We store the ring-3 version in the VMCS to avoid lots of
+ * shuffling on vmenter and vmexit, and translate in these accessors. */
+
+#define rm_cs_attr (((union segment_attributes) {                       \
+        .fields = { .type = 0xb, .s = 1, .dpl = 0, .p = 1, .avl = 0,    \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define rm_ds_attr (((union segment_attributes) {                       \
+        .fields = { .type = 0x3, .s = 1, .dpl = 0, .p = 1, .avl = 0,    \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define vm86_ds_attr (((union segment_attributes) {                     \
+        .fields = { .type = 0x3, .s = 1, .dpl = 3, .p = 1, .avl = 0,    \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define vm86_tr_attr (((union segment_attributes) {                     \
+        .fields = { .type = 0xb, .s = 0, .dpl = 0, .p = 1, .avl = 0,    \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+
 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
                                      struct segment_register *reg)
 {
@@ -777,14 +780,85 @@ static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
     /* Unusable flag is folded into Present flag. */
     if ( attr & (1u<<16) )
         reg->attr.fields.p = 0;
+
+    /* Adjust for virtual 8086 mode */
+    if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr 
+         && !(v->arch.hvm_vmx.vm86_segment_mask & (1u << seg)) )
+    {
+        struct segment_register *sreg = &v->arch.hvm_vmx.vm86_saved_seg[seg];
+        if ( seg == x86_seg_tr ) 
+            *reg = *sreg;
+        else if ( reg->base != sreg->base || seg == x86_seg_ss )
+        {
+            /* If the guest's reloaded the segment, remember the new version.
+             * We can't tell if the guest reloaded the segment with another 
+             * one that has the same base.  By default we assume it hasn't,
+             * since we don't want to lose big-real-mode segment attributes,
+             * but for SS we assume it has: the Ubuntu graphical bootloader
+             * does this and gets badly confused if we leave the old SS in 
+             * place. */
+            reg->attr.bytes = (seg == x86_seg_cs ? rm_cs_attr : rm_ds_attr);
+            *sreg = *reg;
+        }
+        else 
+        {
+            /* Always give realmode guests a selector that matches the base
+             * but keep the attr and limit from before */
+            *reg = *sreg;
+            reg->sel = reg->base >> 4;
+        }
+    }
 }
 
 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
                                      struct segment_register *reg)
 {
-    uint32_t attr;
+    uint32_t attr, sel, limit;
+    uint64_t base;
 
+    sel = reg->sel;
     attr = reg->attr.bytes;
+    limit = reg->limit;
+    base = reg->base;
+
+    /* Adjust CS/SS/DS/ES/FS/GS/TR for virtual 8086 mode */
+    if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr )
+    {
+        /* Remember the proper contents */
+        v->arch.hvm_vmx.vm86_saved_seg[seg] = *reg;
+        
+        if ( seg == x86_seg_tr ) 
+        {
+            if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] )
+            {
+                sel = 0;
+                attr = vm86_tr_attr;
+                limit = 0xff;
+                base = v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS];
+                v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
+            }
+            else
+                v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
+        }
+        else
+        {
+            /* Try to fake it out as a 16bit data segment.  This could
+             * cause confusion for the guest if it reads the selector,
+             * but otherwise we have to emulate if *any* segment hasn't
+             * been reloaded. */
+            if ( base < 0x100000 && !(base & 0xf) && limit >= 0xffff
+                 && reg->attr.fields.p )
+            {
+                sel = base >> 4;
+                attr = vm86_ds_attr;
+                limit = 0xffff;
+                v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
+            }
+            else 
+                v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
+        }
+    }
+
     attr = ((attr & 0xf00) << 4) | (attr & 0xff);
 
     /* Not-present must mean unusable. */
@@ -792,67 +866,67 @@ static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
         attr |= (1u << 16);
 
     /* VMX has strict consistency requirement for flag G. */
-    attr |= !!(reg->limit >> 20) << 15;
+    attr |= !!(limit >> 20) << 15;
 
     vmx_vmcs_enter(v);
 
     switch ( seg )
     {
     case x86_seg_cs:
-        __vmwrite(GUEST_CS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_CS_LIMIT, reg->limit);
-        __vmwrite(GUEST_CS_BASE, reg->base);
+        __vmwrite(GUEST_CS_SELECTOR, sel);
+        __vmwrite(GUEST_CS_LIMIT, limit);
+        __vmwrite(GUEST_CS_BASE, base);
         __vmwrite(GUEST_CS_AR_BYTES, attr);
         break;
     case x86_seg_ds:
-        __vmwrite(GUEST_DS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_DS_LIMIT, reg->limit);
-        __vmwrite(GUEST_DS_BASE, reg->base);
+        __vmwrite(GUEST_DS_SELECTOR, sel);
+        __vmwrite(GUEST_DS_LIMIT, limit);
+        __vmwrite(GUEST_DS_BASE, base);
         __vmwrite(GUEST_DS_AR_BYTES, attr);
         break;
     case x86_seg_es:
-        __vmwrite(GUEST_ES_SELECTOR, reg->sel);
-        __vmwrite(GUEST_ES_LIMIT, reg->limit);
-        __vmwrite(GUEST_ES_BASE, reg->base);
+        __vmwrite(GUEST_ES_SELECTOR, sel);
+        __vmwrite(GUEST_ES_LIMIT, limit);
+        __vmwrite(GUEST_ES_BASE, base);
         __vmwrite(GUEST_ES_AR_BYTES, attr);
         break;
     case x86_seg_fs:
-        __vmwrite(GUEST_FS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_FS_LIMIT, reg->limit);
-        __vmwrite(GUEST_FS_BASE, reg->base);
+        __vmwrite(GUEST_FS_SELECTOR, sel);
+        __vmwrite(GUEST_FS_LIMIT, limit);
+        __vmwrite(GUEST_FS_BASE, base);
         __vmwrite(GUEST_FS_AR_BYTES, attr);
         break;
     case x86_seg_gs:
-        __vmwrite(GUEST_GS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_GS_LIMIT, reg->limit);
-        __vmwrite(GUEST_GS_BASE, reg->base);
+        __vmwrite(GUEST_GS_SELECTOR, sel);
+        __vmwrite(GUEST_GS_LIMIT, limit);
+        __vmwrite(GUEST_GS_BASE, base);
         __vmwrite(GUEST_GS_AR_BYTES, attr);
         break;
     case x86_seg_ss:
-        __vmwrite(GUEST_SS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_SS_LIMIT, reg->limit);
-        __vmwrite(GUEST_SS_BASE, reg->base);
+        __vmwrite(GUEST_SS_SELECTOR, sel);
+        __vmwrite(GUEST_SS_LIMIT, limit);
+        __vmwrite(GUEST_SS_BASE, base);
         __vmwrite(GUEST_SS_AR_BYTES, attr);
         break;
     case x86_seg_tr:
-        __vmwrite(GUEST_TR_SELECTOR, reg->sel);
-        __vmwrite(GUEST_TR_LIMIT, reg->limit);
-        __vmwrite(GUEST_TR_BASE, reg->base);
+        __vmwrite(GUEST_TR_SELECTOR, sel);
+        __vmwrite(GUEST_TR_LIMIT, limit);
+        __vmwrite(GUEST_TR_BASE, base);
         /* VMX checks that the the busy flag (bit 1) is set. */
         __vmwrite(GUEST_TR_AR_BYTES, attr | 2);
         break;
     case x86_seg_gdtr:
-        __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
-        __vmwrite(GUEST_GDTR_BASE, reg->base);
+        __vmwrite(GUEST_GDTR_LIMIT, limit);
+        __vmwrite(GUEST_GDTR_BASE, base);
         break;
     case x86_seg_idtr:
-        __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
-        __vmwrite(GUEST_IDTR_BASE, reg->base);
+        __vmwrite(GUEST_IDTR_LIMIT, limit);
+        __vmwrite(GUEST_IDTR_BASE, base);
         break;
     case x86_seg_ldtr:
-        __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
-        __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
-        __vmwrite(GUEST_LDTR_BASE, reg->base);
+        __vmwrite(GUEST_LDTR_SELECTOR, sel);
+        __vmwrite(GUEST_LDTR_LIMIT, limit);
+        __vmwrite(GUEST_LDTR_BASE, base);
         __vmwrite(GUEST_LDTR_AR_BYTES, attr);
         break;
     default:
@@ -968,6 +1042,7 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
     switch ( cr )
     {
     case 0: {
+        int realmode;
         unsigned long hw_cr0_mask =
             X86_CR0_NE | X86_CR0_PG | X86_CR0_PE;
 
@@ -996,9 +1071,44 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
                 vmx_fpu_enter(v);
         }
 
-        v->arch.hvm_vmx.vmxemul &= ~VMXEMUL_REALMODE;
-        if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
-            v->arch.hvm_vmx.vmxemul |= VMXEMUL_REALMODE;
+        realmode = !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE); 
+        if ( realmode != v->arch.hvm_vmx.vmx_realmode )
+        {
+            enum x86_segment s; 
+            struct segment_register reg[x86_seg_tr + 1];
+
+            /* Entering or leaving real mode: adjust the segment registers.
+             * Need to read them all either way, as realmode reads can update
+             * the saved values we'll use when returning to prot mode. */
+            for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
+                vmx_get_segment_register(v, s, &reg[s]);
+            v->arch.hvm_vmx.vmx_realmode = realmode;
+            
+            if ( realmode )
+            {
+                for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
+                    vmx_set_segment_register(v, s, &reg[s]);
+                v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
+                __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
+                __vmwrite(EXCEPTION_BITMAP, 0xffffffff);
+            }
+            else 
+            {
+                for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ ) 
+                    if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<<s)) )
+                        vmx_set_segment_register(
+                            v, s, &v->arch.hvm_vmx.vm86_saved_seg[s]);
+                v->arch.hvm_vcpu.hw_cr[4] =
+                    ((v->arch.hvm_vcpu.hw_cr[4] & ~X86_CR4_VME)
+                     |(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VME));
+                __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
+                __vmwrite(EXCEPTION_BITMAP, 
+                          HVM_TRAP_MASK
+                          | (paging_mode_hap(v->domain) ?
+                             0 : (1U << TRAP_page_fault))
+                          | (1U << TRAP_no_device));
+            }
+        }
 
         v->arch.hvm_vcpu.hw_cr[0] =
             v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
@@ -1026,6 +1136,8 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
         if ( paging_mode_hap(v->domain) )
             v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
         v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
+        if ( v->arch.hvm_vmx.vmx_realmode ) 
+            v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
         if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
         {
             v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
@@ -1059,8 +1171,8 @@ static void vmx_update_guest_efer(struct vcpu *v)
 #endif
 
     if ( v == current )
-        write_efer((read_efer() & ~(EFER_NX|EFER_SCE)) |
-                   (v->arch.hvm_vcpu.guest_efer & (EFER_NX|EFER_SCE)));
+        write_efer((read_efer() & ~EFER_SCE) |
+                   (v->arch.hvm_vcpu.guest_efer & EFER_SCE));
 }
 
 static void vmx_flush_guest_tlbs(void)
@@ -1092,10 +1204,10 @@ void ept_sync_domain(struct domain *d)
     }
 }
 
-static void __vmx_inject_exception(
-    struct vcpu *v, int trap, int type, int error_code)
+static void __vmx_inject_exception(int trap, int type, int error_code)
 {
     unsigned long intr_fields;
+    struct vcpu *curr = current;
 
     /*
      * NB. Callers do not need to worry about clearing STI/MOV-SS blocking:
@@ -1113,16 +1225,35 @@ static void __vmx_inject_exception(
 
     __vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
 
-    if ( trap == TRAP_page_fault )
-        HVMTRACE_LONG_2D(PF_INJECT, error_code,
-            TRC_PAR_LONG(v->arch.hvm_vcpu.guest_cr[2]));
-    else
-        HVMTRACE_2D(INJ_EXC, trap, error_code);
+    /* Can't inject exceptions in virtual 8086 mode because they would 
+     * use the protected-mode IDT.  Emulate at the next vmenter instead. */
+    if ( curr->arch.hvm_vmx.vmx_realmode ) 
+        curr->arch.hvm_vmx.vmx_emulate = 1;
 }
 
-void vmx_inject_hw_exception(struct vcpu *v, int trap, int error_code)
+void vmx_inject_hw_exception(int trap, int error_code)
 {
     unsigned long intr_info = __vmread(VM_ENTRY_INTR_INFO);
+    struct vcpu *curr = current;
+
+    switch ( trap )
+    {
+    case TRAP_debug:
+        if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF )
+        {
+            __restore_debug_registers(curr);
+            write_debugreg(6, read_debugreg(6) | 0x4000);
+        }
+        if ( cpu_has_monitor_trap_flag )
+            break;
+    case TRAP_int3:
+        if ( curr->domain->debugger_attached )
+        {
+            /* Debug/Int3: Trap to debugger. */
+            domain_pause_for_debugger();
+            return;
+        }
+    }
 
     if ( unlikely(intr_info & INTR_INFO_VALID_MASK) &&
          (((intr_info >> 8) & 7) == X86_EVENTTYPE_HW_EXCEPTION) )
@@ -1132,37 +1263,34 @@ void vmx_inject_hw_exception(struct vcpu *v, int trap, int error_code)
             error_code = 0;
     }
 
-    __vmx_inject_exception(v, trap, X86_EVENTTYPE_HW_EXCEPTION, error_code);
+    __vmx_inject_exception(trap, X86_EVENTTYPE_HW_EXCEPTION, error_code);
+
+    if ( trap == TRAP_page_fault )
+        HVMTRACE_LONG_2D(PF_INJECT, error_code,
+                         TRC_PAR_LONG(current->arch.hvm_vcpu.guest_cr[2]));
+    else
+        HVMTRACE_2D(INJ_EXC, trap, error_code);
 }
 
-void vmx_inject_extint(struct vcpu *v, int trap)
+void vmx_inject_extint(int trap)
 {
-    __vmx_inject_exception(v, trap, X86_EVENTTYPE_EXT_INTR,
+    __vmx_inject_exception(trap, X86_EVENTTYPE_EXT_INTR,
                            HVM_DELIVER_NO_ERROR_CODE);
 }
 
-void vmx_inject_nmi(struct vcpu *v)
+void vmx_inject_nmi(void)
 {
-    __vmx_inject_exception(v, 2, X86_EVENTTYPE_NMI,
+    __vmx_inject_exception(2, X86_EVENTTYPE_NMI,
                            HVM_DELIVER_NO_ERROR_CODE);
 }
 
 static void vmx_inject_exception(
     unsigned int trapnr, int errcode, unsigned long cr2)
 {
-    struct vcpu *curr = current;
-
-    vmx_inject_hw_exception(curr, trapnr, errcode);
-
     if ( trapnr == TRAP_page_fault )
-        curr->arch.hvm_vcpu.guest_cr[2] = cr2;
+        current->arch.hvm_vcpu.guest_cr[2] = cr2;
 
-    if ( (trapnr == TRAP_debug) &&
-         (guest_cpu_user_regs()->eflags & X86_EFLAGS_TF) )
-    {
-        __restore_debug_registers(curr);
-        write_debugreg(6, read_debugreg(6) | 0x4000);
-    }
+    vmx_inject_hw_exception(trapnr, errcode);
 }
 
 static int vmx_event_pending(struct vcpu *v)
@@ -1186,8 +1314,29 @@ static void vmx_set_uc_mode(struct vcpu *v)
 
 static void vmx_set_info_guest(struct vcpu *v)
 {
+    unsigned long intr_shadow;
+
     vmx_vmcs_enter(v);
+
     __vmwrite(GUEST_DR7, v->arch.guest_context.debugreg[7]);
+
+    /* 
+     * If the interruptibility-state field indicates blocking by STI,
+     * setting the TF flag in the EFLAGS may cause VM entry to fail
+     * and crash the guest. See SDM 3B 22.3.1.5.
+     * Resetting the VMX_INTR_SHADOW_STI flag looks hackish but
+     * to set the GUEST_PENDING_DBG_EXCEPTIONS.BS here incurs
+     * immediately vmexit and hence make no progress.
+     */
+    intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);
+    if ( v->domain->debugger_attached &&
+         (v->arch.guest_context.user_regs.eflags & X86_EFLAGS_TF) &&
+         (intr_shadow & VMX_INTR_SHADOW_STI) )
+    {
+        intr_shadow &= ~VMX_INTR_SHADOW_STI;
+        __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
+    }
+
     vmx_vmcs_exit(v);
 }
 
@@ -1313,7 +1462,7 @@ static void __update_guest_eip(unsigned long inst_len)
     }
 
     if ( regs->eflags & X86_EFLAGS_TF )
-        vmx_inject_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE, 0);
+        vmx_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
 }
 
 static void vmx_fpu_dirty_intercept(void)
@@ -1634,7 +1783,6 @@ static int vmx_msr_read_intercept(struct cpu_user_regs *regs)
 {
     u64 msr_content = 0;
     u32 ecx = regs->ecx, eax, edx;
-    struct vcpu *v = current;
 
     HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
 
@@ -1666,6 +1814,8 @@ static int vmx_msr_read_intercept(struct cpu_user_regs *regs)
     default:
         if ( vpmu_do_rdmsr(regs) )
             goto done;
+        if ( passive_domain_do_rdmsr(regs) )
+            goto done;
         switch ( long_mode_do_msr_read(regs) )
         {
             case HNDL_unhandled:
@@ -1708,7 +1858,7 @@ done:
     return X86EMUL_OKAY;
 
 gp_fault:
-    vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
+    vmx_inject_hw_exception(TRAP_gp_fault, 0);
     return X86EMUL_EXCEPTION;
 }
 
@@ -1845,7 +1995,7 @@ static int vmx_msr_write_intercept(struct cpu_user_regs *regs)
 
         if ( (rc < 0) ||
              (vmx_add_host_load_msr(ecx) < 0) )
-            vmx_inject_hw_exception(v, TRAP_machine_check, 0);
+            vmx_inject_hw_exception(TRAP_machine_check, 0);
         else
         {
             __vmwrite(GUEST_IA32_DEBUGCTL, msr_content);
@@ -1861,6 +2011,8 @@ static int vmx_msr_write_intercept(struct cpu_user_regs *regs)
     default:
         if ( vpmu_do_wrmsr(regs) )
             return X86EMUL_OKAY;
+        if ( passive_domain_do_wrmsr(regs) )
+            return X86EMUL_OKAY;
 
         if ( wrmsr_viridian_regs(ecx, regs->eax, regs->edx) ) 
             break;
@@ -1883,7 +2035,7 @@ static int vmx_msr_write_intercept(struct cpu_user_regs *regs)
     return X86EMUL_OKAY;
 
 gp_fault:
-    vmx_inject_hw_exception(v, TRAP_gp_fault, 0);
+    vmx_inject_hw_exception(TRAP_gp_fault, 0);
     return X86EMUL_EXCEPTION;
 }
 
@@ -1899,7 +2051,8 @@ static void vmx_do_extint(struct cpu_user_regs *regs)
     fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
     fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
     fastcall void smp_pmu_apic_interrupt(struct cpu_user_regs *regs);
-#ifdef CONFIG_X86_MCE_P4THERMAL
+    fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs);
+#ifdef CONFIG_X86_MCE_THERMAL
     fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
 #endif
 
@@ -1929,10 +2082,13 @@ static void vmx_do_extint(struct cpu_user_regs *regs)
     case ERROR_APIC_VECTOR:
         smp_error_interrupt(regs);
         break;
+    case CMCI_APIC_VECTOR:
+        smp_cmci_interrupt(regs);
+        break;
     case PMU_APIC_VECTOR:
         smp_pmu_apic_interrupt(regs);
         break;
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#ifdef CONFIG_X86_MCE_THERMAL
     case THERMAL_APIC_VECTOR:
         smp_thermal_interrupt(regs);
         break;
@@ -1964,44 +2120,68 @@ static void ept_handle_violation(unsigned long qualification, paddr_t gpa)
 {
     unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK;
     struct domain *d = current->domain;
-    unsigned long gfn = gpa >> PAGE_SHIFT;
+    unsigned long gla, gfn = gpa >> PAGE_SHIFT;
     mfn_t mfn;
     p2m_type_t t;
 
-    if ( unlikely(qualification & EPT_GAW_VIOLATION) )
-    {
-        gdprintk(XENLOG_ERR, "EPT violation: guest physical address %"PRIpaddr
-                 " exceeded its width limit.\n", gpa);
-        goto crash;
-    }
+    mfn = gfn_to_mfn_guest(d, gfn, &t);
 
-    if ( unlikely(gla_validity == EPT_GLA_VALIDITY_RSVD) ||
-         unlikely(gla_validity == EPT_GLA_VALIDITY_PDPTR_LOAD) )
+    /* There are three legitimate reasons for taking an EPT violation. 
+     * One is a guest access to MMIO space. */
+    if ( gla_validity == EPT_GLA_VALIDITY_MATCH && p2m_is_mmio(t) )
     {
-        gdprintk(XENLOG_ERR, "EPT violation: reserved bit or "
-                 "pdptr load violation.\n");
-        goto crash;
+        handle_mmio();
+        return;
     }
 
-    mfn = gfn_to_mfn(d, gfn, &t);
-    if ( (t != p2m_ram_ro) && p2m_is_ram(t) && paging_mode_log_dirty(d) )
+    /* The second is log-dirty mode, writing to a read-only page;
+     * The third is populating a populate-on-demand page. */
+    if ( (gla_validity == EPT_GLA_VALIDITY_MATCH
+          || gla_validity == EPT_GLA_VALIDITY_GPT_WALK)
+         && p2m_is_ram(t) && (t != p2m_ram_ro) )
     {
-        paging_mark_dirty(d, mfn_x(mfn));
-        p2m_change_type(d, gfn, p2m_ram_logdirty, p2m_ram_rw);
-        flush_tlb_mask(d->domain_dirty_cpumask);
+        if ( paging_mode_log_dirty(d) )
+        {
+            paging_mark_dirty(d, mfn_x(mfn));
+            p2m_change_type(d, gfn, p2m_ram_logdirty, p2m_ram_rw);
+            flush_tlb_mask(d->domain_dirty_cpumask);
+        }
         return;
     }
 
-    /* This can only happen in log-dirty mode, writing back A/D bits. */
-    if ( unlikely(gla_validity == EPT_GLA_VALIDITY_GPT_WALK) )
-        goto crash;
-
-    ASSERT(gla_validity == EPT_GLA_VALIDITY_MATCH);
-    handle_mmio();
-
-    return;
+    /* Everything else is an error. */
+    gla = __vmread(GUEST_LINEAR_ADDRESS);
+    gdprintk(XENLOG_ERR, "EPT violation %#lx (%c%c%c/%c%c%c), "
+             "gpa %#"PRIpaddr", mfn %#lx, type %i.\n", 
+             qualification, 
+             (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
+             (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
+             (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
+             (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
+             (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
+             (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
+             gpa, mfn_x(mfn), t);
+
+    if ( qualification & EPT_GAW_VIOLATION )
+        gdprintk(XENLOG_ERR, " --- GPA too wide (max %u bits)\n", 
+                 9 * (unsigned) d->arch.hvm_domain.vmx.ept_control.gaw + 21);
+
+    switch ( gla_validity )
+    {
+    case EPT_GLA_VALIDITY_PDPTR_LOAD:
+        gdprintk(XENLOG_ERR, " --- PDPTR load failed\n"); 
+        break;
+    case EPT_GLA_VALIDITY_GPT_WALK:
+        gdprintk(XENLOG_ERR, " --- guest PT walk to %#lx failed\n", gla);
+        break;
+    case EPT_GLA_VALIDITY_RSVD:
+        gdprintk(XENLOG_ERR, " --- GLA_validity 2 (reserved)\n");
+        break;
+    case EPT_GLA_VALIDITY_MATCH:
+        gdprintk(XENLOG_ERR, " --- guest access to %#lx failed\n", gla);
+        break;
+    }
 
- crash:
     domain_crash(d);
 }
 
@@ -2038,6 +2218,17 @@ static void vmx_failed_vmentry(unsigned int exit_reason,
     domain_crash(curr->domain);
 }
 
+asmlinkage void vmx_enter_realmode(struct cpu_user_regs *regs)
+{
+    struct vcpu *v = current;
+
+    /* Adjust RFLAGS to enter virtual 8086 mode with IOPL == 3.  Since
+     * we have CR4.VME == 1 and our own TSS with an empty interrupt
+     * redirection bitmap, all software INTs will be handled by vm86 */
+    v->arch.hvm_vmx.vm86_saved_eflags = regs->eflags;
+    regs->eflags |= (X86_EFLAGS_VM | X86_EFLAGS_IOPL);
+}
+
 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
 {
     unsigned int exit_reason, idtv_info;
@@ -2056,12 +2247,52 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
 
     perfc_incra(vmexits, exit_reason);
 
-    if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
-        local_irq_enable();
+    /* Handle the interrupt we missed before allowing any more in. */
+    if ( exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT )
+        vmx_do_extint(regs);
+
+    /* Now enable interrupts so it's safe to take locks. */
+    local_irq_enable();
 
     if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
         return vmx_failed_vmentry(exit_reason, regs);
 
+    if ( v->arch.hvm_vmx.vmx_realmode )
+    {
+        unsigned int vector;
+
+        /* Put RFLAGS back the way the guest wants it */
+        regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL);
+        regs->eflags |= (v->arch.hvm_vmx.vm86_saved_eflags & X86_EFLAGS_IOPL);
+
+        /* Unless this exit was for an interrupt, we've hit something
+         * vm86 can't handle.  Try again, using the emulator. */
+        switch ( exit_reason )
+        {
+        case EXIT_REASON_EXCEPTION_NMI:
+            vector = __vmread(VM_EXIT_INTR_INFO) & INTR_INFO_VECTOR_MASK;;
+            if ( vector != TRAP_page_fault
+                 && vector != TRAP_nmi 
+                 && vector != TRAP_machine_check ) 
+            {
+                perfc_incr(realmode_exits);
+                v->arch.hvm_vmx.vmx_emulate = 1;
+                return;
+            }
+        case EXIT_REASON_EXTERNAL_INTERRUPT:
+        case EXIT_REASON_INIT:
+        case EXIT_REASON_SIPI:
+        case EXIT_REASON_PENDING_VIRT_INTR:
+        case EXIT_REASON_PENDING_VIRT_NMI:
+        case EXIT_REASON_MACHINE_CHECK:
+            break;
+        default:
+            v->arch.hvm_vmx.vmx_emulate = 1;
+            perfc_incr(realmode_exits);
+            return;
+        }
+    }
+
     hvm_maybe_deassert_evtchn_irq();
 
     /* Event delivery caused this intercept? Queue for redelivery. */
@@ -2128,7 +2359,7 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
              */
             exit_qualification = __vmread(EXIT_QUALIFICATION);
             write_debugreg(6, exit_qualification | 0xffff0ff0);
-            if ( !v->domain->debugger_attached )
+            if ( !v->domain->debugger_attached || cpu_has_monitor_trap_flag )
                 goto exit_and_crash;
             domain_pause_for_debugger();
             break;
@@ -2166,7 +2397,7 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
             }
 
             v->arch.hvm_vcpu.guest_cr[2] = exit_qualification;
-            vmx_inject_hw_exception(v, TRAP_page_fault, regs->error_code);
+            vmx_inject_hw_exception(TRAP_page_fault, regs->error_code);
             break;
         case TRAP_nmi:
             if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
@@ -2185,7 +2416,7 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
         break;
     }
     case EXIT_REASON_EXTERNAL_INTERRUPT:
-        vmx_do_extint(regs);
+        /* Already handled above. */
         break;
     case EXIT_REASON_TRIPLE_FAULT:
         hvm_triple_fault();
@@ -2286,7 +2517,7 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
     case EXIT_REASON_VMWRITE:
     case EXIT_REASON_VMXOFF:
     case EXIT_REASON_VMXON:
-        vmx_inject_hw_exception(v, TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+        vmx_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
         break;
 
     case EXIT_REASON_TPR_BELOW_THRESHOLD:
@@ -2295,7 +2526,7 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
     case EXIT_REASON_IO_INSTRUCTION:
     case EXIT_REASON_APIC_ACCESS:
         if ( !handle_mmio() )
-            hvm_inject_exception(TRAP_gp_fault, 0, 0);
+            vmx_inject_hw_exception(TRAP_gp_fault, 0);
         break;
 
     case EXIT_REASON_INVD:
@@ -2318,6 +2549,15 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
         break;
     }
 
+    case EXIT_REASON_MONITOR_TRAP_FLAG:
+    {
+        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
+        __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+        if ( v->domain->debugger_attached && v->arch.hvm_vcpu.single_step )
+            domain_pause_for_debugger();
+        break;
+    }
+
     default:
     exit_and_crash:
         gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
index 5062d9c28d462c7513f1594ffb3dd948099e2b08..37054f22ebf327d5d59ec9b7ec5eca57de1555c8 100644 (file)
 #include <asm/hvm/vmx/vpmu.h>
 #include <asm/hvm/vmx/vpmu_core2.h>
 
+u32 core2_counters_msr[] =   {
+    MSR_CORE_PERF_FIXED_CTR0,
+    MSR_CORE_PERF_FIXED_CTR1,
+    MSR_CORE_PERF_FIXED_CTR2};
+
+/* Core 2 Non-architectual Performance Control MSRs. */
+u32 core2_ctrls_msr[] = {
+    MSR_CORE_PERF_FIXED_CTR_CTRL,
+    MSR_IA32_PEBS_ENABLE,
+    MSR_IA32_DS_AREA};
+
+struct pmumsr core2_counters = {
+    3,
+    core2_counters_msr
+};
+
+struct pmumsr core2_ctrls = {
+    3,
+    core2_ctrls_msr
+};
 static int arch_pmc_cnt;
 
 static int core2_get_pmc_count(void)
@@ -276,7 +296,8 @@ static int core2_vpmu_msr_common_check(u32 msr_index, int *type, int *index)
         return 0;
 
     if ( unlikely(!(vpmu->flags & VPMU_CONTEXT_ALLOCATED)) &&
-         !core2_vpmu_alloc_resource(current) )
+        (vpmu->context != NULL ||
+         !core2_vpmu_alloc_resource(current)) )
         return 0;
     vpmu->flags |= VPMU_CONTEXT_ALLOCATED;
 
@@ -315,7 +336,7 @@ static int core2_vpmu_do_wrmsr(struct cpu_user_regs *regs)
     case MSR_CORE_PERF_GLOBAL_STATUS:
         gdprintk(XENLOG_INFO, "Can not write readonly MSR: "
                  "MSR_PERF_GLOBAL_STATUS(0x38E)!\n");
-        vmx_inject_hw_exception(current, TRAP_gp_fault, 0);
+        vmx_inject_hw_exception(TRAP_gp_fault, 0);
         return 1;
     case MSR_IA32_PEBS_ENABLE:
         if ( msr_content & 1 )
@@ -461,13 +482,14 @@ static void core2_vpmu_destroy(struct vcpu *v)
     struct vpmu_struct *vpmu = vcpu_vpmu(v);
     struct core2_vpmu_context *core2_vpmu_cxt = vpmu->context;
 
-    if ( !vpmu->flags & VPMU_CONTEXT_ALLOCATED )
+    if ( !(vpmu->flags & VPMU_CONTEXT_ALLOCATED) )
         return;
     xfree(core2_vpmu_cxt->pmu_enable);
     xfree(vpmu->context);
     if ( cpu_has_vmx_msr_bitmap )
         core2_vpmu_unset_msr_bitmap(v->arch.hvm_vmx.msr_bitmap);
     release_pmu_ownship(PMU_OWNER_HVM);
+    vpmu->flags &= ~VPMU_CONTEXT_ALLOCATED;
 }
 
 struct arch_vpmu_ops core2_vpmu_ops = {
index 30486ce93595543570aaf5ef8dd299270114da1c..cf43f1e5448f11c8b6281b7ef820d3762694c4e7 100644 (file)
@@ -56,7 +56,7 @@ static int vpic_get_priority(struct hvm_hw_vpic *vpic, uint8_t mask)
 
     /* prio = ffs(mask ROR vpic->priority_add); */
     asm ( "ror %%cl,%b1 ; bsf %1,%0"
-          : "=r" (prio) : "r" ((uint32_t)mask), "c" (vpic->priority_add) );
+          : "=r" (prio) : "q" ((uint32_t)mask), "c" (vpic->priority_add) );
     return prio;
 }
 
index 1b4b4f58404e5e2fc5347ef1be894986350ad9c6..929d4cf57d2a1b07a80614ba61c503732c852566 100644 (file)
@@ -209,7 +209,8 @@ static void pt_timer_fn(void *data)
         set_timer(&pt->timer, pt->scheduled);
     }
 
-    vcpu_kick(pt->vcpu);
+    if ( !pt_irq_masked(pt) )
+        vcpu_kick(pt->vcpu);
 
     pt_unlock(pt);
 }
@@ -355,8 +356,8 @@ void pt_migrate(struct vcpu *v)
 }
 
 void create_periodic_time(
-    struct vcpu *v, struct periodic_time *pt, uint64_t period,
-    uint8_t irq, char one_shot, time_cb *cb, void *data)
+    struct vcpu *v, struct periodic_time *pt, uint64_t delta,
+    uint64_t period, uint8_t irq, time_cb *cb, void *data)
 {
     ASSERT(pt->source != 0);
 
@@ -368,13 +369,13 @@ void create_periodic_time(
     pt->do_not_freeze = 0;
     pt->irq_issued = 0;
 
-    /* Periodic timer must be at least 0.9ms. */
-    if ( (period < 900000) && !one_shot )
+    /* Periodic timer must be at least 0.1ms. */
+    if ( (period < 100000) && period )
     {
         if ( !test_and_set_bool(pt->warned_timeout_too_short) )
             gdprintk(XENLOG_WARNING, "HVM_PlatformTime: program too "
                      "small period %"PRIu64"\n", period);
-        period = 900000;
+        period = 100000;
     }
 
     pt->period = period;
@@ -382,15 +383,27 @@ void create_periodic_time(
     pt->last_plt_gtime = hvm_get_guest_time(pt->vcpu);
     pt->irq = irq;
     pt->period_cycles = (u64)period;
-    pt->one_shot = one_shot;
-    pt->scheduled = NOW() + period;
-    /*
-     * Offset LAPIC ticks from other timer ticks. Otherwise guests which use
-     * LAPIC ticks for process accounting can see long sequences of process
-     * ticks incorrectly accounted to interrupt processing.
-     */
-    if ( pt->source == PTSRC_lapic )
-        pt->scheduled += period >> 1;
+    pt->one_shot = !period;
+    pt->scheduled = NOW() + delta;
+
+    if ( !pt->one_shot )
+    {
+        if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VPT_ALIGN] )
+        {
+            pt->scheduled = align_timer(pt->scheduled, pt->period);
+        }
+        else if ( pt->source == PTSRC_lapic )
+        {
+            /*
+             * Offset LAPIC ticks from other timer ticks. Otherwise guests
+             * which use LAPIC ticks for process accounting can see long
+             * sequences of process ticks incorrectly accounted to interrupt
+             * processing (seen with RHEL3 guest).
+             */
+            pt->scheduled += delta >> 1;
+        }
+    }
+
     pt->cb = cb;
     pt->priv = data;
 
index 555d937c4d68a79f1a810e326e8dfb4120d833ee..29cd86ee3a52ab95233b5562a36c5112b19fa8e6 100644 (file)
@@ -74,6 +74,7 @@ BUILD_SMP_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_SMP_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 BUILD_SMP_INTERRUPT(pmu_apic_interrupt,PMU_APIC_VECTOR)
 BUILD_SMP_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
+BUILD_SMP_INTERRUPT(cmci_interrupt, CMCI_APIC_VECTOR)
 
 #define IRQ(x,y) \
     IRQ##x##y##_interrupt
@@ -390,7 +391,7 @@ void __init init_IRQ(void)
 
     init_8259A(0);
 
-    for ( i = 0; i < NR_IRQS; i++ )
+    for ( i = 0; i < NR_VECTORS; i++ )
     {
         irq_desc[i].status  = IRQ_DISABLED;
         irq_desc[i].handler = &no_irq_type;
@@ -409,8 +410,8 @@ void __init init_IRQ(void)
     }
 
     /* Never allocate the hypercall vector or Linux/BSD fast-trap vector. */
-    vector_irq[HYPERCALL_VECTOR] = NEVER_ASSIGN;
-    vector_irq[0x80] = NEVER_ASSIGN;
+    vector_irq[HYPERCALL_VECTOR] = NEVER_ASSIGN_IRQ;
+    vector_irq[0x80] = NEVER_ASSIGN_IRQ;
 
     apic_intr_init();
 
index aa21f18104c52c023cc8357499c4a5046186a351..f99907439e8db0f7b0c0abfe1f7c9aaec7280054 100644 (file)
@@ -49,7 +49,6 @@ atomic_t irq_mis_count;
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 
 static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
 
 int skip_ioapic_setup;
 
@@ -84,12 +83,11 @@ int disable_timer_pin_1 __initdata;
 
 static struct irq_pin_list {
     int apic, pin, next;
-} irq_2_pin[PIN_MAP_SIZE];
+} irq_2_pin[PIN_MAP_SIZE] = {
+    [0 ... PIN_MAP_SIZE-1].pin = -1
+};
 static int irq_2_pin_free_entry = NR_IRQS;
 
-int vector_irq[NR_VECTORS] __read_mostly = {
-    [0 ... NR_VECTORS - 1] = FREE_TO_ASSIGN};
-
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
@@ -665,57 +663,7 @@ static inline int IO_APIC_irq_trigger(int irq)
 }
 
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
-
-int free_irq_vector(int vector)
-{
-    int irq;
-
-    BUG_ON((vector > LAST_DYNAMIC_VECTOR) || (vector < FIRST_DYNAMIC_VECTOR));
-
-    spin_lock(&vector_lock);
-    if ((irq = vector_irq[vector]) == AUTO_ASSIGN)
-        vector_irq[vector] = FREE_TO_ASSIGN;
-    spin_unlock(&vector_lock);
-
-    return (irq == AUTO_ASSIGN) ? 0 : -EINVAL;
-}
-
-int assign_irq_vector(int irq)
-{
-    static unsigned current_vector = FIRST_DYNAMIC_VECTOR;
-    unsigned vector;
-
-    BUG_ON(irq >= NR_IRQ_VECTORS);
-
-    spin_lock(&vector_lock);
-
-    if ((irq != AUTO_ASSIGN) && (IO_APIC_VECTOR(irq) > 0)) {
-        spin_unlock(&vector_lock);
-        return IO_APIC_VECTOR(irq);
-    }
-
-    vector = current_vector;
-    while (vector_irq[vector] != FREE_TO_ASSIGN) {
-        vector += 8;
-        if (vector > LAST_DYNAMIC_VECTOR)
-            vector = FIRST_DYNAMIC_VECTOR + ((vector + 1) & 7);
-
-        if (vector == current_vector) {
-            spin_unlock(&vector_lock);
-            return -ENOSPC;
-        }
-    }
-
-    current_vector = vector;
-    vector_irq[vector] = irq;
-    if (irq != AUTO_ASSIGN)
-        IO_APIC_VECTOR(irq) = vector;
-
-    spin_unlock(&vector_lock);
-
-    return vector;
-}
+u8 irq_vector[NR_IRQS] __read_mostly;
 
 static struct hw_interrupt_type ioapic_level_type;
 static struct hw_interrupt_type ioapic_edge_type;
@@ -1018,11 +966,6 @@ static void __init enable_IO_APIC(void)
     int i, apic;
     unsigned long flags;
 
-    for (i = 0; i < PIN_MAP_SIZE; i++) {
-        irq_2_pin[i].pin = -1;
-        irq_2_pin[i].next = 0;
-    }
-
     /* Initialise dynamic irq_2_pin free list. */
     for (i = NR_IRQS; i < PIN_MAP_SIZE; i++)
         irq_2_pin[i].next = i + 1;
@@ -1257,14 +1200,16 @@ static void __init setup_ioapic_ids_from_mpc(void) { }
 static int __init timer_irq_works(void)
 {
     extern unsigned long pit0_ticks;
-    unsigned long t1;
+    unsigned long t1, flags;
 
     t1 = pit0_ticks;
     mb();
 
+    local_save_flags(flags);
     local_irq_enable();
     /* Let ten ticks pass... */
     mdelay((10 * 1000) / HZ);
+    local_irq_restore(flags);
 
     /*
      * Expect a few ticks at least, to be sure some possible
@@ -1547,42 +1492,33 @@ static struct hw_interrupt_type ioapic_level_type = {
     .set_affinity      = set_ioapic_affinity_vector,
 };
 
-static void mask_msi_vector(unsigned int vector)
-{
-    mask_msi_irq(vector);
-}
-
-static void unmask_msi_vector(unsigned int vector)
-{
-    unmask_msi_irq(vector);
-}
-
 static unsigned int startup_msi_vector(unsigned int vector)
 {
-    dprintk(XENLOG_INFO, "startup msi vector %x\n", vector);
-    unmask_msi_irq(vector);
+    unmask_msi_vector(vector);
     return 0;
 }
 
 static void ack_msi_vector(unsigned int vector)
 {
-    ack_APIC_irq();
+    if ( msi_maskable_irq(irq_desc[vector].msi_desc) )
+        ack_APIC_irq(); /* ACKTYPE_NONE */
 }
 
 static void end_msi_vector(unsigned int vector)
 {
+    if ( !msi_maskable_irq(irq_desc[vector].msi_desc) )
+        ack_APIC_irq(); /* ACKTYPE_EOI */
 }
 
 static void shutdown_msi_vector(unsigned int vector)
 {
-    dprintk(XENLOG_INFO, "shutdown msi vector %x\n", vector);
-    mask_msi_irq(vector);
+    mask_msi_vector(vector);
 }
 
 static void set_msi_affinity_vector(unsigned int vector, cpumask_t cpu_mask)
 {
     set_native_irq_info(vector, cpu_mask);
-    set_msi_irq_affinity(vector, cpu_mask);
+    set_msi_affinity(vector, cpu_mask);
 }
 
 /*
@@ -1717,6 +1653,9 @@ static inline void check_timer(void)
 {
     int apic1, pin1, apic2, pin2;
     int vector;
+    unsigned long flags;
+
+    local_irq_save(flags);
 
     /*
      * get/set the timer IRQ vector:
@@ -1758,6 +1697,7 @@ static inline void check_timer(void)
          */
         unmask_IO_APIC_irq(0);
         if (timer_irq_works()) {
+            local_irq_restore(flags);
             if (disable_timer_pin_1 > 0)
                 clear_IO_APIC_pin(apic1, pin1);
             return;
@@ -1775,6 +1715,7 @@ static inline void check_timer(void)
          */
         setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
         if (timer_irq_works()) {
+            local_irq_restore(flags);
             printk("works.\n");
             if (pin1 != -1)
                 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
@@ -1802,6 +1743,7 @@ static inline void check_timer(void)
     enable_8259A_irq(0);
 
     if (timer_irq_works()) {
+        local_irq_restore(flags);
         printk(" works.\n");
         return;
     }
@@ -1817,6 +1759,8 @@ static inline void check_timer(void)
 
     unlock_ExtINT_logic();
 
+    local_irq_restore(flags);
+
     if (timer_irq_works()) {
         printk(" works.\n");
         return;
@@ -1835,6 +1779,20 @@ static inline void check_timer(void)
  */
 #define PIC_IRQS       (1 << PIC_CASCADE_IR)
 
+static struct IO_APIC_route_entry *ioapic_pm_state;
+
+void ioapic_pm_state_alloc(void)
+{
+    int i, nr_entry = 0;
+
+    for (i = 0; i < nr_ioapics; i++)
+        nr_entry += nr_ioapic_registers[i];
+
+    ioapic_pm_state = _xmalloc(sizeof(struct IO_APIC_route_entry)*nr_entry,
+                               sizeof(struct IO_APIC_route_entry));
+    BUG_ON(ioapic_pm_state == NULL);
+}
+
 void __init setup_IO_APIC(void)
 {
     enable_IO_APIC();
@@ -1857,40 +1815,16 @@ void __init setup_IO_APIC(void)
     init_IO_APIC_traps();
     check_timer();
     print_IO_APIC();
+    ioapic_pm_state_alloc();
 
     register_keyhandler('z', print_IO_APIC_keyhandler, "print ioapic info");
 }
 
-struct IO_APIC_route_entry *ioapic_pm_state=NULL;
-
-void ioapic_pm_state_alloc(void)
+void ioapic_suspend(void)
 {
-    int i, nr_entry = 0;
-
-    if (ioapic_pm_state != NULL)
-        return;
-
-    for (i = 0; i < nr_ioapics; i++)
-        nr_entry += nr_ioapic_registers[i];
-
-    ioapic_pm_state = _xmalloc(sizeof(struct IO_APIC_route_entry)*nr_entry,
-                               sizeof(struct IO_APIC_route_entry));
-}
-
-int ioapic_suspend(void)
-{
-    struct IO_APIC_route_entry *entry;
+    struct IO_APIC_route_entry *entry = ioapic_pm_state;
     unsigned long flags;
-    int apic,i;
-
-    ioapic_pm_state_alloc();
-
-    if (ioapic_pm_state == NULL) {
-        printk("Cannot suspend ioapic due to lack of memory\n");
-        return 1;
-    }
-
-    entry = ioapic_pm_state;
+    int apic, i;
 
     spin_lock_irqsave(&ioapic_lock, flags);
     for (apic = 0; apic < nr_ioapics; apic++) {
@@ -1900,23 +1834,14 @@ int ioapic_suspend(void)
         }
     }
     spin_unlock_irqrestore(&ioapic_lock, flags);
-
-    return 0;
 }
 
-int ioapic_resume(void)
+void ioapic_resume(void)
 {
-    struct IO_APIC_route_entry *entry;
+    struct IO_APIC_route_entry *entry = ioapic_pm_state;
     unsigned long flags;
     union IO_APIC_reg_00 reg_00;
-    int i,apic;
-    
-    if (ioapic_pm_state == NULL){
-        printk("Cannot resume ioapic due to lack of memory\n");
-        return 1;
-    }
-    
-    entry = ioapic_pm_state;
+    int i, apic;
 
     spin_lock_irqsave(&ioapic_lock, flags);
     for (apic = 0; apic < nr_ioapics; apic++){
@@ -1931,8 +1856,6 @@ int ioapic_resume(void)
         }
     }
     spin_unlock_irqrestore(&ioapic_lock, flags);
-
-    return 0;
 }
 
 /* --------------------------------------------------------------------------
@@ -2196,7 +2119,7 @@ int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 val)
     if ( new_rte.vector >= FIRST_DYNAMIC_VECTOR )
         new_irq = vector_irq[new_rte.vector];
 
-    if ( (old_irq != new_irq) && (old_irq != -1) && IO_APIC_IRQ(old_irq) )
+    if ( (old_irq != new_irq) && (old_irq >= 0) && IO_APIC_IRQ(old_irq) )
     {
         if ( irq_desc[IO_APIC_VECTOR(old_irq)].action )
         {
@@ -2208,7 +2131,7 @@ int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 val)
         remove_pin_at_irq(old_irq, apic, pin);
     }
 
-    if ( (new_irq != -1) && IO_APIC_IRQ(new_irq) )
+    if ( (new_irq >= 0) && IO_APIC_IRQ(new_irq) )
     {
         if ( irq_desc[IO_APIC_VECTOR(new_irq)].action )
         {
index 3ce7b240ef31164d8b6b610bd8ae530810c9635f..18baba02dbec6888c292930f35b8841097d15a66 100644 (file)
 #include <xen/iommu.h>
 #include <asm/msi.h>
 #include <asm/current.h>
+#include <asm/flushtlb.h>
 #include <public/physdev.h>
 
 /* opt_noirqbalance: If true, software IRQ balancing/affinity is disabled. */
 int opt_noirqbalance = 0;
 boolean_param("noirqbalance", opt_noirqbalance);
 
-irq_desc_t irq_desc[NR_IRQS];
+irq_desc_t irq_desc[NR_VECTORS];
+
+static DEFINE_SPINLOCK(vector_lock);
+int vector_irq[NR_VECTORS] __read_mostly = {
+    [0 ... NR_VECTORS - 1] = FREE_TO_ASSIGN_IRQ
+};
 
 static void __do_IRQ_guest(int vector);
 
@@ -53,6 +59,56 @@ struct hw_interrupt_type no_irq_type = {
 
 atomic_t irq_err_count;
 
+int free_irq_vector(int vector)
+{
+    int irq;
+
+    BUG_ON((vector > LAST_DYNAMIC_VECTOR) || (vector < FIRST_DYNAMIC_VECTOR));
+
+    spin_lock(&vector_lock);
+    if ((irq = vector_irq[vector]) == AUTO_ASSIGN_IRQ)
+        vector_irq[vector] = FREE_TO_ASSIGN_IRQ;
+    spin_unlock(&vector_lock);
+
+    return (irq == AUTO_ASSIGN_IRQ) ? 0 : -EINVAL;
+}
+
+int assign_irq_vector(int irq)
+{
+    static unsigned current_vector = FIRST_DYNAMIC_VECTOR;
+    unsigned vector;
+
+    BUG_ON(irq >= NR_IRQS);
+
+    spin_lock(&vector_lock);
+
+    if ((irq != AUTO_ASSIGN_IRQ) && (IO_APIC_VECTOR(irq) > 0)) {
+        spin_unlock(&vector_lock);
+        return IO_APIC_VECTOR(irq);
+    }
+
+    vector = current_vector;
+    while (vector_irq[vector] != FREE_TO_ASSIGN_IRQ) {
+        vector += 8;
+        if (vector > LAST_DYNAMIC_VECTOR)
+            vector = FIRST_DYNAMIC_VECTOR + ((vector + 1) & 7);
+
+        if (vector == current_vector) {
+            spin_unlock(&vector_lock);
+            return -ENOSPC;
+        }
+    }
+
+    current_vector = vector;
+    vector_irq[vector] = irq;
+    if (irq != AUTO_ASSIGN_IRQ)
+        IO_APIC_VECTOR(irq) = vector;
+
+    spin_unlock(&vector_lock);
+
+    return vector;
+}
+
 asmlinkage void do_IRQ(struct cpu_user_regs *regs)
 {
     unsigned int      vector = regs->entry_vector;
@@ -103,7 +159,7 @@ asmlinkage void do_IRQ(struct cpu_user_regs *regs)
     spin_unlock(&desc->lock);
 }
 
-int request_irq(unsigned int irq,
+int request_irq_vector(unsigned int vector,
         void (*handler)(int, void *, struct cpu_user_regs *),
         unsigned long irqflags, const char * devname, void *dev_id)
 {
@@ -116,7 +172,7 @@ int request_irq(unsigned int irq,
      * which interrupt is which (messes up the interrupt freeing
      * logic etc).
      */
-    if (irq >= NR_IRQS)
+    if (vector >= NR_VECTORS)
         return -EINVAL;
     if (!handler)
         return -EINVAL;
@@ -129,34 +185,32 @@ int request_irq(unsigned int irq,
     action->name = devname;
     action->dev_id = dev_id;
 
-    retval = setup_irq(irq, action);
+    retval = setup_irq_vector(vector, action);
     if (retval)
         xfree(action);
 
     return retval;
 }
 
-void free_irq(unsigned int irq)
+void release_irq_vector(unsigned int vector)
 {
-    unsigned int  vector = irq_to_vector(irq);
-    irq_desc_t   *desc = &irq_desc[vector];
+    irq_desc_t *desc = &irq_desc[vector];
     unsigned long flags;
 
     spin_lock_irqsave(&desc->lock,flags);
     desc->action  = NULL;
     desc->depth   = 1;
     desc->status |= IRQ_DISABLED;
-    desc->handler->shutdown(irq);
+    desc->handler->shutdown(vector);
     spin_unlock_irqrestore(&desc->lock,flags);
 
     /* Wait to make sure it's not being used on another CPU */
     do { smp_mb(); } while ( desc->status & IRQ_INPROGRESS );
 }
 
-int setup_irq(unsigned int irq, struct irqaction *new)
+int setup_irq_vector(unsigned int vector, struct irqaction *new)
 {
-    unsigned int  vector = irq_to_vector(irq);
-    irq_desc_t   *desc = &irq_desc[vector];
+    irq_desc_t *desc = &irq_desc[vector];
     unsigned long flags;
  
     spin_lock_irqsave(&desc->lock,flags);
@@ -206,16 +260,42 @@ struct pending_eoi {
 static DEFINE_PER_CPU(struct pending_eoi, pending_eoi[NR_VECTORS]);
 #define pending_eoi_sp(p) ((p)[NR_VECTORS-1].vector)
 
-static struct timer irq_guest_eoi_timer[NR_IRQS];
+static inline void set_pirq_eoi(struct domain *d, unsigned int irq)
+{
+    if ( d->arch.pirq_eoi_map )
+        set_bit(irq, d->arch.pirq_eoi_map);
+}
+
+static inline void clear_pirq_eoi(struct domain *d, unsigned int irq)
+{
+    if ( d->arch.pirq_eoi_map )
+        clear_bit(irq, d->arch.pirq_eoi_map);
+}
+
+static void _irq_guest_eoi(irq_desc_t *desc)
+{
+    irq_guest_action_t *action = (irq_guest_action_t *)desc->action;
+    unsigned int i, vector = desc - irq_desc;
+
+    if ( !(desc->status & IRQ_GUEST_EOI_PENDING) )
+        return;
+
+    for ( i = 0; i < action->nr_guests; ++i )
+        clear_pirq_eoi(action->guest[i],
+                       domain_vector_to_irq(action->guest[i], vector));
+
+    desc->status &= ~(IRQ_INPROGRESS|IRQ_GUEST_EOI_PENDING);
+    desc->handler->enable(vector);
+}
+
+static struct timer irq_guest_eoi_timer[NR_VECTORS];
 static void irq_guest_eoi_timer_fn(void *data)
 {
     irq_desc_t *desc = data;
-    unsigned vector = desc - irq_desc;
     unsigned long flags;
 
     spin_lock_irqsave(&desc->lock, flags);
-    desc->status &= ~IRQ_INPROGRESS;
-    desc->handler->enable(vector);
+    _irq_guest_eoi(desc);
     spin_unlock_irqrestore(&desc->lock, flags);
 }
 
@@ -272,8 +352,22 @@ static void __do_IRQ_guest(int vector)
 
     if ( already_pending == action->nr_guests )
     {
-        desc->handler->disable(vector);
         stop_timer(&irq_guest_eoi_timer[vector]);
+        desc->handler->disable(vector);
+        desc->status |= IRQ_GUEST_EOI_PENDING;
+        for ( i = 0; i < already_pending; ++i )
+        {
+            d = action->guest[i];
+            set_pirq_eoi(d, domain_vector_to_irq(d, vector));
+            /*
+             * Could check here whether the guest unmasked the event by now
+             * (or perhaps just re-issue the send_guest_pirq()), and if it
+             * can now accept the event,
+             * - clear all the pirq_eoi bits we already set,
+             * - re-enable the vector, and
+             * - skip the timer setup below.
+             */
+        }
         init_timer(&irq_guest_eoi_timer[vector],
                    irq_guest_eoi_timer_fn, desc, smp_processor_id());
         set_timer(&irq_guest_eoi_timer[vector], NOW() + MILLISECS(1));
@@ -310,7 +404,7 @@ irq_desc_t *domain_spin_lock_irq_desc(
 }
 
 /* Flush all ready EOIs from the top of this CPU's pending-EOI stack. */
-static void flush_ready_eoi(void *unused)
+static void flush_ready_eoi(void)
 {
     struct pending_eoi *peoi = this_cpu(pending_eoi);
     irq_desc_t         *desc;
@@ -364,7 +458,7 @@ static void set_eoi_ready(void *data)
     __set_eoi_ready(desc);
     spin_unlock(&desc->lock);
 
-    flush_ready_eoi(NULL);
+    flush_ready_eoi();
 }
 
 static void __pirq_guest_eoi(struct domain *d, int irq)
@@ -382,8 +476,12 @@ static void __pirq_guest_eoi(struct domain *d, int irq)
     action = (irq_guest_action_t *)desc->action;
     vector = desc - irq_desc;
 
-    ASSERT(!test_bit(irq, d->pirq_mask) ||
-           (action->ack_type != ACKTYPE_NONE));
+    if ( action->ack_type == ACKTYPE_NONE )
+    {
+        ASSERT(!test_bit(irq, d->pirq_mask));
+        stop_timer(&irq_guest_eoi_timer[vector]);
+        _irq_guest_eoi(desc);
+    }
 
     if ( unlikely(!test_and_clear_bit(irq, d->pirq_mask)) ||
          unlikely(--action->in_flight != 0) )
@@ -408,7 +506,7 @@ static void __pirq_guest_eoi(struct domain *d, int irq)
     {
         __set_eoi_ready(desc);
         spin_unlock(&desc->lock);
-        flush_ready_eoi(NULL);
+        flush_ready_eoi();
         local_irq_enable();
     }
     else
@@ -446,7 +544,7 @@ int pirq_guest_unmask(struct domain *d)
 }
 
 extern int ioapic_ack_new;
-int pirq_acktype(struct domain *d, int irq)
+static int pirq_acktype(struct domain *d, int irq)
 {
     irq_desc_t  *desc;
     unsigned int vector;
@@ -463,13 +561,18 @@ int pirq_acktype(struct domain *d, int irq)
     /*
      * Edge-triggered IO-APIC and LAPIC interrupts need no final
      * acknowledgement: we ACK early during interrupt processing.
-     * MSIs are treated as edge-triggered interrupts.
      */
     if ( !strcmp(desc->handler->typename, "IO-APIC-edge") ||
-         !strcmp(desc->handler->typename, "local-APIC-edge") ||
-         !strcmp(desc->handler->typename, "PCI-MSI") )
+         !strcmp(desc->handler->typename, "local-APIC-edge") )
         return ACKTYPE_NONE;
 
+    /*
+     * MSIs are treated as edge-triggered interrupts, except
+     * when there is no proper way to mask them.
+     */
+    if ( desc->handler == &pci_msi_type )
+        return msi_maskable_irq(desc->msi_desc) ? ACKTYPE_NONE : ACKTYPE_EOI;
+
     /*
      * Level-triggered IO-APIC interrupts need to be acknowledged on the CPU
      * on which they were received. This is because we tickle the LAPIC to EOI.
@@ -510,7 +613,7 @@ int pirq_guest_bind(struct vcpu *v, int irq, int will_share)
 {
     unsigned int        vector;
     irq_desc_t         *desc;
-    irq_guest_action_t *action;
+    irq_guest_action_t *action, *newaction = NULL;
     int                 rc = 0;
     cpumask_t           cpumask = CPU_MASK_NONE;
 
@@ -520,7 +623,10 @@ int pirq_guest_bind(struct vcpu *v, int irq, int will_share)
  retry:
     desc = domain_spin_lock_irq_desc(v->domain, irq, NULL);
     if ( desc == NULL )
-        return -EINVAL;
+    {
+        rc = -EINVAL;
+        goto out;
+    }
 
     action = (irq_guest_action_t *)desc->action;
     vector = desc - irq_desc;
@@ -533,19 +639,25 @@ int pirq_guest_bind(struct vcpu *v, int irq, int will_share)
                     "Cannot bind IRQ %d to guest. In use by '%s'.\n",
                     irq, desc->action->name);
             rc = -EBUSY;
-            goto out;
+            goto unlock_out;
         }
 
-        action = xmalloc(irq_guest_action_t);
-        if ( (desc->action = (struct irqaction *)action) == NULL )
+        if ( newaction == NULL )
         {
+            spin_unlock_irq(&desc->lock);
+            if ( (newaction = xmalloc(irq_guest_action_t)) != NULL )
+                goto retry;
             gdprintk(XENLOG_INFO,
-                    "Cannot bind IRQ %d to guest. Out of memory.\n",
-                    irq);
+                     "Cannot bind IRQ %d to guest. Out of memory.\n",
+                     irq);
             rc = -ENOMEM;
             goto out;
         }
 
+        action = newaction;
+        desc->action = (struct irqaction *)action;
+        newaction = NULL;
+
         action->nr_guests   = 0;
         action->in_flight   = 0;
         action->shareable   = will_share;
@@ -564,11 +676,13 @@ int pirq_guest_bind(struct vcpu *v, int irq, int will_share)
     }
     else if ( !will_share || !action->shareable )
     {
-        gdprintk(XENLOG_INFO, "Cannot bind IRQ %d to guest. "
-               "Will not share with others.\n",
-                irq);
+        gdprintk(XENLOG_INFO, "Cannot bind IRQ %d to guest. %s.\n",
+                 irq,
+                 will_share ?
+                 "Others do not share" :
+                 "Will not share with others");
         rc = -EBUSY;
-        goto out;
+        goto unlock_out;
     }
     else if ( action->nr_guests == 0 )
     {
@@ -588,17 +702,26 @@ int pirq_guest_bind(struct vcpu *v, int irq, int will_share)
         gdprintk(XENLOG_INFO, "Cannot bind IRQ %d to guest. "
                "Already at max share.\n", irq);
         rc = -EBUSY;
-        goto out;
+        goto unlock_out;
     }
 
     action->guest[action->nr_guests++] = v->domain;
 
- out:
+    if ( action->ack_type != ACKTYPE_NONE )
+        set_pirq_eoi(v->domain, irq);
+    else
+        clear_pirq_eoi(v->domain, irq);
+
+ unlock_out:
     spin_unlock_irq(&desc->lock);
+ out:
+    if ( newaction != NULL )
+        xfree(newaction);
     return rc;
 }
 
-static void __pirq_guest_unbind(struct domain *d, int irq, irq_desc_t *desc)
+static irq_guest_action_t *__pirq_guest_unbind(
+    struct domain *d, int irq, irq_desc_t *desc)
 {
     unsigned int        vector;
     irq_guest_action_t *action;
@@ -635,6 +758,10 @@ static void __pirq_guest_unbind(struct domain *d, int irq, irq_desc_t *desc)
             spin_lock_irq(&desc->lock);
         }
         break;
+    case ACKTYPE_NONE:
+        stop_timer(&irq_guest_eoi_timer[vector]);
+        _irq_guest_eoi(desc);
+        break;
     }
 
     /*
@@ -644,7 +771,7 @@ static void __pirq_guest_unbind(struct domain *d, int irq, irq_desc_t *desc)
     BUG_ON(test_bit(irq, d->pirq_mask));
 
     if ( action->nr_guests != 0 )
-        return;
+        return NULL;
 
     BUG_ON(action->in_flight != 0);
 
@@ -672,15 +799,18 @@ static void __pirq_guest_unbind(struct domain *d, int irq, irq_desc_t *desc)
     BUG_ON(!cpus_empty(action->cpu_eoi_map));
 
     desc->action = NULL;
-    xfree(action);
     desc->status &= ~IRQ_GUEST;
     desc->status &= ~IRQ_INPROGRESS;
     kill_timer(&irq_guest_eoi_timer[vector]);
     desc->handler->shutdown(vector);
+
+    /* Caller frees the old guest descriptor block. */
+    return action;
 }
 
 void pirq_guest_unbind(struct domain *d, int irq)
 {
+    irq_guest_action_t *oldaction = NULL;
     irq_desc_t *desc;
     int vector;
 
@@ -699,16 +829,19 @@ void pirq_guest_unbind(struct domain *d, int irq)
     }
     else
     {
-        __pirq_guest_unbind(d, irq, desc);
+        oldaction = __pirq_guest_unbind(d, irq, desc);
     }
 
     spin_unlock_irq(&desc->lock);
+
+    if ( oldaction != NULL )
+        xfree(oldaction);
 }
 
 int pirq_guest_force_unbind(struct domain *d, int irq)
 {
     irq_desc_t *desc;
-    irq_guest_action_t *action;
+    irq_guest_action_t *action, *oldaction = NULL;
     int i, bound = 0;
 
     WARN_ON(!spin_is_locked(&d->event_lock));
@@ -727,10 +860,14 @@ int pirq_guest_force_unbind(struct domain *d, int irq)
         goto out;
 
     bound = 1;
-    __pirq_guest_unbind(d, irq, desc);
+    oldaction = __pirq_guest_unbind(d, irq, desc);
 
  out:
     spin_unlock_irq(&desc->lock);
+
+    if ( oldaction != NULL )
+        xfree(oldaction);
+
     return bound;
 }
 
@@ -742,15 +879,15 @@ int get_free_pirq(struct domain *d, int type, int index)
 
     if ( type == MAP_PIRQ_TYPE_GSI )
     {
-        for ( i = 16; i < NR_PIRQS; i++ )
+        for ( i = 16; i < NR_IRQS; i++ )
             if ( !d->arch.pirq_vector[i] )
                 break;
-        if ( i == NR_PIRQS )
+        if ( i == NR_IRQS )
             return -ENOSPC;
     }
     else
     {
-        for ( i = NR_PIRQS - 1; i >= 16; i-- )
+        for ( i = NR_IRQS - 1; i >= 16; i-- )
             if ( !d->arch.pirq_vector[i] )
                 break;
         if ( i == 16 )
@@ -767,21 +904,24 @@ int map_domain_pirq(
     int old_vector, old_pirq;
     irq_desc_t *desc;
     unsigned long flags;
+    struct msi_desc *msi_desc;
+    struct pci_dev *pdev = NULL;
 
+    ASSERT(spin_is_locked(&pcidevs_lock));
     ASSERT(spin_is_locked(&d->event_lock));
 
     if ( !IS_PRIV(current->domain) )
         return -EPERM;
 
-    if ( pirq < 0 || pirq >= NR_PIRQS || vector < 0 || vector >= NR_VECTORS )
+    if ( pirq < 0 || pirq >= NR_IRQS || vector < 0 || vector >= NR_VECTORS )
     {
         dprintk(XENLOG_G_ERR, "dom%d: invalid pirq %d or vector %d\n",
                 d->domain_id, pirq, vector);
         return -EINVAL;
     }
 
-    old_vector = d->arch.pirq_vector[pirq];
-    old_pirq = d->arch.vector_pirq[vector];
+    old_vector = domain_irq_to_vector(d, pirq);
+    old_pirq = domain_vector_to_irq(d, vector);
 
     if ( (old_vector && (old_vector != vector) ) ||
          (old_pirq && (old_pirq != pirq)) )
@@ -800,25 +940,39 @@ int map_domain_pirq(
     }
 
     desc = &irq_desc[vector];
-    spin_lock_irqsave(&desc->lock, flags);
 
     if ( type == MAP_PIRQ_TYPE_MSI )
     {
         struct msi_info *msi = (struct msi_info *)data;
+
+        ret = -ENODEV;
+        if ( !cpu_has_apic )
+            goto done;
+
+        pdev = pci_get_pdev(msi->bus, msi->devfn);
+        ret = pci_enable_msi(msi, &msi_desc);
+        if ( ret )
+            goto done;
+
+        spin_lock_irqsave(&desc->lock, flags);
+
         if ( desc->handler != &no_irq_type )
             dprintk(XENLOG_G_ERR, "dom%d: vector %d in use\n",
-                    d->domain_id, vector);
+              d->domain_id, vector);
         desc->handler = &pci_msi_type;
-        ret = pci_enable_msi(msi);
-        if ( ret )
-            goto done;
+        d->arch.pirq_vector[pirq] = vector;
+        d->arch.vector_pirq[vector] = pirq;
+        setup_msi_irq(pdev, msi_desc);
+        spin_unlock_irqrestore(&desc->lock, flags);
+    } else
+    {
+        spin_lock_irqsave(&desc->lock, flags);
+        d->arch.pirq_vector[pirq] = vector;
+        d->arch.vector_pirq[vector] = pirq;
+        spin_unlock_irqrestore(&desc->lock, flags);
     }
 
-    d->arch.pirq_vector[pirq] = vector;
-    d->arch.vector_pirq[vector] = pirq;
-
-done:
-    spin_unlock_irqrestore(&desc->lock, flags);
+ done:
     return ret;
 }
 
@@ -829,16 +983,18 @@ int unmap_domain_pirq(struct domain *d, int pirq)
     irq_desc_t *desc;
     int vector, ret = 0;
     bool_t forced_unbind;
+    struct msi_desc *msi_desc = NULL;
 
-    if ( (pirq < 0) || (pirq >= NR_PIRQS) )
+    if ( (pirq < 0) || (pirq >= NR_IRQS) )
         return -EINVAL;
 
     if ( !IS_PRIV(current->domain) )
         return -EINVAL;
 
+    ASSERT(spin_is_locked(&pcidevs_lock));
     ASSERT(spin_is_locked(&d->event_lock));
 
-    vector = d->arch.pirq_vector[pirq];
+    vector = domain_irq_to_vector(d, pirq);
     if ( vector <= 0 )
     {
         dprintk(XENLOG_G_ERR, "dom%d: pirq %d not mapped\n",
@@ -853,18 +1009,19 @@ int unmap_domain_pirq(struct domain *d, int pirq)
                 d->domain_id, pirq);
 
     desc = &irq_desc[vector];
+
+    if ( (msi_desc = desc->msi_desc) != NULL )
+        pci_disable_msi(msi_desc);
+
     spin_lock_irqsave(&desc->lock, flags);
 
-    BUG_ON(vector != d->arch.pirq_vector[pirq]);
+    BUG_ON(vector != domain_irq_to_vector(d, pirq));
 
-    if ( desc->msi_desc )
-        pci_disable_msi(vector);
+    if ( msi_desc )
+        teardown_msi_vector(vector);
 
     if ( desc->handler == &pci_msi_type )
-    {
         desc->handler = &no_irq_type;
-        free_irq_vector(vector);
-    }
 
     if ( !forced_unbind )
     {
@@ -878,6 +1035,11 @@ int unmap_domain_pirq(struct domain *d, int pirq)
     }
 
     spin_unlock_irqrestore(&desc->lock, flags);
+    if (msi_desc)
+    {
+        msi_free_vector(msi_desc);
+        free_irq_vector(vector);
+    }
 
     ret = irq_deny_access(d, pirq);
     if ( ret )
@@ -892,13 +1054,15 @@ void free_domain_pirqs(struct domain *d)
 {
     int i;
 
+    spin_lock(&pcidevs_lock);
     spin_lock(&d->event_lock);
 
-    for ( i = 0; i < NR_PIRQS; i++ )
+    for ( i = 0; i < NR_IRQS; i++ )
         if ( d->arch.pirq_vector[i] > 0 )
             unmap_domain_pirq(d, i);
 
     spin_unlock(&d->event_lock);
+    spin_unlock(&pcidevs_lock);
 }
 
 extern void dump_ioapic_irq_info(void);
@@ -940,7 +1104,8 @@ static void dump_irqs(unsigned char key)
                        (test_bit(d->pirq_to_evtchn[irq],
                                  &shared_info(d, evtchn_pending)) ?
                         'P' : '-'),
-                       (test_bit(d->pirq_to_evtchn[irq]/BITS_PER_GUEST_LONG(d),
+                       (test_bit(d->pirq_to_evtchn[irq] /
+                                 BITS_PER_EVTCHN_WORD(d),
                                  &vcpu_info(d->vcpu[0], evtchn_pending_sel)) ?
                         'S' : '-'),
                        (test_bit(d->pirq_to_evtchn[irq],
@@ -974,28 +1139,38 @@ __initcall(setup_dump_irqs);
 
 void fixup_irqs(cpumask_t map)
 {
-    unsigned int irq, sp;
+    unsigned int vector, sp;
     static int warned;
     irq_guest_action_t *action;
     struct pending_eoi *peoi;
+    irq_desc_t         *desc;
+    unsigned long       flags;
 
     /* Direct all future interrupts away from this CPU. */
-    for ( irq = 0; irq < NR_IRQS; irq++ )
+    for ( vector = 0; vector < NR_VECTORS; vector++ )
     {
         cpumask_t mask;
-        if ( irq == 2 )
+        if ( vector_to_irq(vector) == 2 )
             continue;
 
-        cpus_and(mask, irq_desc[irq].affinity, map);
+        desc = &irq_desc[vector];
+
+        spin_lock_irqsave(&desc->lock, flags);
+
+        cpus_and(mask, desc->affinity, map);
         if ( any_online_cpu(mask) == NR_CPUS )
         {
-            printk("Breaking affinity for irq %i\n", irq);
+            printk("Breaking affinity for vector %u (irq %i)\n",
+                   vector, vector_to_irq(vector));
             mask = map;
         }
-        if ( irq_desc[irq].handler->set_affinity )
-            irq_desc[irq].handler->set_affinity(irq, mask);
-        else if ( irq_desc[irq].action && !(warned++) )
-            printk("Cannot set affinity for irq %i\n", irq);
+        if ( desc->handler->set_affinity )
+            desc->handler->set_affinity(vector, mask);
+        else if ( desc->action && !(warned++) )
+            printk("Cannot set affinity for vector %u (irq %i)\n",
+                   vector, vector_to_irq(vector));
+
+        spin_unlock_irqrestore(&desc->lock, flags);
     }
 
     /* Service any interrupts that beat us in the re-direction race. */
@@ -1004,11 +1179,11 @@ void fixup_irqs(cpumask_t map)
     local_irq_disable();
 
     /* Clean up cpu_eoi_map of every interrupt to exclude this CPU. */
-    for ( irq = 0; irq < NR_IRQS; irq++ )
+    for ( vector = 0; vector < NR_VECTORS; vector++ )
     {
-        if ( !(irq_desc[irq].status & IRQ_GUEST) )
+        if ( !(irq_desc[vector].status & IRQ_GUEST) )
             continue;
-        action = (irq_guest_action_t *)irq_desc[irq].action;
+        action = (irq_guest_action_t *)irq_desc[vector].action;
         cpu_clear(smp_processor_id(), action->cpu_eoi_map);
     }
 
@@ -1016,6 +1191,6 @@ void fixup_irqs(cpumask_t map)
     peoi = this_cpu(pending_eoi);
     for ( sp = 0; sp < pending_eoi_sp(peoi); sp++ )
         peoi[sp].ready = 1;
-    flush_ready_eoi(NULL);
+    flush_ready_eoi();
 }
 #endif
index d98eb77fa7c6d286bfbd6671bf79938ce12926f6..4d15e4b0639474e2d1143abf3a7fd506e8dd3b85 100644 (file)
@@ -150,6 +150,9 @@ void arch_crash_save_vmcoreinfo(void)
        VMCOREINFO_SYMBOL(dom_xen);
        VMCOREINFO_SYMBOL(dom_io);
 
+#ifdef CONFIG_X86_32
+    VMCOREINFO_SYMBOL(xenheap_phys_end);
+#endif
 #ifdef CONFIG_X86_PAE
        VMCOREINFO_SYMBOL_ALIAS(pgd_l3, idle_pg_table);
 #endif
index 3b7d6a1356d68cf16fe3cad5d1b4d9f6eef1fa08..ebad6ef11d694fed6717b79e5cf8793bb0099867 100644 (file)
@@ -49,31 +49,22 @@ struct microcode_info {
     char buffer[1];
 };
 
-static void microcode_fini_cpu(int cpu)
+static void __microcode_fini_cpu(int cpu)
 {
     struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
-    spin_lock(&microcode_mutex);
-    xfree(uci->mc.valid_mc);
-    uci->mc.valid_mc = NULL;
-    uci->valid = 0;
-    spin_unlock(&microcode_mutex);
+    xfree(uci->mc.mc_valid);
+    memset(uci, 0, sizeof(*uci));
 }
 
-static int collect_cpu_info(int cpu)
+static void microcode_fini_cpu(int cpu)
 {
-    int err = 0;
-    struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-    memset(uci, 0, sizeof(*uci));
-    err = microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig);
-    if ( !err )
-        uci->valid = 1;
-
-    return err;
+    spin_lock(&microcode_mutex);
+    __microcode_fini_cpu(cpu);
+    spin_unlock(&microcode_mutex);
 }
 
-static int microcode_resume_cpu(int cpu)
+int microcode_resume_cpu(int cpu)
 {
     int err = 0;
     struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -81,7 +72,7 @@ static int microcode_resume_cpu(int cpu)
 
     gdprintk(XENLOG_INFO, "microcode: CPU%d resumed\n", cpu);
 
-    if ( !uci->mc.valid_mc )
+    if ( !uci->mc.mc_valid )
         return -EIO;
 
     /*
@@ -95,16 +86,15 @@ static int microcode_resume_cpu(int cpu)
         return err;
     }
 
-    if ( memcmp(&nsig, &uci->cpu_sig, sizeof(nsig)) )
+    if ( microcode_ops->microcode_resume_match(cpu, &nsig) )
+    {
+        return microcode_ops->apply_microcode(cpu);
+    }
+    else
     {
         microcode_fini_cpu(cpu);
-        /* Should we look for a new ucode here? */
         return -EIO;
     }
-
-    err = microcode_ops->apply_microcode(cpu);
-
-    return err;
 }
 
 static int microcode_update_cpu(const void *buf, size_t size)
@@ -115,20 +105,11 @@ static int microcode_update_cpu(const void *buf, size_t size)
 
     spin_lock(&microcode_mutex);
 
-    /*
-     * Check if the system resume is in progress (uci->valid != NULL),
-     * otherwise just request a firmware:
-     */
-    if ( uci->valid )
-    {
-        err = microcode_resume_cpu(cpu);
-    }
+    err = microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig);
+    if ( likely(!err) )
+        err = microcode_ops->cpu_request_microcode(cpu, buf, size);
     else
-    {
-        err = collect_cpu_info(cpu);
-        if ( !err && uci->valid )
-            err = microcode_ops->cpu_request_microcode(cpu, buf, size);
-    }
+        __microcode_fini_cpu(cpu);
 
     spin_unlock(&microcode_mutex);
 
@@ -153,7 +134,6 @@ static long do_microcode_update(void *_info)
     error = info->error;
     xfree(info);
     return error;
-
 }
 
 int microcode_update(XEN_GUEST_HANDLE(const_void) buf, unsigned long len)
index ca72f195af0d88686163d4b6a130ccfbf3ba579c..77053c258df53aebf5ffebd258c0516dc75cf0dd 100644 (file)
 #define MC_HEADER_SIZE          (sizeof(struct microcode_header_amd))
 #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
 #define DWSIZE                  (sizeof(uint32_t))
-/* For now we support a fixed ucode total size only */
-#define get_totalsize(mc) \
-        ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
-         + MC_HEADER_SIZE)
 
 /* serialize access to the physical write */
 static DEFINE_SPINLOCK(microcode_update_lock);
 
 struct equiv_cpu_entry *equiv_cpu_table;
 
-static long install_equiv_cpu_table(const void *, uint32_t, long);
-
 static int collect_cpu_info(int cpu, struct cpu_signature *csig)
 {
     struct cpuinfo_x86 *c = &cpu_data[cpu];
+    uint32_t dummy;
 
     memset(csig, 0, sizeof(*csig));
 
@@ -60,13 +55,10 @@ static int collect_cpu_info(int cpu, struct cpu_signature *csig)
     {
         printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n",
                cpu);
-        return -1;
+        return -EINVAL;
     }
 
-    asm volatile (
-        "movl %1, %%ecx; rdmsr"
-        : "=a" (csig->rev)
-        : "i" (MSR_AMD_PATCHLEVEL) : "ecx" );
+    rdmsr(MSR_AMD_PATCHLEVEL, csig->rev, dummy);
 
     printk(KERN_INFO "microcode: collect_cpu_info: patch_id=0x%x\n",
            csig->rev);
@@ -74,29 +66,17 @@ static int collect_cpu_info(int cpu, struct cpu_signature *csig)
     return 0;
 }
 
-static int get_matching_microcode(void *mc, int cpu)
+static int microcode_fits(void *mc, int cpu)
 {
     struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
     struct microcode_header_amd *mc_header = mc;
-    unsigned long total_size = get_totalsize(mc_header);
-    void *new_mc;
     unsigned int current_cpu_id;
-    unsigned int equiv_cpu_id = 0x00;
+    unsigned int equiv_cpu_id = 0x0;
     unsigned int i;
 
     /* We should bind the task to the CPU */
     BUG_ON(cpu != raw_smp_processor_id());
 
-    /* This is a tricky part. We might be called from a write operation
-     * to the device file instead of the usual process of firmware
-     * loading. This routine needs to be able to distinguish both
-     * cases. This is done by checking if there already is a equivalent
-     * CPU table installed. If not, we're written through
-     * /dev/cpu/microcode.
-     * Since we ignore all checks. The error case in which going through
-     * firmware loading and that table is not loaded has already been
-     * checked earlier.
-     */
     if ( equiv_cpu_table == NULL )
     {
         printk(KERN_INFO "microcode: CPU%d microcode update with "
@@ -111,7 +91,7 @@ static int get_matching_microcode(void *mc, int cpu)
     {
         if ( current_cpu_id == equiv_cpu_table[i].installed_cpu )
         {
-            equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
+            equiv_cpu_id = equiv_cpu_table[i].equiv_cpu & 0xffff;
             break;
         }
     }
@@ -119,171 +99,136 @@ static int get_matching_microcode(void *mc, int cpu)
     if ( !equiv_cpu_id )
     {
         printk(KERN_ERR "microcode: CPU%d cpu_id "
-               "not found in equivalent cpu table \n", cpu);
-        return 0;
-    }
-
-    if ( (mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff) )
-    {
-        printk(KERN_INFO
-               "microcode: CPU%d patch does not match "
-               "(patch is %x, cpu extended is %x) \n",
-               cpu, mc_header->processor_rev_id[0],
-               (equiv_cpu_id & 0xff));
-        return 0;
+               "not found in equivalent cpu table\n", cpu);
+        return -EINVAL;
     }
 
-    if ( (mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff) )
+    if ( (mc_header->processor_rev_id) != equiv_cpu_id )
     {
         printk(KERN_INFO "microcode: CPU%d patch does not match "
                "(patch is %x, cpu base id is %x) \n",
-               cpu, mc_header->processor_rev_id[1],
-               ((equiv_cpu_id >> 16) & 0xff));
-        return 0;
+               cpu, mc_header->processor_rev_id, equiv_cpu_id);
+        return -EINVAL;
     }
 
     if ( mc_header->patch_id <= uci->cpu_sig.rev )
-        return 0;
+        return -EINVAL;
 
     printk(KERN_INFO "microcode: CPU%d found a matching microcode "
            "update with version 0x%x (current=0x%x)\n",
            cpu, mc_header->patch_id, uci->cpu_sig.rev);
 
- out:
-    new_mc = xmalloc_bytes(UCODE_MAX_SIZE);
-    if ( new_mc == NULL )
-    {
-        printk(KERN_ERR "microcode: error, can't allocate memory\n");
-        return -ENOMEM;
-    }
-    memset(new_mc, 0, UCODE_MAX_SIZE);
-
-    /* free previous update file */
-    xfree(uci->mc.mc_amd);
-
-    memcpy(new_mc, mc, total_size);
-
-    uci->mc.mc_amd = new_mc;
-    return 1;
+out:
+    return 0;
 }
 
 static int apply_microcode(int cpu)
 {
     unsigned long flags;
-    uint32_t eax, edx, rev;
-    int cpu_num = raw_smp_processor_id();
-    struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-    uint64_t addr;
+    struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+    uint32_t rev, dummy;
+    struct microcode_amd *mc_amd = uci->mc.mc_amd;
 
     /* We should bind the task to the CPU */
-    BUG_ON(cpu_num != cpu);
+    BUG_ON(raw_smp_processor_id() != cpu);
 
-    if ( uci->mc.mc_amd == NULL )
+    if ( mc_amd == NULL )
         return -EINVAL;
 
     spin_lock_irqsave(&microcode_update_lock, flags);
 
-    addr = (unsigned long)&uci->mc.mc_amd->hdr.data_code;
-    edx = (uint32_t)(addr >> 32);
-    eax = (uint32_t)addr;
-
-    asm volatile (
-        "movl %0, %%ecx; wrmsr" :
-        : "i" (MSR_AMD_PATCHLOADER), "a" (eax), "d" (edx) : "ecx" );
+    wrmsrl(MSR_AMD_PATCHLOADER, (unsigned long)&mc_amd->hdr.data_code);
 
     /* get patch id after patching */
-    asm volatile (
-        "movl %1, %%ecx; rdmsr"
-        : "=a" (rev)
-        : "i" (MSR_AMD_PATCHLEVEL) : "ecx");
+    rdmsr(MSR_AMD_PATCHLEVEL, rev, dummy);
 
     spin_unlock_irqrestore(&microcode_update_lock, flags);
 
     /* check current patch id and patch's id for match */
-    if ( rev != uci->mc.mc_amd->hdr.patch_id )
+    if ( rev != mc_amd->hdr.patch_id )
     {
         printk(KERN_ERR "microcode: CPU%d update from revision "
-               "0x%x to 0x%x failed\n", cpu_num,
-               uci->mc.mc_amd->hdr.patch_id, rev);
+               "0x%x to 0x%x failed\n", cpu,
+               mc_amd->hdr.patch_id, rev);
         return -EIO;
     }
 
     printk("microcode: CPU%d updated from revision "
            "0x%x to 0x%x \n",
-           cpu_num, uci->cpu_sig.rev, uci->mc.mc_amd->hdr.patch_id);
+           cpu, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
 
     uci->cpu_sig.rev = rev;
 
     return 0;
 }
 
-static long get_next_ucode_from_buffer_amd(void **mc, const void *buf,
-                                           unsigned long size, long offset)
+static int get_next_ucode_from_buffer_amd(void *mc, const void *buf,
+                                         size_t size, unsigned long *offset)
 {
     struct microcode_header_amd *mc_header;
-    unsigned long total_size;
-    const uint8_t *buf_pos = buf;
+    size_t total_size;
+    const uint8_t *bufp = buf;
+    unsigned long off;
+
+    off = *offset;
 
     /* No more data */
-    if ( offset >= size )
-        return 0;
+    if ( off >= size )
+        return 1;
 
-    if ( buf_pos[offset] != UCODE_UCODE_TYPE )
+    if ( bufp[off] != UCODE_UCODE_TYPE )
     {
         printk(KERN_ERR "microcode: error! "
                "Wrong microcode payload type field\n");
         return -EINVAL;
     }
 
-    mc_header = (struct microcode_header_amd *)(&buf_pos[offset+8]);
+    mc_header = (struct microcode_header_amd *)(&bufp[off+8]);
 
-    total_size = (unsigned long) (buf_pos[offset+4] +
-                                  (buf_pos[offset+5] << 8));
+    total_size = (unsigned long) (bufp[off+4] + (bufp[off+5] << 8));
 
     printk(KERN_INFO "microcode: size %lu, total_size %lu, offset %ld\n",
-           size, total_size, offset);
+           (unsigned long)size, total_size, off);
 
-    if ( (offset + total_size) > size )
+    if ( (off + total_size) > size )
     {
         printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
         return -EINVAL;
     }
 
-    *mc = xmalloc_bytes(UCODE_MAX_SIZE);
-    if ( *mc == NULL )
-    {
-        printk(KERN_ERR "microcode: error! "
-               "Can not allocate memory for microcode patch\n");
-        return -ENOMEM;
-    }
+    memset(mc, 0, UCODE_MAX_SIZE);
+    memcpy(mc, (const void *)(&bufp[off + 8]), total_size);
 
-    memset(*mc, 0, UCODE_MAX_SIZE);
-    memcpy(*mc, (const void *)(buf + offset + 8), total_size);
+    *offset = off + total_size + 8;
 
-    return offset + total_size + 8;
+    return 0;
 }
 
-static long install_equiv_cpu_table(const void *buf,
-                                    uint32_t size, long offset)
+static int install_equiv_cpu_table(const void *buf, uint32_t size,
+                                   unsigned long *offset)
 {
     const uint32_t *buf_pos = buf;
+    unsigned long off;
+
+    off = *offset;
+    *offset = 0;
 
     /* No more data */
-    if ( offset >= size )
-        return 0;
+    if ( off >= size )
+        return -EINVAL;
 
     if ( buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE )
     {
         printk(KERN_ERR "microcode: error! "
-               "Wrong microcode equivalnet cpu table type field\n");
-        return 0;
+               "Wrong microcode equivalent cpu table type field\n");
+        return -EINVAL;
     }
 
     if ( size == 0 )
     {
         printk(KERN_ERR "microcode: error! "
                "Wrong microcode equivalnet cpu table length\n");
-        return 0;
+        return -EINVAL;
     }
 
     equiv_cpu_table = xmalloc_bytes(size);
@@ -291,20 +236,24 @@ static long install_equiv_cpu_table(const void *buf,
     {
         printk(KERN_ERR "microcode: error, can't allocate "
                "memory for equiv CPU table\n");
-        return 0;
+        return -ENOMEM;
     }
 
     memset(equiv_cpu_table, 0, size);
     memcpy(equiv_cpu_table, (const void *)&buf_pos[3], size);
 
-    return size + 12; /* add header length */
+    *offset = size + 12;       /* add header length */
+
+    return 0;
 }
 
 static int cpu_request_microcode(int cpu, const void *buf, size_t size)
 {
     const uint32_t *buf_pos;
-    long offset = 0;
+    unsigned long offset = 0;
     int error = 0;
+    int ret;
+    struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
     void *mc;
 
     /* We should bind the task to the CPU */
@@ -319,41 +268,63 @@ static int cpu_request_microcode(int cpu, const void *buf, size_t size)
         return -EINVAL;
     }
 
-    offset = install_equiv_cpu_table(buf, (uint32_t)(buf_pos[2]), offset);
-    if ( !offset )
+    error = install_equiv_cpu_table(buf, (uint32_t)(buf_pos[2]), &offset);
+    if ( error )
     {
         printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
         return -EINVAL;
     }
 
-    while ( (offset =
-             get_next_ucode_from_buffer_amd(&mc, buf, size, offset)) > 0 )
+    mc = xmalloc_bytes(UCODE_MAX_SIZE);
+    if ( mc == NULL )
     {
-        error = get_matching_microcode(mc, cpu);
-        if ( error < 0 )
-            break;
-        /*
-         * It's possible the data file has multiple matching ucode,
-         * lets keep searching till the latest version
-         */
-        if ( error == 1 )
-            error = apply_microcode(cpu);
-        xfree(mc);
+        printk(KERN_ERR "microcode: error! "
+               "Can not allocate memory for microcode patch\n");
+        error = -ENOMEM;
+        goto out;
     }
-    if ( offset > 0 )
+
+    /* implicitely validates uci->mc.mc_valid */
+    uci->mc.mc_amd = mc;
+
+    /*
+     * It's possible the data file has multiple matching ucode,
+     * lets keep searching till the latest version
+     */
+    while ( (ret = get_next_ucode_from_buffer_amd(mc, buf, size, &offset)) == 0)
     {
+        error = microcode_fits(mc, cpu);
+        if (error != 0)
+            continue;
+
+        error = apply_microcode(cpu);
+        if (error == 0)
+            break;
+    }
+
+    /* On success keep the microcode patch for
+     * re-apply on resume.
+     */
+    if (error) {
         xfree(mc);
-        xfree(equiv_cpu_table);
-        equiv_cpu_table = NULL;
+        mc = NULL;
     }
-    if ( offset < 0 )
-        error = offset;
+    uci->mc.mc_amd = mc;
+
+out:
+    xfree(equiv_cpu_table);
+    equiv_cpu_table = NULL;
 
     return error;
 }
 
+static int microcode_resume_match(int cpu, struct cpu_signature *nsig)
+{
+    return 0;
+}
+
 static struct microcode_ops microcode_amd_ops = {
-    .get_matching_microcode           = get_matching_microcode,
+    .microcode_resume_match           = microcode_resume_match,
     .cpu_request_microcode            = cpu_request_microcode,
     .collect_cpu_info                 = collect_cpu_info,
     .apply_microcode                  = apply_microcode,
index 92cc62bccbc53dda2181f55a5dd3527eed6363a1..7d8657ff2ffa10e35c8ebd7035a16a55553ff889 100644 (file)
@@ -64,6 +64,8 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
     struct cpuinfo_x86 *c = &cpu_data[cpu_num];
     unsigned int val[2];
 
+    BUG_ON(cpu_num != smp_processor_id());
+
     memset(csig, 0, sizeof(*csig));
 
     if ( (c->x86_vendor != X86_VENDOR_INTEL) || (c->x86 < 6) ||
@@ -323,6 +325,7 @@ static int cpu_request_microcode(int cpu, const void *buf, size_t size)
     long offset = 0;
     int error = 0;
     void *mc;
+    unsigned int matching_count = 0;
 
     /* We should bind the task to the CPU */
     BUG_ON(cpu != raw_smp_processor_id());
@@ -341,7 +344,7 @@ static int cpu_request_microcode(int cpu, const void *buf, size_t size)
          */
         if ( error == 1 )
         {
-            apply_microcode(cpu);
+            matching_count++;
             error = 0;
         }
         xfree(mc);
@@ -351,11 +354,22 @@ static int cpu_request_microcode(int cpu, const void *buf, size_t size)
     if ( offset < 0 )
         error = offset;
 
+    if ( !error && matching_count )
+        apply_microcode(cpu);
+
     return error;
 }
 
+static int microcode_resume_match(int cpu, struct cpu_signature *nsig)
+{
+    struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+    return (sigmatch(nsig->sig, uci->cpu_sig.sig, nsig->pf, uci->cpu_sig.pf) &&
+            (uci->cpu_sig.rev > nsig->rev));
+}
+
 static struct microcode_ops microcode_intel_ops = {
-    .get_matching_microcode           = get_matching_microcode,
+    .microcode_resume_match           = microcode_resume_match,
     .cpu_request_microcode            = cpu_request_microcode,
     .collect_cpu_info                 = collect_cpu_info,
     .apply_microcode                  = apply_microcode,
index 4c8b40a9011c9c751cd6f517c0f981f94d2d7fd9..d11c2a1ce4a8a0158507022e9be45fde7db60dba 100644 (file)
@@ -160,6 +160,9 @@ unsigned long total_pages;
 
 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
 
+int opt_allow_hugepage;
+boolean_param("allowhugepage", opt_allow_hugepage);
+
 #define l1_disallow_mask(d)                                     \
     ((d != dom_io) &&                                           \
      (rangeset_is_empty((d)->iomem_caps) &&                     \
@@ -176,12 +179,6 @@ l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
 #define l3_disallow_mask(d) L3_DISALLOW_MASK
 #endif
 
-static void queue_deferred_ops(struct domain *d, unsigned int ops)
-{
-    ASSERT(d == current->domain);
-    this_cpu(percpu_mm_info).deferred_ops |= ops;
-}
-
 void __init init_frametable(void)
 {
     unsigned long nr_pages, page_step, i, mfn;
@@ -202,11 +199,6 @@ void __init init_frametable(void)
     }
 
     memset(frame_table, 0, nr_pages << PAGE_SHIFT);
-
-#if defined(__x86_64__)
-    for ( i = 0; i < max_page; i ++ )
-        spin_lock_init(&frame_table[i].lock);
-#endif
 }
 
 void __init arch_init_memory(void)
@@ -287,15 +279,40 @@ void __init arch_init_memory(void)
     subarch_init_memory();
 }
 
-int memory_is_conventional_ram(paddr_t p)
+int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
 {
+    uint64_t maddr = pfn_to_paddr(mfn);
     int i;
 
     for ( i = 0; i < e820.nr_map; i++ )
     {
-        if ( (e820.map[i].type == E820_RAM) &&
-             (e820.map[i].addr <= p) &&
-             (e820.map[i].size > p) )
+        switch ( e820.map[i].type )
+        {
+        case E820_RAM:
+            if ( mem_type & RAM_TYPE_CONVENTIONAL )
+                break;
+            continue;
+        case E820_RESERVED:
+            if ( mem_type & RAM_TYPE_RESERVED )
+                break;
+            continue;
+        case E820_UNUSABLE:
+            if ( mem_type & RAM_TYPE_UNUSABLE )
+                break;
+            continue;
+        case E820_ACPI:
+        case E820_NVS:
+            if ( mem_type & RAM_TYPE_ACPI )
+                break;
+            continue;
+        default:
+            /* unknown */
+            continue;
+        }
+        
+        /* Test the range. */
+        if ( (e820.map[i].addr <= maddr) &&
+             ((e820.map[i].addr + e820.map[i].size) >= (maddr + PAGE_SIZE)) )
             return 1;
     }
 
@@ -326,7 +343,7 @@ void share_xen_page_with_guest(
 
     page_set_owner(page, d);
     wmb(); /* install valid domain ptr before updating refcnt. */
-    ASSERT(page->count_info == 0);
+    ASSERT((page->count_info & ~PGC_xen_heap) == 0);
 
     /* Only add to the allocation list if the domain isn't dying. */
     if ( !d->is_dying )
@@ -334,7 +351,7 @@ void share_xen_page_with_guest(
         page->count_info |= PGC_allocated | 1;
         if ( unlikely(d->xenheap_pages++ == 0) )
             get_knownalive_domain(d);
-        list_add_tail(&page->list, &d->xenpage_list);
+        page_list_add_tail(page, &d->xenpage_list);
     }
 
     spin_unlock(&d->page_alloc_lock);
@@ -354,14 +371,14 @@ void share_xen_page_with_privileged_guests(
 #else
 /*
  * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
- * We cannot safely shadow the idle page table, nor shadow (v1) page tables
- * (detected by lack of an owning domain). As required for correctness, we
+ * We cannot safely shadow the idle page table, nor shadow page tables
+ * (detected by zero reference count). As required for correctness, we
  * always shadow PDPTs above 4GB.
  */
-#define l3tab_needs_shadow(mfn)                         \
-    (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
-      (page_get_owner(mfn_to_page(mfn)) != NULL) &&     \
-      ((mfn) & 1)) || /* odd MFNs are shadowed */       \
+#define l3tab_needs_shadow(mfn)                          \
+    (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) &&  \
+      (mfn_to_page(mfn)->count_info & PGC_count_mask) && \
+      ((mfn) & 1)) || /* odd MFNs are shadowed */        \
      ((mfn) >= 0x100000))
 #endif
 
@@ -465,14 +482,18 @@ void update_cr3(struct vcpu *v)
 }
 
 
-static void invalidate_shadow_ldt(struct vcpu *v)
+static void invalidate_shadow_ldt(struct vcpu *v, int flush)
 {
     int i;
     unsigned long pfn;
     struct page_info *page;
-    
+
+    BUG_ON(unlikely(in_irq()));
+
+    spin_lock(&v->arch.shadow_ldt_lock);
+
     if ( v->arch.shadow_ldt_mapcnt == 0 )
-        return;
+        goto out;
 
     v->arch.shadow_ldt_mapcnt = 0;
 
@@ -487,11 +508,12 @@ static void invalidate_shadow_ldt(struct vcpu *v)
         put_page_and_type(page);
     }
 
-    /* Dispose of the (now possibly invalid) mappings from the TLB.  */
-    if ( v == current )
-        queue_deferred_ops(v->domain, DOP_FLUSH_TLB | DOP_RELOAD_LDT);
-    else
-        flush_tlb_mask(v->domain->domain_dirty_cpumask);
+    /* Rid TLBs of stale mappings (guest mappings and shadow mappings). */
+    if ( flush )
+        flush_tlb_mask(v->vcpu_dirty_cpumask);
+
+ out:
+    spin_unlock(&v->arch.shadow_ldt_lock);
 }
 
 
@@ -542,8 +564,10 @@ int map_ldt_shadow_page(unsigned int off)
 
     nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW);
 
+    spin_lock(&v->arch.shadow_ldt_lock);
     l1e_write(&v->arch.perdomain_ptes[off + 16], nl1e);
     v->arch.shadow_ldt_mapcnt++;
+    spin_unlock(&v->arch.shadow_ldt_lock);
 
     return 1;
 }
@@ -566,24 +590,48 @@ static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
 static int get_page_and_type_from_pagenr(unsigned long page_nr, 
                                          unsigned long type,
                                          struct domain *d,
+                                         int partial,
                                          int preemptible)
 {
     struct page_info *page = mfn_to_page(page_nr);
     int rc;
 
-    if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
+    if ( likely(partial >= 0) &&
+         unlikely(!get_page_from_pagenr(page_nr, d)) )
         return -EINVAL;
 
     rc = (preemptible ?
           get_page_type_preemptible(page, type) :
           (get_page_type(page, type) ? 0 : -EINVAL));
 
-    if ( rc )
+    if ( unlikely(rc) && partial >= 0 )
         put_page(page);
 
     return rc;
 }
 
+static int get_data_page(
+    struct page_info *page, struct domain *d, int writeable)
+{
+    int rc;
+
+    if ( writeable )
+        rc = get_page_and_type(page, d, PGT_writable_page);
+    else
+        rc = get_page(page, d);
+
+    return rc;
+}
+
+static void put_data_page(
+    struct page_info *page, int writeable)
+{
+    if ( writeable )
+        put_page_and_type(page);
+    else
+        put_page(page);
+}
+
 /*
  * We allow root tables to map each other (a.k.a. linear page tables). It
  * needs some special care with reference counts and access permissions:
@@ -642,7 +690,16 @@ get_##level##_linear_pagetable(                                             \
 
 int is_iomem_page(unsigned long mfn)
 {
-    return (!mfn_valid(mfn) || (page_get_owner(mfn_to_page(mfn)) == dom_io));
+    struct page_info *page;
+
+    if ( !mfn_valid(mfn) )
+        return 1;
+
+    /* Caller must know that it is an iomem page, or a reference is held. */
+    page = mfn_to_page(mfn);
+    ASSERT((page->count_info & PGC_count_mask) != 0);
+
+    return (page_get_owner(page) == dom_io);
 }
 
 
@@ -655,7 +712,6 @@ get_page_from_l1e(
     uint32_t l1f = l1e_get_flags(l1e);
     struct vcpu *curr = current;
     struct domain *owner;
-    int okay;
 
     if ( !(l1f & _PAGE_PRESENT) )
         return 1;
@@ -666,8 +722,13 @@ get_page_from_l1e(
         return 0;
     }
 
-    if ( is_iomem_page(mfn) )
+    if ( !mfn_valid(mfn) ||
+         (owner = page_get_owner_and_reference(page)) == dom_io )
     {
+        /* Only needed the reference to confirm dom_io ownership. */
+        if ( mfn_valid(mfn) )
+            put_page(page);
+
         /* DOMID_IO reverts to caller for privilege checks. */
         if ( d == dom_io )
             d = curr->domain;
@@ -683,37 +744,32 @@ get_page_from_l1e(
         return 1;
     }
 
+    if ( owner == NULL )
+        goto could_not_pin;
+
     /*
      * Let privileged domains transfer the right to map their target
      * domain's pages. This is used to allow stub-domain pvfb export to dom0,
      * until pvfb supports granted mappings. At that time this minor hack
      * can go away.
      */
-    owner = page_get_owner(page);
-    if ( unlikely(d != owner) && (owner != NULL) &&
-         (d != curr->domain) && IS_PRIV_FOR(d, owner) )
+    if ( unlikely(d != owner) && (d != curr->domain) && IS_PRIV_FOR(d, owner) )
         d = owner;
 
     /* Foreign mappings into guests in shadow external mode don't
      * contribute to writeable mapping refcounts.  (This allows the
      * qemu-dm helper process in dom0 to map the domain's memory without
      * messing up the count of "real" writable mappings.) */
-    okay = (((l1f & _PAGE_RW) && 
-             !(unlikely(paging_mode_external(d) && (d != curr->domain))))
-            ? get_page_and_type(page, d, PGT_writable_page)
-            : get_page(page, d));
-    if ( !okay )
-    {
-        MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
-                " for dom%d",
-                mfn, get_gpfn_from_mfn(mfn),
-                l1e_get_intpte(l1e), d->domain_id);
-    }
-    else if ( pte_flags_to_cacheattr(l1f) !=
-              ((page->count_info >> PGC_cacheattr_base) & 7) )
+    if ( (l1f & _PAGE_RW) &&
+         !(paging_mode_external(d) && (d != curr->domain)) &&
+         !get_page_type(page, PGT_writable_page) )
+        goto could_not_pin;
+
+    if ( pte_flags_to_cacheattr(l1f) !=
+         ((page->count_info >> PGC_cacheattr_base) & 7) )
     {
-        uint32_t x, nx, y = page->count_info;
-        uint32_t cacheattr = pte_flags_to_cacheattr(l1f);
+        unsigned long x, nx, y = page->count_info;
+        unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
 
         if ( is_xen_heap_page(page) )
         {
@@ -739,7 +795,16 @@ get_page_from_l1e(
 #endif
     }
 
-    return okay;
+    return 1;
+
+ could_not_pin:
+    MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
+            " for dom%d",
+            mfn, get_gpfn_from_mfn(mfn),
+            l1e_get_intpte(l1e), d->domain_id);
+    if ( owner != NULL )
+        put_page(page);
+    return 0;
 }
 
 
@@ -749,6 +814,7 @@ static int
 get_page_from_l2e(
     l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
 {
+    unsigned long mfn = l2e_get_pfn(l2e);
     int rc;
 
     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
@@ -760,10 +826,37 @@ get_page_from_l2e(
         return -EINVAL;
     }
 
-    rc = get_page_and_type_from_pagenr(
-        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
-    if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
-        rc = 0;
+    if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
+    {
+        rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
+        if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
+            rc = 0;
+    }
+    else if ( !opt_allow_hugepage || (mfn & (L1_PAGETABLE_ENTRIES-1)) )
+    {
+        rc = -EINVAL;
+    }
+    else
+    {
+        unsigned long m = mfn;
+        int writeable = !!(l2e_get_flags(l2e) & _PAGE_RW);
+  
+        do {
+            rc = get_data_page(mfn_to_page(m), d, writeable);
+            if ( unlikely(!rc) )
+            {
+                while ( m-- > mfn )
+                    put_data_page(mfn_to_page(m), writeable);
+                return -EINVAL;
+            }
+        } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
+
+#ifdef __x86_64__
+        map_pages_to_xen(
+            (unsigned long)mfn_to_virt(mfn), mfn, L1_PAGETABLE_ENTRIES,
+            PAGE_HYPERVISOR | l2e_get_flags(l2e));
+#endif
+    }
 
     return rc;
 }
@@ -772,7 +865,7 @@ get_page_from_l2e(
 define_get_linear_pagetable(l3);
 static int
 get_page_from_l3e(
-    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible)
+    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
 {
     int rc;
 
@@ -786,7 +879,7 @@ get_page_from_l3e(
     }
 
     rc = get_page_and_type_from_pagenr(
-        l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible);
+        l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
     if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
         rc = 0;
 
@@ -797,7 +890,7 @@ get_page_from_l3e(
 define_get_linear_pagetable(l4);
 static int
 get_page_from_l4e(
-    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible)
+    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
 {
     int rc;
 
@@ -811,7 +904,7 @@ get_page_from_l4e(
     }
 
     rc = get_page_and_type_from_pagenr(
-        l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible);
+        l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
     if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
         rc = 0;
 
@@ -939,7 +1032,7 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
              (d == e) )
         {
             for_each_vcpu ( d, v )
-                invalidate_shadow_ldt(v);
+                invalidate_shadow_ldt(v, 1);
         }
         put_page(page);
     }
@@ -952,32 +1045,67 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
  */
 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
 {
-    if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) && 
-         (l2e_get_pfn(l2e) != pfn) )
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
+        return 1;
+
+    if ( l2e_get_flags(l2e) & _PAGE_PSE )
+    {
+        unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
+        int writeable = l2e_get_flags(l2e) & _PAGE_RW;
+
+        ASSERT(!(mfn & (L1_PAGETABLE_ENTRIES-1)));
+        do {
+            put_data_page(mfn_to_page(m), writeable);
+        } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
+    }
+    else
     {
         put_page_and_type(l2e_get_page(l2e));
-        return 0;
     }
-    return 1;
+
+    return 0;
 }
 
+static int __put_page_type(struct page_info *, int preemptible);
 
 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
-                             int preemptible)
+                             int partial, int preemptible)
 {
-    if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 
-         (l3e_get_pfn(l3e) != pfn) )
-        return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
-    return 1;
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
+        return 1;
+
+#ifdef __x86_64__
+    if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
+    {
+        unsigned long mfn = l3e_get_pfn(l3e);
+        int writeable = l3e_get_flags(l3e) & _PAGE_RW;
+
+        ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
+        do {
+            put_data_page(mfn_to_page(mfn), writeable);
+        } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
+
+        return 0;
+    }
+#endif
+
+    if ( unlikely(partial > 0) )
+        return __put_page_type(l3e_get_page(l3e), preemptible);
+
+    return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
 }
 
 #if CONFIG_PAGING_LEVELS >= 4
 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
-                             int preemptible)
+                             int partial, int preemptible)
 {
     if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
          (l4e_get_pfn(l4e) != pfn) )
+    {
+        if ( unlikely(partial > 0) )
+            return __put_page_type(l4e_get_page(l4e), preemptible);
         return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
+    }
     return 1;
 }
 #endif
@@ -1064,7 +1192,7 @@ static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
     for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
     {
         l2e = l2e_from_page(
-            virt_to_page(page_get_owner(page)->arch.mm_perdomain_pt) + i,
+            virt_to_page(d->arch.mm_perdomain_pt) + i,
             __PAGE_HYPERVISOR);
         l2e_write(&pl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i], l2e);
     }
@@ -1184,7 +1312,7 @@ static int alloc_l3_table(struct page_info *page, int preemptible)
     unsigned long  pfn = page_to_mfn(page);
     l3_pgentry_t  *pl3e;
     unsigned int   i;
-    int            rc = 0;
+    int            rc = 0, partial = page->partial_pte;
 
 #if CONFIG_PAGING_LEVELS == 3
     /*
@@ -1213,7 +1341,8 @@ static int alloc_l3_table(struct page_info *page, int preemptible)
     if ( is_pv_32on64_domain(d) )
         memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
 
-    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ )
+    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
+          i++, partial = 0 )
     {
         if ( is_pv_32bit_domain(d) && (i == 3) )
         {
@@ -1224,16 +1353,17 @@ static int alloc_l3_table(struct page_info *page, int preemptible)
                 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
                                                    PGT_l2_page_table |
                                                    PGT_pae_xen_l2,
-                                                   d, preemptible);
+                                                   d, partial, preemptible);
         }
         else if ( !is_guest_l3_slot(i) ||
-                  (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 )
+                  (rc = get_page_from_l3e(pl3e[i], pfn, d,
+                                          partial, preemptible)) > 0 )
             continue;
 
         if ( rc == -EAGAIN )
         {
             page->nr_validated_ptes = i;
-            page->partial_pte = 1;
+            page->partial_pte = partial ?: 1;
         }
         else if ( rc == -EINTR && i )
         {
@@ -1257,7 +1387,7 @@ static int alloc_l3_table(struct page_info *page, int preemptible)
             if ( !is_guest_l3_slot(i) )
                 continue;
             unadjust_guest_l3e(pl3e[i], d);
-            put_page_from_l3e(pl3e[i], pfn, 0);
+            put_page_from_l3e(pl3e[i], pfn, 0, 0);
         }
     }
 
@@ -1272,18 +1402,20 @@ static int alloc_l4_table(struct page_info *page, int preemptible)
     unsigned long  pfn = page_to_mfn(page);
     l4_pgentry_t  *pl4e = page_to_virt(page);
     unsigned int   i;
-    int            rc = 0;
+    int            rc = 0, partial = page->partial_pte;
 
-    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ )
+    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
+          i++, partial = 0 )
     {
         if ( !is_guest_l4_slot(d, i) ||
-             (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 )
+             (rc = get_page_from_l4e(pl4e[i], pfn, d,
+                                     partial, preemptible)) > 0 )
             continue;
 
         if ( rc == -EAGAIN )
         {
             page->nr_validated_ptes = i;
-            page->partial_pte = 1;
+            page->partial_pte = partial ?: 1;
         }
         else if ( rc == -EINTR )
         {
@@ -1299,7 +1431,7 @@ static int alloc_l4_table(struct page_info *page, int preemptible)
             MEM_LOG("Failure in alloc_l4_table: entry %d", i);
             while ( i-- > 0 )
                 if ( is_guest_l4_slot(d, i) )
-                    put_page_from_l4e(pl4e[i], pfn, 0);
+                    put_page_from_l4e(pl4e[i], pfn, 0, 0);
         }
         if ( rc < 0 )
             return rc;
@@ -1377,24 +1509,20 @@ static int free_l3_table(struct page_info *page, int preemptible)
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l3_pgentry_t *pl3e;
-    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
-    int rc = 0;
-
-#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
-    if ( d->arch.relmem == RELMEM_l3 )
-        return 0;
-#endif
+    int rc = 0, partial = page->partial_pte;
+    unsigned int  i = page->nr_validated_ptes - !partial;
 
     pl3e = map_domain_page(pfn);
 
     do {
         if ( is_guest_l3_slot(i) )
         {
-            rc = put_page_from_l3e(pl3e[i], pfn, preemptible);
+            rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
+            if ( rc < 0 )
+                break;
+            partial = 0;
             if ( rc > 0 )
                 continue;
-            if ( rc )
-                break;
             unadjust_guest_l3e(pl3e[i], d);
         }
     } while ( i-- );
@@ -1404,7 +1532,7 @@ static int free_l3_table(struct page_info *page, int preemptible)
     if ( rc == -EAGAIN )
     {
         page->nr_validated_ptes = i;
-        page->partial_pte = 1;
+        page->partial_pte = partial ?: -1;
     }
     else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
     {
@@ -1421,23 +1549,21 @@ static int free_l4_table(struct page_info *page, int preemptible)
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l4_pgentry_t *pl4e = page_to_virt(page);
-    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
-    int rc = 0;
-
-#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
-    if ( d->arch.relmem == RELMEM_l4 )
-        return 0;
-#endif
+    int rc = 0, partial = page->partial_pte;
+    unsigned int  i = page->nr_validated_ptes - !partial;
 
     do {
         if ( is_guest_l4_slot(d, i) )
-            rc = put_page_from_l4e(pl4e[i], pfn, preemptible);
-    } while ( rc >= 0 && i-- );
+            rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
+        if ( rc < 0 )
+            break;
+        partial = 0;
+    } while ( i-- );
 
     if ( rc == -EAGAIN )
     {
         page->nr_validated_ptes = i;
-        page->partial_pte = 1;
+        page->partial_pte = partial ?: -1;
     }
     else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
     {
@@ -1451,24 +1577,31 @@ static int free_l4_table(struct page_info *page, int preemptible)
 #define free_l4_table(page, preemptible) (-EINVAL)
 #endif
 
-static void page_lock(struct page_info *page)
+static int page_lock(struct page_info *page)
 {
-#if defined(__i386__)
-    while ( unlikely(test_and_set_bit(_PGC_locked, &page->count_info)) )
-        while ( test_bit(_PGC_locked, &page->count_info) )
+    unsigned long x, nx;
+
+    do {
+        while ( (x = page->u.inuse.type_info) & PGT_locked )
             cpu_relax();
-#else
-    spin_lock(&page->lock);
-#endif
+        nx = x + (1 | PGT_locked);
+        if ( !(x & PGT_validated) ||
+             !(x & PGT_count_mask) ||
+             !(nx & PGT_count_mask) )
+            return 0;
+    } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
+
+    return 1;
 }
 
 static void page_unlock(struct page_info *page)
 {
-#if defined(__i386__)
-    clear_bit(_PGC_locked, &page->count_info);
-#else
-    spin_unlock(&page->lock);
-#endif
+    unsigned long x, nx, y = page->u.inuse.type_info;
+
+    do {
+        x = y;
+        nx = x - (1 | PGT_locked);
+    } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
 }
 
 /* How to write an entry to the guest pagetables.
@@ -1524,40 +1657,36 @@ static inline int update_intpte(intpte_t *p,
                   (_m), (_v), (_ad))
 
 /* Update the L1 entry at pl1e to new value nl1e. */
-static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, 
-                        unsigned long gl1mfn, int preserve_ad)
+static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
+                        unsigned long gl1mfn, int preserve_ad,
+                        struct vcpu *vcpu)
 {
     l1_pgentry_t ol1e;
-    struct vcpu *curr = current;
-    struct domain *d = curr->domain;
+    struct domain *d = vcpu->domain;
     unsigned long mfn;
-    struct page_info *l1pg = mfn_to_page(gl1mfn);
+    p2m_type_t p2mt;
     int rc = 1;
 
-    page_lock(l1pg);
-
     if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
-        return page_unlock(l1pg), 0;
+        return 0;
 
     if ( unlikely(paging_mode_refcounts(d)) )
     {
-        rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, preserve_ad);
-        page_unlock(l1pg);
+        rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu, preserve_ad);
         return rc;
     }
 
     if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
     {
         /* Translate foreign guest addresses. */
-        mfn = gmfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e));
-        if ( unlikely(mfn == INVALID_MFN) )
-            return page_unlock(l1pg), 0;
+        mfn = mfn_x(gfn_to_mfn(FOREIGNDOM, l1e_get_pfn(nl1e), &p2mt));
+        if ( !p2m_is_ram(p2mt) || unlikely(mfn == INVALID_MFN) )
+            return 0;
         ASSERT((mfn & ~(PADDR_MASK >> PAGE_SHIFT)) == 0);
         nl1e = l1e_from_pfn(mfn, l1e_get_flags(nl1e));
 
         if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(d)) )
         {
-            page_unlock(l1pg);
             MEM_LOG("Bad L1 flags %x",
                     l1e_get_flags(nl1e) & l1_disallow_mask(d));
             return 0;
@@ -1567,31 +1696,28 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
         if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
         {
             adjust_guest_l1e(nl1e, d);
-            rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
+            rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
                               preserve_ad);
-            page_unlock(l1pg);
             return rc;
         }
 
         if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
-            return page_unlock(l1pg), 0;
+            return 0;
         
         adjust_guest_l1e(nl1e, d);
-        if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
+        if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
                                     preserve_ad)) )
         {
             ol1e = nl1e;
             rc = 0;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr,
+    else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, vcpu,
                                      preserve_ad)) )
     {
-        page_unlock(l1pg);
         return 0;
     }
 
-    page_unlock(l1pg);
     put_page_from_l1e(ol1e, d);
     return rc;
 }
@@ -1601,13 +1727,13 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
 static int mod_l2_entry(l2_pgentry_t *pl2e, 
                         l2_pgentry_t nl2e, 
                         unsigned long pfn,
-                        unsigned long type,
-                        int preserve_ad)
+                        int preserve_ad,
+                        struct vcpu *vcpu)
 {
     l2_pgentry_t ol2e;
-    struct vcpu *curr = current;
-    struct domain *d = curr->domain;
+    struct domain *d = vcpu->domain;
     struct page_info *l2pg = mfn_to_page(pfn);
+    unsigned long type = l2pg->u.inuse.type_info;
     int rc = 1;
 
     if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
@@ -1616,16 +1742,13 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
         return 0;
     }
 
-    page_lock(l2pg);
-
     if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
-        return page_unlock(l2pg), 0;
+        return 0;
 
     if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
     {
         if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
         {
-            page_unlock(l2pg);
             MEM_LOG("Bad L2 flags %x",
                     l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
             return 0;
@@ -1635,30 +1758,27 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
         if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) )
         {
             adjust_guest_l2e(nl2e, d);
-            rc = UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr, preserve_ad);
-            page_unlock(l2pg);
+            rc = UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad);
             return rc;
         }
 
         if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
-            return page_unlock(l2pg), 0;
+            return 0;
 
         adjust_guest_l2e(nl2e, d);
-        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
+        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
                                     preserve_ad)) )
         {
             ol2e = nl2e;
             rc = 0;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, curr,
+    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
                                      preserve_ad)) )
     {
-        page_unlock(l2pg);
         return 0;
     }
 
-    page_unlock(l2pg);
     put_page_from_l2e(ol2e, pfn);
     return rc;
 }
@@ -1668,12 +1788,11 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
                         l3_pgentry_t nl3e, 
                         unsigned long pfn,
                         int preserve_ad,
-                        int preemptible)
+                        int preemptible,
+                        struct vcpu *vcpu)
 {
     l3_pgentry_t ol3e;
-    struct vcpu *curr = current;
-    struct domain *d = curr->domain;
-    struct page_info *l3pg = mfn_to_page(pfn);
+    struct domain *d = vcpu->domain;
     int rc = 0;
 
     if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
@@ -1689,16 +1808,13 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
     if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
         return -EINVAL;
 
-    page_lock(l3pg);
-
     if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
-        return page_unlock(l3pg), -EFAULT;
+        return -EFAULT;
 
     if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
     {
         if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
         {
-            page_unlock(l3pg);
             MEM_LOG("Bad L3 flags %x",
                     l3e_get_flags(nl3e) & l3_disallow_mask(d));
             return -EINVAL;
@@ -1708,28 +1824,26 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
         if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) )
         {
             adjust_guest_l3e(nl3e, d);
-            rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
-            page_unlock(l3pg);
+            rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
             return rc ? 0 : -EFAULT;
         }
 
-        rc = get_page_from_l3e(nl3e, pfn, d, preemptible);
+        rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
         if ( unlikely(rc < 0) )
-            return page_unlock(l3pg), rc;
+            return rc;
         rc = 0;
 
         adjust_guest_l3e(nl3e, d);
-        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
+        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
                                     preserve_ad)) )
         {
             ol3e = nl3e;
             rc = -EFAULT;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
+    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
                                      preserve_ad)) )
     {
-        page_unlock(l3pg);
         return -EFAULT;
     }
 
@@ -1741,8 +1855,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
         pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
     }
 
-    page_unlock(l3pg);
-    put_page_from_l3e(ol3e, pfn, 0);
+    put_page_from_l3e(ol3e, pfn, 0, 0);
     return rc;
 }
 
@@ -1753,12 +1866,11 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
                         l4_pgentry_t nl4e, 
                         unsigned long pfn,
                         int preserve_ad,
-                        int preemptible)
+                        int preemptible,
+                        struct vcpu *vcpu)
 {
-    struct vcpu *curr = current;
-    struct domain *d = curr->domain;
+    struct domain *d = vcpu->domain;
     l4_pgentry_t ol4e;
-    struct page_info *l4pg = mfn_to_page(pfn);
     int rc = 0;
 
     if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
@@ -1767,16 +1879,13 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
         return -EINVAL;
     }
 
-    page_lock(l4pg);
-
     if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
-        return page_unlock(l4pg), -EFAULT;
+        return -EFAULT;
 
     if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
     {
         if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
         {
-            page_unlock(l4pg);
             MEM_LOG("Bad L4 flags %x",
                     l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
             return -EINVAL;
@@ -1786,33 +1895,30 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
         if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) )
         {
             adjust_guest_l4e(nl4e, d);
-            rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
-            page_unlock(l4pg);
+            rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
             return rc ? 0 : -EFAULT;
         }
 
-        rc = get_page_from_l4e(nl4e, pfn, d, preemptible);
+        rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
         if ( unlikely(rc < 0) )
-            return page_unlock(l4pg), rc;
+            return rc;
         rc = 0;
 
         adjust_guest_l4e(nl4e, d);
-        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
+        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
                                     preserve_ad)) )
         {
             ol4e = nl4e;
             rc = -EFAULT;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
+    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
                                      preserve_ad)) )
     {
-        page_unlock(l4pg);
         return -EFAULT;
     }
 
-    page_unlock(l4pg);
-    put_page_from_l4e(ol4e, pfn, 0);
+    put_page_from_l4e(ol4e, pfn, 0, 0);
     return rc;
 }
 
@@ -1820,9 +1926,10 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
 
 void put_page(struct page_info *page)
 {
-    u32 nx, x, y = page->count_info;
+    unsigned long nx, x, y = page->count_info;
 
     do {
+        ASSERT((y & PGC_count_mask) != 0);
         x  = y;
         nx = x - 1;
     }
@@ -1836,40 +1943,67 @@ void put_page(struct page_info *page)
 }
 
 
+struct domain *page_get_owner_and_reference(struct page_info *page)
+{
+    unsigned long x, y = page->count_info;
+
+    do {
+        x = y;
+        /*
+         * Count ==  0: Page is not allocated, so we cannot take a reference.
+         * Count == -1: Reference count would wrap, which is invalid. 
+         * Count == -2: Remaining unused ref is reserved for get_page_light().
+         */
+        if ( unlikely(((x + 2) & PGC_count_mask) <= 2) )
+            return NULL;
+    }
+    while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x );
+
+    return page_get_owner(page);
+}
+
+
 int get_page(struct page_info *page, struct domain *domain)
 {
-    u32 x, nx, y = page->count_info;
-    u32 d, nd = page->u.inuse._domain;
-    u32 _domain = pickle_domptr(domain);
+    struct domain *owner = page_get_owner_and_reference(page);
+
+    if ( likely(owner == domain) )
+        return 1;
+
+    if ( owner != NULL )
+        put_page(page);
+
+    if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
+        gdprintk(XENLOG_INFO,
+                 "Error pfn %lx: rd=%p, od=%p, caf=%08lx, taf=%"
+                 PRtype_info "\n",
+                 page_to_mfn(page), domain, owner,
+                 page->count_info, page->u.inuse.type_info);
+    return 0;
+}
+
+/*
+ * Special version of get_page() to be used exclusively when
+ * - a page is known to already have a non-zero reference count
+ * - the page does not need its owner to be checked
+ * - it will not be called more than once without dropping the thus
+ *   acquired reference again.
+ * Due to get_page() reserving one reference, this call cannot fail.
+ */
+static void get_page_light(struct page_info *page)
+{
+    unsigned long x, nx, y = page->count_info;
 
     do {
         x  = y;
         nx = x + 1;
-        d  = nd;
-        if ( unlikely((x & PGC_count_mask) == 0) ||  /* Not allocated? */
-             unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
-             unlikely(d != _domain) )                /* Wrong owner? */
-        {
-            if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
-                gdprintk(XENLOG_INFO,
-                         "Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
-                         PRtype_info "\n",
-                         page_to_mfn(page), domain, unpickle_domptr(d),
-                         x, page->u.inuse.type_info);
-            return 0;
-        }
-        asm volatile (
-            LOCK_PREFIX "cmpxchg8b %2"
-            : "=d" (nd), "=a" (y),
-            "=m" (*(volatile u64 *)(&page->count_info))
-            : "0" (d), "1" (x), "c" (d), "b" (nx) );
+        BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
+        BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
+        y = cmpxchg(&page->count_info, x, nx);
     }
-    while ( unlikely(nd != d) || unlikely(y != x) );
-
-    return 1;
+    while ( unlikely(y != x) );
 }
 
-
 static int alloc_page_type(struct page_info *page, unsigned long type,
                            int preemptible)
 {
@@ -1898,7 +2032,7 @@ static int alloc_page_type(struct page_info *page, unsigned long type,
         rc = alloc_segdesc_page(page);
         break;
     default:
-        printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n", 
+        printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%lx\n", 
                type, page->u.inuse.type_info,
                page->count_info);
         rc = -EINVAL;
@@ -1909,6 +2043,7 @@ static int alloc_page_type(struct page_info *page, unsigned long type,
     wmb();
     if ( rc == -EAGAIN )
     {
+        get_page_light(page);
         page->u.inuse.type_info |= PGT_partial;
     }
     else if ( rc == -EINTR )
@@ -1921,7 +2056,7 @@ static int alloc_page_type(struct page_info *page, unsigned long type,
     {
         ASSERT(rc < 0);
         MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
-                PRtype_info ": caf=%08x taf=%" PRtype_info,
+                PRtype_info ": caf=%08lx taf=%" PRtype_info,
                 page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
                 type, page->count_info, page->u.inuse.type_info);
         page->u.inuse.type_info = 0;
@@ -1942,30 +2077,17 @@ int free_page_type(struct page_info *page, unsigned long type,
     unsigned long gmfn;
     int rc;
 
-    if ( likely(owner != NULL) )
+    if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
     {
-        /*
-         * We have to flush before the next use of the linear mapping
-         * (e.g., update_va_mapping()) or we could end up modifying a page
-         * that is no longer a page table (and hence screw up ref counts).
-         */
-        if ( current->domain == owner )
-            queue_deferred_ops(owner, DOP_FLUSH_ALL_TLBS);
-        else
-            flush_tlb_mask(owner->domain_dirty_cpumask);
-
-        if ( unlikely(paging_mode_enabled(owner)) )
-        {
-            /* A page table is dirtied when its type count becomes zero. */
-            paging_mark_dirty(owner, page_to_mfn(page));
+        /* A page table is dirtied when its type count becomes zero. */
+        paging_mark_dirty(owner, page_to_mfn(page));
 
-            if ( shadow_mode_refcounts(owner) )
-                return 0;
+        if ( shadow_mode_refcounts(owner) )
+            return 0;
 
-            gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
-            ASSERT(VALID_M2P(gmfn));
-            shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
-        }
+        gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
+        ASSERT(VALID_M2P(gmfn));
+        shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
     }
 
     if ( !(type & PGT_partial) )
@@ -1973,6 +2095,7 @@ int free_page_type(struct page_info *page, unsigned long type,
         page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
         page->partial_pte = 0;
     }
+
     switch ( type & PGT_type_mask )
     {
     case PGT_l1_page_table:
@@ -1998,6 +2121,15 @@ int free_page_type(struct page_info *page, unsigned long type,
         BUG();
     }
 
+    return rc;
+}
+
+
+static int __put_final_page_type(
+    struct page_info *page, unsigned long type, int preemptible)
+{
+    int rc = free_page_type(page, type, preemptible);
+
     /* No need for atomic update of type_info here: noone else updates it. */
     if ( rc == 0 )
     {
@@ -2016,8 +2148,8 @@ int free_page_type(struct page_info *page, unsigned long type,
     }
     else if ( rc == -EINTR )
     {
-        ASSERT(!(page->u.inuse.type_info &
-                 (PGT_count_mask|PGT_validated|PGT_partial)));
+        ASSERT((page->u.inuse.type_info &
+                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
         if ( !(shadow_mode_enabled(page_get_owner(page)) &&
                (page->count_info & PGC_page_table)) )
             page->tlbflush_timestamp = tlbflush_current_time();
@@ -2028,6 +2160,7 @@ int free_page_type(struct page_info *page, unsigned long type,
     {
         BUG_ON(rc != -EAGAIN);
         wmb();
+        get_page_light(page);
         page->u.inuse.type_info |= PGT_partial;
     }
 
@@ -2039,6 +2172,7 @@ static int __put_page_type(struct page_info *page,
                            int preemptible)
 {
     unsigned long nx, x, y = page->u.inuse.type_info;
+    int rc = 0;
 
     for ( ; ; )
     {
@@ -2062,7 +2196,10 @@ static int __put_page_type(struct page_info *page,
                                            x, nx)) != x) )
                     continue;
                 /* We cleared the 'valid bit' so we do the clean up. */
-                return free_page_type(page, x, preemptible);
+                rc = __put_final_page_type(page, x, preemptible);
+                if ( x & PGT_partial )
+                    put_page(page);
+                break;
             }
 
             /*
@@ -2084,7 +2221,7 @@ static int __put_page_type(struct page_info *page,
             return -EINTR;
     }
 
-    return 0;
+    return rc;
 }
 
 
@@ -2092,6 +2229,7 @@ static int __get_page_type(struct page_info *page, unsigned long type,
                            int preemptible)
 {
     unsigned long nx, x, y = page->u.inuse.type_info;
+    int rc = 0;
 
     ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
 
@@ -2214,10 +2352,13 @@ static int __get_page_type(struct page_info *page, unsigned long type,
             page->nr_validated_ptes = 0;
             page->partial_pte = 0;
         }
-        return alloc_page_type(page, type, preemptible);
+        rc = alloc_page_type(page, type, preemptible);
     }
 
-    return 0;
+    if ( (x & PGT_partial) && !(nx & PGT_partial) )
+        put_page(page);
+
+    return rc;
 }
 
 void put_page_type(struct page_info *page)
@@ -2266,8 +2407,8 @@ void cleanup_page_cacheattr(struct page_info *page)
 
 int new_guest_cr3(unsigned long mfn)
 {
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
+    struct vcpu *curr = current;
+    struct domain *d = curr->domain;
     int okay;
     unsigned long old_base_mfn;
 
@@ -2277,40 +2418,40 @@ int new_guest_cr3(unsigned long mfn)
         okay = paging_mode_refcounts(d)
             ? 0 /* Old code was broken, but what should it be? */
             : mod_l4_entry(
-                    __va(pagetable_get_paddr(v->arch.guest_table)),
+                    __va(pagetable_get_paddr(curr->arch.guest_table)),
                     l4e_from_pfn(
                         mfn,
                         (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
-                    pagetable_get_pfn(v->arch.guest_table), 0, 0) == 0;
+                    pagetable_get_pfn(curr->arch.guest_table), 0, 0, curr) == 0;
         if ( unlikely(!okay) )
         {
             MEM_LOG("Error while installing new compat baseptr %lx", mfn);
             return 0;
         }
 
-        invalidate_shadow_ldt(v);
-        write_ptbase(v);
+        invalidate_shadow_ldt(curr, 0);
+        write_ptbase(curr);
 
         return 1;
     }
 #endif
     okay = paging_mode_refcounts(d)
         ? get_page_from_pagenr(mfn, d)
-        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0);
+        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
     if ( unlikely(!okay) )
     {
         MEM_LOG("Error while installing new baseptr %lx", mfn);
         return 0;
     }
 
-    invalidate_shadow_ldt(v);
+    invalidate_shadow_ldt(curr, 0);
 
-    old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+    old_base_mfn = pagetable_get_pfn(curr->arch.guest_table);
 
-    v->arch.guest_table = pagetable_from_pfn(mfn);
-    update_cr3(v);
+    curr->arch.guest_table = pagetable_from_pfn(mfn);
+    update_cr3(curr);
 
-    write_ptbase(v);
+    write_ptbase(curr);
 
     if ( likely(old_base_mfn != 0) )
     {
@@ -2340,6 +2481,10 @@ static void process_deferred_ops(void)
             flush_tlb_local();
     }
 
+    /*
+     * Do this after flushing TLBs, to ensure we see fresh LDT mappings
+     * via the linear pagetable mapping.
+     */
     if ( deferred_ops & DOP_RELOAD_LDT )
         (void)map_ldt_shadow_page(0);
 
@@ -2431,6 +2576,29 @@ static inline cpumask_t vcpumask_to_pcpumask(
     return pmask;
 }
 
+#ifdef __i386__
+static inline void *fixmap_domain_page(unsigned long mfn)
+{
+    unsigned int cpu = smp_processor_id();
+    void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
+
+    l1e_write(fix_pae_highmem_pl1e - cpu,
+              l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
+    flush_tlb_one_local(ptr);
+    return ptr;
+}
+static inline void fixunmap_domain_page(const void *ptr)
+{
+    unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0;
+
+    l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty());
+    this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
+}
+#else
+#define fixmap_domain_page(mfn) mfn_to_virt(mfn)
+#define fixunmap_domain_page(ptr) ((void)(ptr))
+#endif
+
 int do_mmuext_op(
     XEN_GUEST_HANDLE(mmuext_op_t) uops,
     unsigned int count,
@@ -2442,8 +2610,8 @@ int do_mmuext_op(
     unsigned long mfn = 0, gmfn = 0, type;
     unsigned int done = 0;
     struct page_info *page;
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
+    struct vcpu *curr = current;
+    struct domain *d = curr->domain;
 
     if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
     {
@@ -2517,7 +2685,7 @@ int do_mmuext_op(
             if ( paging_mode_refcounts(FOREIGNDOM) )
                 break;
 
-            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
+            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
             okay = !rc;
             if ( unlikely(!okay) )
             {
@@ -2598,7 +2766,7 @@ int do_mmuext_op(
                     okay = get_page_from_pagenr(mfn, d);
                 else
                     okay = !get_page_and_type_from_pagenr(
-                        mfn, PGT_root_page_table, d, 0);
+                        mfn, PGT_root_page_table, d, 0, 0);
                 if ( unlikely(!okay) )
                 {
                     MEM_LOG("Error while installing new mfn %lx", mfn);
@@ -2606,8 +2774,8 @@ int do_mmuext_op(
                 }
             }
 
-            old_mfn = pagetable_get_pfn(v->arch.guest_table_user);
-            v->arch.guest_table_user = pagetable_from_pfn(mfn);
+            old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
+            curr->arch.guest_table_user = pagetable_from_pfn(mfn);
 
             if ( old_mfn != 0 )
             {
@@ -2627,7 +2795,7 @@ int do_mmuext_op(
     
         case MMUEXT_INVLPG_LOCAL:
             if ( !paging_mode_enabled(d) 
-                 || paging_invlpg(v, op.arg1.linear_addr) != 0 )
+                 || paging_invlpg(curr, op.arg1.linear_addr) != 0 )
                 flush_tlb_one_local(op.arg1.linear_addr);
             break;
 
@@ -2650,7 +2818,7 @@ int do_mmuext_op(
         }
 
         case MMUEXT_TLB_FLUSH_ALL:
-            flush_tlb_mask(d->domain_dirty_cpumask);
+            this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
             break;
     
         case MMUEXT_INVLPG_ALL:
@@ -2686,13 +2854,14 @@ int do_mmuext_op(
                 okay = 0;
                 MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents);
             }
-            else if ( (v->arch.guest_context.ldt_ents != ents) || 
-                      (v->arch.guest_context.ldt_base != ptr) )
+            else if ( (curr->arch.guest_context.ldt_ents != ents) || 
+                      (curr->arch.guest_context.ldt_base != ptr) )
             {
-                invalidate_shadow_ldt(v);
-                v->arch.guest_context.ldt_base = ptr;
-                v->arch.guest_context.ldt_ents = ents;
-                load_LDT(v);
+                invalidate_shadow_ldt(curr, 0);
+                this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
+                curr->arch.guest_context.ldt_base = ptr;
+                curr->arch.guest_context.ldt_ents = ents;
+                load_LDT(curr);
                 this_cpu(percpu_mm_info).deferred_ops &= ~DOP_RELOAD_LDT;
                 if ( ents != 0 )
                     this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
@@ -2700,6 +2869,66 @@ int do_mmuext_op(
             break;
         }
 
+        case MMUEXT_CLEAR_PAGE:
+        {
+            unsigned char *ptr;
+
+            okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
+                                                  FOREIGNDOM, 0, 0);
+            if ( unlikely(!okay) )
+            {
+                MEM_LOG("Error while clearing mfn %lx", mfn);
+                break;
+            }
+
+            /* A page is dirtied when it's being cleared. */
+            paging_mark_dirty(d, mfn);
+
+            ptr = fixmap_domain_page(mfn);
+            clear_page(ptr);
+            fixunmap_domain_page(ptr);
+
+            put_page_and_type(page);
+            break;
+        }
+
+        case MMUEXT_COPY_PAGE:
+        {
+            const unsigned char *src;
+            unsigned char *dst;
+            unsigned long src_mfn;
+
+            src_mfn = gmfn_to_mfn(FOREIGNDOM, op.arg2.src_mfn);
+            okay = get_page_from_pagenr(src_mfn, FOREIGNDOM);
+            if ( unlikely(!okay) )
+            {
+                MEM_LOG("Error while copying from mfn %lx", src_mfn);
+                break;
+            }
+
+            okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
+                                                  FOREIGNDOM, 0, 0);
+            if ( unlikely(!okay) )
+            {
+                put_page(mfn_to_page(src_mfn));
+                MEM_LOG("Error while copying to mfn %lx", mfn);
+                break;
+            }
+
+            /* A page is dirtied when it's being copied to. */
+            paging_mark_dirty(d, mfn);
+
+            src = map_domain_page(src_mfn);
+            dst = fixmap_domain_page(mfn);
+            copy_page(dst, src);
+            fixunmap_domain_page(dst);
+            unmap_domain_page(src);
+
+            put_page_and_type(page);
+            put_page(mfn_to_page(src_mfn));
+            break;
+        }
+
         default:
             MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
             rc = -ENOSYS;
@@ -2748,9 +2977,7 @@ int do_mmu_update(
     struct page_info *page;
     int rc = 0, okay = 1, i = 0;
     unsigned int cmd, done = 0;
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    unsigned long type_info;
+    struct domain *d = current->domain;
     struct domain_mmap_cache mapcache;
 
     if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
@@ -2822,44 +3049,32 @@ int do_mmu_update(
                           (unsigned long)(req.ptr & ~PAGE_MASK));
             page = mfn_to_page(mfn);
 
-            switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
+            if ( page_lock(page) )
             {
-            case PGT_l1_page_table:
-            case PGT_l2_page_table:
-            case PGT_l3_page_table:
-            case PGT_l4_page_table:
-            {
-                if ( paging_mode_refcounts(d) )
-                {
-                    MEM_LOG("mmu update on auto-refcounted domain!");
-                    break;
-                }
-
-                if ( unlikely(!get_page_type(
-                    page, type_info & (PGT_type_mask|PGT_pae_xen_l2))) )
-                    goto not_a_pt;
-
-                switch ( type_info & PGT_type_mask )
+                switch ( page->u.inuse.type_info & PGT_type_mask )
                 {
                 case PGT_l1_page_table:
                 {
                     l1_pgentry_t l1e = l1e_from_intpte(req.val);
                     okay = mod_l1_entry(va, l1e, mfn,
-                                        cmd == MMU_PT_UPDATE_PRESERVE_AD);
+                                        cmd == MMU_PT_UPDATE_PRESERVE_AD,
+                                        current);
                 }
                 break;
                 case PGT_l2_page_table:
                 {
                     l2_pgentry_t l2e = l2e_from_intpte(req.val);
-                    okay = mod_l2_entry(va, l2e, mfn, type_info,
-                                        cmd == MMU_PT_UPDATE_PRESERVE_AD);
+                    okay = mod_l2_entry(va, l2e, mfn,
+                                        cmd == MMU_PT_UPDATE_PRESERVE_AD,
+                                        current);
                 }
                 break;
                 case PGT_l3_page_table:
                 {
                     l3_pgentry_t l3e = l3e_from_intpte(req.val);
                     rc = mod_l3_entry(va, l3e, mfn,
-                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
+                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, 1,
+                                      current);
                     okay = !rc;
                 }
                 break;
@@ -2868,36 +3083,31 @@ int do_mmu_update(
                 {
                     l4_pgentry_t l4e = l4e_from_intpte(req.val);
                     rc = mod_l4_entry(va, l4e, mfn,
-                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
+                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, 1,
+                                      current);
                     okay = !rc;
                 }
                 break;
 #endif
+                case PGT_writable_page:
+                    perfc_incr(writable_mmu_updates);
+                    okay = paging_write_guest_entry(
+                        current, va, req.val, _mfn(mfn));
+                    break;
                 }
-
-                put_page_type(page);
+                page_unlock(page);
                 if ( rc == -EINTR )
                     rc = -EAGAIN;
             }
-            break;
-
-            default:
-            not_a_pt:
+            else if ( get_page_type(page, PGT_writable_page) )
             {
-                if ( unlikely(!get_page_type(page, PGT_writable_page)) )
-                    break;
-
                 perfc_incr(writable_mmu_updates);
-
-                okay = paging_write_guest_entry(v, va, req.val, _mfn(mfn));
-
+                okay = paging_write_guest_entry(
+                    current, va, req.val, _mfn(mfn));
                 put_page_type(page);
             }
-            break;
-            }
 
             unmap_domain_page_with_cache(va, &mapcache);
-
             put_page(page);
             break;
 
@@ -2976,7 +3186,6 @@ static int create_grant_pte_mapping(
     void *va;
     unsigned long gmfn, mfn;
     struct page_info *page;
-    u32 type;
     l1_pgentry_t ol1e;
     struct domain *d = v->domain;
 
@@ -2997,21 +3206,23 @@ static int create_grant_pte_mapping(
     va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK));
     page = mfn_to_page(mfn);
 
-    type = page->u.inuse.type_info & PGT_type_mask;
-    if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
+    if ( !page_lock(page) )
     {
-        MEM_LOG("Grant map attempted to update a non-L1 page");
         rc = GNTST_general_error;
         goto failed;
     }
 
-    page_lock(page);
+    if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
+    {
+        page_unlock(page);
+        rc = GNTST_general_error;
+        goto failed;
+    }
 
     ol1e = *(l1_pgentry_t *)va;
     if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) )
     {
         page_unlock(page);
-        put_page_type(page);
         rc = GNTST_general_error;
         goto failed;
     } 
@@ -3021,8 +3232,6 @@ static int create_grant_pte_mapping(
     if ( !paging_mode_refcounts(d) )
         put_page_from_l1e(ol1e, d);
 
-    put_page_type(page);
  failed:
     unmap_domain_page(va);
     put_page(page);
@@ -3037,7 +3246,6 @@ static int destroy_grant_pte_mapping(
     void *va;
     unsigned long gmfn, mfn;
     struct page_info *page;
-    u32 type;
     l1_pgentry_t ol1e;
 
     gmfn = addr >> PAGE_SHIFT;
@@ -3053,15 +3261,18 @@ static int destroy_grant_pte_mapping(
     va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK));
     page = mfn_to_page(mfn);
 
-    type = page->u.inuse.type_info & PGT_type_mask;
-    if ( (type != PGT_l1_page_table) || !get_page_type(page, type) )
+    if ( !page_lock(page) )
     {
-        MEM_LOG("Grant map attempted to update a non-L1 page");
         rc = GNTST_general_error;
         goto failed;
     }
 
-    page_lock(page);
+    if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
+    {
+        page_unlock(page);
+        rc = GNTST_general_error;
+        goto failed;
+    }
 
     ol1e = *(l1_pgentry_t *)va;
     
@@ -3071,7 +3282,6 @@ static int destroy_grant_pte_mapping(
         page_unlock(page);
         MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx",
                 (unsigned long)l1e_get_intpte(ol1e), addr, frame);
-        put_page_type(page);
         rc = GNTST_general_error;
         goto failed;
     }
@@ -3085,13 +3295,11 @@ static int destroy_grant_pte_mapping(
     {
         page_unlock(page);
         MEM_LOG("Cannot delete PTE entry at %p", va);
-        put_page_type(page);
         rc = GNTST_general_error;
         goto failed;
     }
 
     page_unlock(page);
-    put_page_type(page);
 
  failed:
     unmap_domain_page(va);
@@ -3119,21 +3327,40 @@ static int create_grant_va_mapping(
         MEM_LOG("Could not find L1 PTE for address %lx", va);
         return GNTST_general_error;
     }
+
+    if ( !get_page_from_pagenr(gl1mfn, current->domain) )
+    {
+        guest_unmap_l1e(v, pl1e);
+        return GNTST_general_error;
+    }
+
     l1pg = mfn_to_page(gl1mfn);
-    page_lock(l1pg);
+    if ( !page_lock(l1pg) )
+    {
+        put_page(l1pg);
+        guest_unmap_l1e(v, pl1e);
+        return GNTST_general_error;
+    }
+
+    if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
+    {
+        page_unlock(l1pg);
+        put_page(l1pg);
+        guest_unmap_l1e(v, pl1e);
+        return GNTST_general_error;
+    }
+
     ol1e = *pl1e;
     okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0);
+
     page_unlock(l1pg);
+    put_page(l1pg);
     guest_unmap_l1e(v, pl1e);
-    pl1e = NULL;
-
-    if ( !okay )
-            return GNTST_general_error;
 
-    if ( !paging_mode_refcounts(d) )
+    if ( okay && !paging_mode_refcounts(d) )
         put_page_from_l1e(ol1e, d);
 
-    return GNTST_okay;
+    return okay ? GNTST_okay : GNTST_general_error;
 }
 
 static int replace_grant_va_mapping(
@@ -3151,31 +3378,48 @@ static int replace_grant_va_mapping(
         return GNTST_general_error;
     }
 
+    if ( !get_page_from_pagenr(gl1mfn, current->domain) )
+    {
+        rc = GNTST_general_error;
+        goto out;
+    }
+
     l1pg = mfn_to_page(gl1mfn);
-    page_lock(l1pg);
+    if ( !page_lock(l1pg) )
+    {
+        rc = GNTST_general_error;
+        put_page(l1pg);
+        goto out;
+    }
+
+    if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
+    {
+        rc = GNTST_general_error;
+        goto unlock_and_out;
+    }
+
     ol1e = *pl1e;
 
     /* Check that the virtual address supplied is actually mapped to frame. */
     if ( unlikely(l1e_get_pfn(ol1e) != frame) )
     {
-        page_unlock(l1pg);
         MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx",
                 l1e_get_pfn(ol1e), addr, frame);
         rc = GNTST_general_error;
-        goto out;
+        goto unlock_and_out;
     }
 
     /* Delete pagetable entry. */
     if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) )
     {
-        page_unlock(l1pg);
         MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
         rc = GNTST_general_error;
-        goto out;
+        goto unlock_and_out;
     }
 
+ unlock_and_out:
     page_unlock(l1pg);
-
+    put_page(l1pg);
  out:
     guest_unmap_l1e(v, pl1e);
     return rc;
@@ -3197,6 +3441,10 @@ int create_grant_host_mapping(uint64_t addr, unsigned long frame,
     if ( !(flags & GNTMAP_readonly) )
         l1e_add_flags(pte,_PAGE_RW);
 
+    l1e_add_flags(pte,
+                  ((flags >> _GNTMAP_guest_avail0) * _PAGE_AVAIL0)
+                   & _PAGE_AVAIL);
+
     l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5));
 
     if ( flags & GNTMAP_contains_pte )
@@ -3233,20 +3481,42 @@ int replace_grant_host_mapping(
         return GNTST_general_error;
     }
 
+    if ( !get_page_from_pagenr(gl1mfn, current->domain) )
+    {
+        guest_unmap_l1e(curr, pl1e);
+        return GNTST_general_error;
+    }
+
     l1pg = mfn_to_page(gl1mfn);
-    page_lock(l1pg);
+    if ( !page_lock(l1pg) )
+    {
+        put_page(l1pg);
+        guest_unmap_l1e(curr, pl1e);
+        return GNTST_general_error;
+    }
+
+    if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
+    {
+        page_unlock(l1pg);
+        put_page(l1pg);
+        guest_unmap_l1e(curr, pl1e);
+        return GNTST_general_error;
+    }
+
     ol1e = *pl1e;
 
     if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(),
                                 gl1mfn, curr, 0)) )
     {
         page_unlock(l1pg);
+        put_page(l1pg);
         MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
         guest_unmap_l1e(curr, pl1e);
         return GNTST_general_error;
     }
 
     page_unlock(l1pg);
+    put_page(l1pg);
     guest_unmap_l1e(curr, pl1e);
 
     rc = replace_grant_va_mapping(addr, frame, ol1e, curr);
@@ -3259,49 +3529,47 @@ int replace_grant_host_mapping(
 int steal_page(
     struct domain *d, struct page_info *page, unsigned int memflags)
 {
-    u32 _d, _nd, x, y;
+    unsigned long x, y;
 
     spin_lock(&d->page_alloc_lock);
 
+    if ( is_xen_heap_page(page) || (page_get_owner(page) != d) )
+        goto fail;
+
     /*
-     * The tricky bit: atomically release ownership while there is just one 
-     * benign reference to the page (PGC_allocated). If that reference 
-     * disappears then the deallocation routine will safely spin.
+     * We require there is just one reference (PGC_allocated). We temporarily
+     * drop this reference now so that we can safely swizzle the owner.
      */
-    _d  = pickle_domptr(d);
-    _nd = page->u.inuse._domain;
-    y   = page->count_info;
+    y = page->count_info;
     do {
         x = y;
-        if ( unlikely((x & (PGC_count_mask|PGC_allocated)) !=
-                      (1 | PGC_allocated)) || unlikely(_nd != _d) )
-        { 
-            MEM_LOG("gnttab_transfer: Bad page %p: ed=%p(%u), sd=%p,"
-                    " caf=%08x, taf=%" PRtype_info "\n", 
-                    (void *) page_to_mfn(page),
-                    d, d->domain_id, unpickle_domptr(_nd), x, 
-                    page->u.inuse.type_info);
-            spin_unlock(&d->page_alloc_lock);
-            return -1;
-        }
-        asm volatile (
-            LOCK_PREFIX "cmpxchg8b %2"
-            : "=d" (_nd), "=a" (y),
-            "=m" (*(volatile u64 *)(&page->count_info))
-            : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
-    } while (unlikely(_nd != _d) || unlikely(y != x));
+        if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
+            goto fail;
+        y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
+    } while ( y != x );
 
-    /*
-     * Unlink from 'd'. At least one reference remains (now anonymous), so 
-     * noone else is spinning to try to delete this page from 'd'.
-     */
+    /* Swizzle the owner then reinstate the PGC_allocated reference. */
+    page_set_owner(page, NULL);
+    y = page->count_info;
+    do {
+        x = y;
+        BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
+    } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
+
+    /* Unlink from original owner. */
     if ( !(memflags & MEMF_no_refcount) )
         d->tot_pages--;
-    list_del(&page->list);
+    page_list_del(page, &d->page_list);
 
     spin_unlock(&d->page_alloc_lock);
-
     return 0;
+
+ fail:
+    spin_unlock(&d->page_alloc_lock);
+    MEM_LOG("Bad page %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
+            (void *)page_to_mfn(page), d, d->domain_id,
+            page_get_owner(page), page->count_info, page->u.inuse.type_info);
+    return -1;
 }
 
 int do_update_va_mapping(unsigned long va, u64 val64,
@@ -3310,30 +3578,45 @@ int do_update_va_mapping(unsigned long va, u64 val64,
     l1_pgentry_t   val = l1e_from_intpte(val64);
     struct vcpu   *v   = current;
     struct domain *d   = v->domain;
+    struct page_info *gl1pg;
     l1_pgentry_t  *pl1e;
     unsigned long  vmask, bmap_ptr, gl1mfn;
     cpumask_t      pmask;
-    int            rc  = 0;
+    int            rc;
 
     perfc_incr(calls_to_update_va);
 
-    if ( unlikely(!access_ok(va, 1) && !paging_mode_external(d)) )
-        return -EINVAL;
-
     rc = xsm_update_va_mapping(d, FOREIGNDOM, val);
     if ( rc )
         return rc;
 
+    rc = -EINVAL;
     pl1e = guest_map_l1e(v, va, &gl1mfn);
+    if ( unlikely(!pl1e || !get_page_from_pagenr(gl1mfn, d)) )
+        goto out;
 
-    if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn, 0)) )
-        rc = -EINVAL;
+    gl1pg = mfn_to_page(gl1mfn);
+    if ( !page_lock(gl1pg) )
+    {
+        put_page(gl1pg);
+        goto out;
+    }
 
+    if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
+    {
+        page_unlock(gl1pg);
+        put_page(gl1pg);
+        goto out;
+    }
+
+    rc = mod_l1_entry(pl1e, val, gl1mfn, 0, v) ? 0 : -EINVAL;
+
+    page_unlock(gl1pg);
+    put_page(gl1pg);
+
+ out:
     if ( pl1e )
         guest_unmap_l1e(v, pl1e);
-    pl1e = NULL;
-
-    process_deferred_ops();
 
     switch ( flags & UVMF_FLUSHTYPE_MASK )
     {
@@ -3341,26 +3624,34 @@ int do_update_va_mapping(unsigned long va, u64 val64,
         switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
         {
         case UVMF_LOCAL:
-            flush_tlb_local();
+            this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
             break;
         case UVMF_ALL:
-            flush_tlb_mask(d->domain_dirty_cpumask);
+            this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
             break;
         default:
+            if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_ALL_TLBS )
+                break;
             if ( unlikely(!is_pv_32on64_domain(d) ?
                           get_user(vmask, (unsigned long *)bmap_ptr) :
                           get_user(vmask, (unsigned int *)bmap_ptr)) )
-                rc = -EFAULT;
+                rc = -EFAULT, vmask = 0;
             pmask = vcpumask_to_pcpumask(d, vmask);
+            if ( cpu_isset(smp_processor_id(), pmask) )
+                this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB;
             flush_tlb_mask(pmask);
             break;
         }
         break;
 
     case UVMF_INVLPG:
+        if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_ALL_TLBS )
+            break;
         switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
         {
         case UVMF_LOCAL:
+            if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_TLB )
+                break;
             if ( !paging_mode_enabled(d) ||
                  (paging_invlpg(v, va) != 0) ) 
                 flush_tlb_one_local(va);
@@ -3372,14 +3663,18 @@ int do_update_va_mapping(unsigned long va, u64 val64,
             if ( unlikely(!is_pv_32on64_domain(d) ?
                           get_user(vmask, (unsigned long *)bmap_ptr) :
                           get_user(vmask, (unsigned int *)bmap_ptr)) )
-                rc = -EFAULT;
+                rc = -EFAULT, vmask = 0;
             pmask = vcpumask_to_pcpumask(d, vmask);
+            if ( this_cpu(percpu_mm_info).deferred_ops & DOP_FLUSH_TLB )
+                cpu_clear(smp_processor_id(), pmask);
             flush_tlb_one_mask(pmask, va);
             break;
         }
         break;
     }
 
+    process_deferred_ops();
+
     return rc;
 }
 
@@ -3590,14 +3885,13 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
 
             spin_unlock(&d->grant_table->lock);
             break;
-        case XENMAPSPACE_mfn:
-        {
-            if ( get_page_from_pagenr(xatp.idx, d) ) {
-                mfn = xatp.idx;
-                page = mfn_to_page(mfn);
-            }
+        case XENMAPSPACE_gmfn:
+            xatp.idx = gmfn_to_mfn(d, xatp.idx);
+            if ( !get_page_from_pagenr(xatp.idx, d) )
+                break;
+            mfn = xatp.idx;
+            page = mfn_to_page(mfn);
             break;
-        }
         default:
             break;
         }
@@ -3642,39 +3936,6 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
         break;
     }
 
-    case XENMEM_remove_from_physmap:
-    {
-        struct xen_remove_from_physmap xrfp;
-        unsigned long mfn;
-        struct domain *d;
-
-        if ( copy_from_guest(&xrfp, arg, 1) )
-            return -EFAULT;
-
-        rc = rcu_lock_target_domain_by_id(xrfp.domid, &d);
-        if ( rc != 0 )
-            return rc;
-
-        if ( xsm_remove_from_physmap(current->domain, d) )
-        {
-            rcu_unlock_domain(d);
-            return -EPERM;
-        }
-
-        domain_lock(d);
-
-        mfn = gmfn_to_mfn(d, xrfp.gpfn);
-
-        if ( mfn_valid(mfn) )
-            guest_physmap_remove_page(d, xrfp.gpfn, mfn, 0);
-
-        domain_unlock(d);
-
-        rcu_unlock_domain(d);
-
-        break;
-    }
-
     case XENMEM_set_memory_map:
     {
         struct xen_foreign_memory_map fmap;
@@ -3773,6 +4034,49 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
         return 0;
     }
 
+    case XENMEM_set_pod_target:
+    case XENMEM_get_pod_target:
+    {
+        xen_pod_target_t target;
+        struct domain *d;
+
+        /* Support DOMID_SELF? */
+        if ( !IS_PRIV(current->domain) )
+            return -EINVAL;
+
+        if ( copy_from_guest(&target, arg, 1) )
+            return -EFAULT;
+
+        rc = rcu_lock_target_domain_by_id(target.domid, &d);
+        if ( rc != 0 )
+            return rc;
+
+        if ( op == XENMEM_set_pod_target )
+        {
+            if ( target.target_pages > d->max_pages )
+            {
+                rc = -EINVAL;
+                goto pod_target_out_unlock;
+            }
+            
+            rc = p2m_pod_set_mem_target(d, target.target_pages);
+        }
+
+        target.tot_pages       = d->tot_pages;
+        target.pod_cache_pages = d->arch.p2m->pod.count;
+        target.pod_entries     = d->arch.p2m->pod.entry_count;
+
+        if ( copy_to_guest(arg, &target, 1) )
+        {
+            rc= -EFAULT;
+            goto pod_target_out_unlock;
+        }
+        
+    pod_target_out_unlock:
+        rcu_unlock_domain(d);
+        return rc;
+    }
+
     default:
         return subarch_memory_op(op, arg);
     }
@@ -3999,16 +4303,26 @@ int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
 
     /* Attempt to read the PTE that maps the VA being accessed. */
     guest_get_eff_l1e(v, addr, &pte);
-    page = l1e_get_page(pte);
 
     /* We are looking only for read-only mappings of p.t. pages. */
     if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ||
-         !mfn_valid(l1e_get_pfn(pte)) ||
-         ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
-         ((page->u.inuse.type_info & PGT_count_mask) == 0) ||
-         (page_get_owner(page) != d) )
+         !get_page_from_pagenr(l1e_get_pfn(pte), d) )
         goto bail;
 
+    page = l1e_get_page(pte);
+    if ( !page_lock(page) )
+    {
+        put_page(page);
+        goto bail;
+    }
+
+    if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
+    {
+        page_unlock(page);
+        put_page(page);
+        goto bail;
+    }
+
     ptwr_ctxt.ctxt.regs = regs;
     ptwr_ctxt.ctxt.force_writeback = 0;
     ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size =
@@ -4016,9 +4330,11 @@ int ptwr_do_page_fault(struct vcpu *v, unsigned long addr,
     ptwr_ctxt.cr2 = addr;
     ptwr_ctxt.pte = pte;
 
-    page_lock(page);
     rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops);
+
     page_unlock(page);
+    put_page(page);
+
     if ( rc == X86EMUL_UNHANDLEABLE )
         goto bail;
 
@@ -4092,7 +4408,7 @@ int map_pages_to_xen(
                 {
                     if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
                         flush_flags |= FLUSH_TLB_GLOBAL;
-                    if ( (l1f_to_lNf(l3e_get_flags(ol3e)) ^ flags) &
+                    if ( (lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
                          PAGE_CACHE_ATTRS )
                         flush_flags |= FLUSH_CACHE;
                     flush_area(virt, flush_flags);
@@ -4495,12 +4811,18 @@ void __set_fixmap(
 void memguard_init(void)
 {
     unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20);
+#ifdef __i386__
     map_pages_to_xen(
         (unsigned long)__va(start),
         start >> PAGE_SHIFT,
         (xenheap_phys_end - start) >> PAGE_SHIFT,
         __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
-#ifdef __x86_64__
+#else
+    map_pages_to_xen(
+        (unsigned long)__va(start),
+        start >> PAGE_SHIFT,
+        (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT,
+        __PAGE_HYPERVISOR|MAP_SMALL_PAGES);
     BUG_ON(start != xen_phys_start);
     map_pages_to_xen(
         XEN_VIRT_START,
index 79b25962ac07b585b7f5d014fd884d6f3f56fcb4..4efde856e17a2907986de16f23a73d7b902f433e 100644 (file)
@@ -3,3 +3,9 @@ subdir-y += hap
 
 obj-y += paging.o
 obj-y += p2m.o
+obj-y += guest_walk_2.o
+obj-y += guest_walk_3.o
+obj-$(x86_64) += guest_walk_4.o
+
+guest_walk_%.o: guest_walk.c Makefile
+       $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@
diff --git a/xen/arch/x86/mm/guest_walk.c b/xen/arch/x86/mm/guest_walk.c
new file mode 100644 (file)
index 0000000..19f2393
--- /dev/null
@@ -0,0 +1,260 @@
+/******************************************************************************
+ * arch/x86/mm/guest_walk.c
+ *
+ * Pagetable walker for guest memory accesses.
+ *
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/paging.h>
+#include <xen/domain_page.h>
+#include <xen/sched.h>
+#include <asm/page.h>
+#include <asm/guest_pt.h>
+
+
+/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
+static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) 
+{
+    static uint32_t flags[] = {
+        /* I/F -  Usr Wr */
+        /* 0   0   0   0 */ _PAGE_PRESENT, 
+        /* 0   0   0   1 */ _PAGE_PRESENT|_PAGE_RW,
+        /* 0   0   1   0 */ _PAGE_PRESENT|_PAGE_USER,
+        /* 0   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
+        /* 0   1   0   0 */ _PAGE_PRESENT, 
+        /* 0   1   0   1 */ _PAGE_PRESENT|_PAGE_RW,
+        /* 0   1   1   0 */ _PAGE_PRESENT|_PAGE_USER,
+        /* 0   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
+        /* 1   0   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
+        /* 1   0   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
+        /* 1   0   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
+        /* 1   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
+        /* 1   1   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
+        /* 1   1   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
+        /* 1   1   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
+        /* 1   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
+    };
+
+    /* Don't demand not-NX if the CPU wouldn't enforce it. */
+    if ( !guest_supports_nx(v) )
+        pfec &= ~PFEC_insn_fetch;
+
+    /* Don't demand R/W if the CPU wouldn't enforce it. */
+    if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) 
+         && !(pfec & PFEC_user_mode) )
+        pfec &= ~PFEC_write_access;
+
+    return flags[(pfec & 0x1f) >> 1];
+}
+
+/* Modify a guest pagetable entry to set the Accessed and Dirty bits.
+ * Returns non-zero if it actually writes to guest memory. */
+static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
+{
+    guest_intpte_t old, new;
+
+    old = *(guest_intpte_t *)walk_p;
+    new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
+    if ( old != new ) 
+    {
+        /* Write the new entry into the walk, and try to write it back
+         * into the guest table as well.  If the guest table has changed
+         * under out feet then leave it alone. */
+        *(guest_intpte_t *)walk_p = new;
+        if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) 
+            return 1;
+    }
+    return 0;
+}
+
+
+/* Walk the guest pagetables, after the manner of a hardware walker. */
+uint32_t
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
+                  uint32_t pfec, mfn_t top_mfn, void *top_map)
+{
+    struct domain *d = v->domain;
+    p2m_type_t p2mt;
+    guest_l1e_t *l1p = NULL;
+    guest_l2e_t *l2p = NULL;
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    guest_l3e_t *l3p = NULL;
+    guest_l4e_t *l4p;
+#endif
+    uint32_t gflags, mflags, rc = 0;
+    int pse;
+
+    perfc_incr(guest_walk);
+    memset(gw, 0, sizeof(*gw));
+    gw->va = va;
+
+    /* Mandatory bits that must be set in every entry.  We invert NX, to
+     * calculate as if there were an "X" bit that allowed access. 
+     * We will accumulate, in rc, the set of flags that are missing. */
+    mflags = mandatory_flags(v, pfec);
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+
+    /* Get the l4e from the top level table and check its flags*/
+    gw->l4mfn = top_mfn;
+    l4p = (guest_l4e_t *) top_map;
+    gw->l4e = l4p[guest_l4_table_offset(va)];
+    gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
+    rc |= ((gflags & mflags) ^ mflags);
+    if ( rc & _PAGE_PRESENT ) goto out;
+
+    /* Map the l3 table */
+    gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
+    if ( !p2m_is_ram(p2mt) ) 
+    {
+        rc |= _PAGE_PRESENT;
+        goto out;
+    }
+    ASSERT(mfn_valid(mfn_x(gw->l3mfn)));
+
+    /* Get the l3e and check its flags*/
+    l3p = map_domain_page(mfn_x(gw->l3mfn));
+    gw->l3e = l3p[guest_l3_table_offset(va)];
+    gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
+    rc |= ((gflags & mflags) ^ mflags);
+    if ( rc & _PAGE_PRESENT )
+        goto out;
+
+#else /* PAE only... */
+
+    /* Get the l3e and check its flag */
+    gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)];
+    if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) 
+    {
+        rc |= _PAGE_PRESENT;
+        goto out;
+    }
+
+#endif /* PAE or 64... */
+
+    /* Map the l2 table */
+    gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
+    if ( !p2m_is_ram(p2mt) )
+    {
+        rc |= _PAGE_PRESENT;
+        goto out;
+    }
+    ASSERT(mfn_valid(mfn_x(gw->l2mfn)));
+
+    /* Get the l2e */
+    l2p = map_domain_page(mfn_x(gw->l2mfn));
+    gw->l2e = l2p[guest_l2_table_offset(va)];
+
+#else /* 32-bit only... */
+
+    /* Get l2e from the top level table */
+    gw->l2mfn = top_mfn;
+    l2p = (guest_l2e_t *) top_map;
+    gw->l2e = l2p[guest_l2_table_offset(va)];
+
+#endif /* All levels... */
+
+    gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
+    rc |= ((gflags & mflags) ^ mflags);
+    if ( rc & _PAGE_PRESENT )
+        goto out;
+
+    pse = (guest_supports_superpages(v) && 
+           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)); 
+
+    if ( pse )
+    {
+        /* Special case: this guest VA is in a PSE superpage, so there's
+         * no guest l1e.  We make one up so that the propagation code
+         * can generate a shadow l1 table.  Start with the gfn of the 
+         * first 4k-page of the superpage. */
+        gfn_t start = guest_l2e_get_gfn(gw->l2e);
+        /* Grant full access in the l1e, since all the guest entry's 
+         * access controls are enforced in the shadow l2e. */
+        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
+                     _PAGE_ACCESSED|_PAGE_DIRTY);
+        /* Import cache-control bits. Note that _PAGE_PAT is actually
+         * _PAGE_PSE, and it is always set. We will clear it in case
+         * _PAGE_PSE_PAT (bit 12, i.e. first bit of gfn) is clear. */
+        flags |= (guest_l2e_get_flags(gw->l2e)
+                  & (_PAGE_PAT|_PAGE_PWT|_PAGE_PCD));
+        if ( !(gfn_x(start) & 1) )
+            /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */
+            flags &= ~_PAGE_PAT;
+
+        /* Increment the pfn by the right number of 4k pages.  
+         * The ~0x1 is to mask out the PAT bit mentioned above. */
+        start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
+        gw->l1e = guest_l1e_from_gfn(start, flags);
+        gw->l1mfn = _mfn(INVALID_MFN);
+    } 
+    else 
+    {
+        /* Not a superpage: carry on and find the l1e. */
+        gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
+        if ( !p2m_is_ram(p2mt) )
+        {
+            rc |= _PAGE_PRESENT;
+            goto out;
+        }
+        ASSERT(mfn_valid(mfn_x(gw->l1mfn)));
+        l1p = map_domain_page(mfn_x(gw->l1mfn));
+        gw->l1e = l1p[guest_l1_table_offset(va)];
+        gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
+        rc |= ((gflags & mflags) ^ mflags);
+    }
+
+    /* Go back and set accessed and dirty bits only if the walk was a
+     * success.  Although the PRMs say higher-level _PAGE_ACCESSED bits
+     * get set whenever a lower-level PT is used, at least some hardware
+     * walkers behave this way. */
+    if ( rc == 0 ) 
+    {
+#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
+        if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
+            paging_mark_dirty(d, mfn_x(gw->l4mfn));
+        if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
+            paging_mark_dirty(d, mfn_x(gw->l3mfn));
+#endif
+        if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
+                         (pse && (pfec & PFEC_write_access))) )
+            paging_mark_dirty(d, mfn_x(gw->l2mfn));            
+        if ( !pse ) 
+        {
+            if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, 
+                             (pfec & PFEC_write_access)) )
+                paging_mark_dirty(d, mfn_x(gw->l1mfn));
+        }
+    }
+
+ out:
+#if GUEST_PAGING_LEVELS == 4
+    if ( l3p ) unmap_domain_page(l3p);
+#endif
+#if GUEST_PAGING_LEVELS >= 3
+    if ( l2p ) unmap_domain_page(l2p);
+#endif
+    if ( l1p ) unmap_domain_page(l1p);
+
+    return rc;
+}
index 64cb72786ef761a01795e3e447be79522c7e40c5..4261d053f5bfddc2437054c9dfa3f9c1ef53b981 100644 (file)
@@ -7,5 +7,5 @@ obj-y += p2m-ept.o
 guest_levels  = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1)))))
 guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1))
 
-guest_walk_%level.o: guest_walk.c $(HDRS) Makefile
+guest_walk_%level.o: guest_walk.c Makefile
        $(CC) $(CFLAGS) $(call guest_walk_defns,$(@F)) -c $< -o $@
index f1c54983d725f27ed47722baae8150fc18db7b05..425031508dd19b5fa0e216ab0eeb5ebc2304f21a 100644 (file)
  * Place - Suite 330, Boston, MA 02111-1307 USA.
  */
 
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/mm.h>
+
 #include <xen/domain_page.h>
-#include <asm/page.h>
-#include <xen/event.h>
+#include <xen/paging.h>
+#include <xen/config.h>
 #include <xen/sched.h>
-#include <asm/hvm/svm/vmcb.h>
-#include <asm/domain.h>
-#include <asm/paging.h>
-#include <asm/p2m.h>
-#include <asm/hap.h>
-
-#include "private.h"
 
 #define _hap_gva_to_gfn(levels) hap_gva_to_gfn_##levels##level
 #define hap_gva_to_gfn(levels) _hap_gva_to_gfn(levels)
 
-#if GUEST_PAGING_LEVELS > CONFIG_PAGING_LEVELS
+#if GUEST_PAGING_LEVELS <= CONFIG_PAGING_LEVELS
 
-unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
-    struct vcpu *v, unsigned long gva, uint32_t *pfec)
-{
-    gdprintk(XENLOG_ERR,
-             "Guest paging level is greater than host paging level!\n");
-    domain_crash(v->domain);
-    return INVALID_GFN;
-}
-
-#else
-
-#if GUEST_PAGING_LEVELS == 2
-#include "../page-guest32.h"
-#define l1_pgentry_t l1_pgentry_32_t
-#define l2_pgentry_t l2_pgentry_32_t
-#undef l2e_get_flags
-#define l2e_get_flags(x) l2e_get_flags_32(x)
-#undef l1e_get_flags
-#define l1e_get_flags(x) l1e_get_flags_32(x)
-#endif
+#include <asm/guest_pt.h>
 
 unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
     struct vcpu *v, unsigned long gva, uint32_t *pfec)
 {
-    unsigned long gcr3 = v->arch.hvm_vcpu.guest_cr[3];
-    int mode = GUEST_PAGING_LEVELS;
-    int lev, index;
-    paddr_t gpa = 0;
-    unsigned long gpfn, mfn;
+    unsigned long cr3;
+    uint32_t missing;
+    mfn_t top_mfn;
+    void *top_map;
     p2m_type_t p2mt;
-    int success = 1;
+    walk_t gw;
 
-    l1_pgentry_t *l1e;
-    l2_pgentry_t *l2e;
-#if GUEST_PAGING_LEVELS >= 3
-    l3_pgentry_t *l3e;
-#endif
-#if GUEST_PAGING_LEVELS >= 4
-    l4_pgentry_t *l4e;
-#endif
-
-    gpfn = (gcr3 >> PAGE_SHIFT);
-    for ( lev = mode; lev >= 1; lev-- )
+    /* Get the top-level table's MFN */
+    cr3 = v->arch.hvm_vcpu.guest_cr[3];
+    top_mfn = gfn_to_mfn(v->domain, _gfn(cr3 >> PAGE_SHIFT), &p2mt);
+    if ( !p2m_is_ram(p2mt) )
     {
-        mfn = mfn_x(gfn_to_mfn_current(gpfn, &p2mt));
-        if ( !p2m_is_ram(p2mt) )
-        {
-            HAP_PRINTK("bad pfn=0x%lx from gva=0x%lx at lev%d\n", gpfn, gva,
-                       lev);
-            success = 0;
-            break;
-        }
-        ASSERT(mfn_valid(mfn));
-
-        index = (gva >> PT_SHIFT[mode][lev]) & (PT_ENTRIES[mode][lev]-1);
-
-#if GUEST_PAGING_LEVELS >= 4
-        if ( lev == 4 )
-        {
-            l4e = map_domain_page(mfn);
-            if ( !(l4e_get_flags(l4e[index]) & _PAGE_PRESENT) )
-            {
-                HAP_PRINTK("Level 4 entry not present at index = %d\n", index);
-                success = 0;
-            }
-            gpfn = l4e_get_pfn(l4e[index]);
-            unmap_domain_page(l4e);
-        }
-#endif
+        pfec[0] &= ~PFEC_page_present;
+        return INVALID_GFN;
+    }
 
-#if GUEST_PAGING_LEVELS >= 3
-        if ( lev == 3 )
-        {
-            l3e = map_domain_page(mfn);
+    /* Map the top-level table and call the tree-walker */
+    ASSERT(mfn_valid(mfn_x(top_mfn)));
+    top_map = map_domain_page(mfn_x(top_mfn));
 #if GUEST_PAGING_LEVELS == 3
-            index += ((gcr3 >> 5) & 127) * 4;
-#endif
-            if ( !(l3e_get_flags(l3e[index]) & _PAGE_PRESENT) )
-            {
-                HAP_PRINTK("Level 3 entry not present at index = %d\n", index);
-                success = 0;
-            }
-            gpfn = l3e_get_pfn(l3e[index]);
-            unmap_domain_page(l3e);
-        }
+    top_map += (cr3 & ~(PAGE_MASK | 31));
 #endif
+    missing = guest_walk_tables(v, gva, &gw, pfec[0], top_mfn, top_map);
+    unmap_domain_page(top_map);
+
+    /* Interpret the answer */
+    if ( missing == 0 ) 
+        return gfn_x(guest_l1e_get_gfn(gw.l1e));
+    
+    if ( missing & _PAGE_PRESENT )
+        pfec[0] &= ~PFEC_page_present;
+    
+    return INVALID_GFN;
+}
 
-        if ( lev == 2 )
-        {
-            l2e = map_domain_page(mfn);
-            if ( !(l2e_get_flags(l2e[index]) & _PAGE_PRESENT) )
-            {
-                HAP_PRINTK("Level 2 entry not present at index = %d\n", index);
-                success = 0;
-            }
-
-            if ( l2e_get_flags(l2e[index]) & _PAGE_PSE )
-            {
-                paddr_t mask = ((paddr_t)1 << PT_SHIFT[mode][2]) - 1;
-                HAP_PRINTK("guest page table is PSE\n");
-                gpa = (l2e_get_intpte(l2e[index]) & ~mask) + (gva & mask);
-                unmap_domain_page(l2e);
-                break; /* last level page table, jump out from here */
-            }
-
-            gpfn = l2e_get_pfn(l2e[index]);
-            unmap_domain_page(l2e);
-        }
-
-        if ( lev == 1 )
-        {
-            l1e = map_domain_page(mfn);
-            if ( !(l1e_get_flags(l1e[index]) & _PAGE_PRESENT) )
-            {
-                HAP_PRINTK("Level 1 entry not present at index = %d\n", index);
-                success = 0;
-            }
-            gpfn = l1e_get_pfn(l1e[index]);
-            gpa = (l1e_get_intpte(l1e[index]) & PAGE_MASK) + (gva &~PAGE_MASK);
-            unmap_domain_page(l1e);
-        }
-
-        if ( success != 1 ) /* error happened, jump out */
-            break;
-    }
-
-    gpa &= PADDR_MASK;
-    HAP_PRINTK("success = %d, gva = %lx, gpa = %lx\n", success, gva, gpa);
+#else
 
-    return (!success ? INVALID_GFN : ((paddr_t)gpa >> PAGE_SHIFT));
+unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
+    struct vcpu *v, unsigned long gva, uint32_t *pfec)
+{
+    gdprintk(XENLOG_ERR,
+             "Guest paging level is greater than host paging level!\n");
+    domain_crash(v->domain);
+    return INVALID_GFN;
 }
 
 #endif
 
+
 /*
  * Local variables:
  * mode: C
index 2556a6ea9e531ea37a3a2e4abed6a8d7f5718346..b7615f0d694ca31350c3d9e4767b0f8d8697bde2 100644 (file)
 
 /* Override macros from asm/page.h to make them work with mfn_t */
 #undef mfn_to_page
-#define mfn_to_page(_m) (frame_table + mfn_x(_m))
+#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
 #undef mfn_valid
-#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page)
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
 #undef page_to_mfn
-#define page_to_mfn(_pg) (_mfn((_pg) - frame_table))
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
 
 /************************************************/
 /*            HAP LOG DIRTY SUPPORT             */
@@ -96,11 +96,10 @@ static struct page_info *hap_alloc(struct domain *d)
 
     ASSERT(hap_locked_by_me(d));
 
-    if ( unlikely(list_empty(&d->arch.paging.hap.freelist)) )
+    pg = page_list_remove_head(&d->arch.paging.hap.freelist);
+    if ( unlikely(!pg) )
         return NULL;
 
-    pg = list_entry(d->arch.paging.hap.freelist.next, struct page_info, list);
-    list_del(&pg->list);
     d->arch.paging.hap.free_pages--;
 
     p = hap_map_domain_page(page_to_mfn(pg));
@@ -118,7 +117,7 @@ static void hap_free(struct domain *d, mfn_t mfn)
     ASSERT(hap_locked_by_me(d));
 
     d->arch.paging.hap.free_pages++;
-    list_add_tail(&pg->list, &d->arch.paging.hap.freelist);
+    page_list_add_tail(pg, &d->arch.paging.hap.freelist);
 }
 
 static struct page_info *hap_alloc_p2m_page(struct domain *d)
@@ -153,7 +152,7 @@ static struct page_info *hap_alloc_p2m_page(struct domain *d)
         d->arch.paging.hap.total_pages--;
         d->arch.paging.hap.p2m_pages++;
         page_set_owner(pg, d);
-        pg->count_info = 1;
+        pg->count_info |= 1;
     }
 
     hap_unlock(d);
@@ -166,9 +165,9 @@ void hap_free_p2m_page(struct domain *d, struct page_info *pg)
     ASSERT(page_get_owner(pg) == d);
     /* Should have just the one ref we gave it in alloc_p2m_page() */
     if ( (pg->count_info & PGC_count_mask) != 1 )
-        HAP_ERROR("Odd p2m page count c=%#x t=%"PRtype_info"\n",
+        HAP_ERROR("Odd p2m page count c=%#lx t=%"PRtype_info"\n",
                   pg->count_info, pg->u.inuse.type_info);
-    pg->count_info = 0;
+    pg->count_info &= ~PGC_count_mask;
     /* Free should not decrement domain's total allocation, since
      * these pages were allocated without an owner. */
     page_set_owner(pg, NULL);
@@ -210,18 +209,15 @@ hap_set_allocation(struct domain *d, unsigned int pages, int *preempted)
             }
             d->arch.paging.hap.free_pages++;
             d->arch.paging.hap.total_pages++;
-            list_add_tail(&pg->list, &d->arch.paging.hap.freelist);
+            page_list_add_tail(pg, &d->arch.paging.hap.freelist);
         }
         else if ( d->arch.paging.hap.total_pages > pages )
         {
             /* Need to return memory to domheap */
-            ASSERT(!list_empty(&d->arch.paging.hap.freelist));
-            pg = list_entry(d->arch.paging.hap.freelist.next,
-                            struct page_info, list);
-            list_del(&pg->list);
+            pg = page_list_remove_head(&d->arch.paging.hap.freelist);
+            ASSERT(pg);
             d->arch.paging.hap.free_pages--;
             d->arch.paging.hap.total_pages--;
-            pg->count_info = 0;
             free_domheap_page(pg);
         }
 
@@ -393,7 +389,7 @@ static void hap_destroy_monitor_table(struct vcpu* v, mfn_t mmfn)
 void hap_domain_init(struct domain *d)
 {
     hap_lock_init(d);
-    INIT_LIST_HEAD(&d->arch.paging.hap.freelist);
+    INIT_PAGE_LIST_HEAD(&d->arch.paging.hap.freelist);
 
     /* This domain will use HAP for log-dirty mode */
     paging_log_dirty_init(d, hap_enable_log_dirty, hap_disable_log_dirty,
@@ -639,9 +635,16 @@ static void
 hap_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p,
                     mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
 {
+    uint32_t old_flags;
+
     hap_lock(v->domain);
 
+    old_flags = l1e_get_flags(*p);
     safe_write_pte(p, new);
+    if ( (old_flags & _PAGE_PRESENT)
+         && (level == 1 || (level == 2 && (old_flags & _PAGE_PSE))) )
+             flush_tlb_mask(v->domain->domain_dirty_cpumask);
+
 #if CONFIG_PAGING_LEVELS == 3
     /* install P2M in monitor table for PAE Xen */
     if ( level == 3 )
index 8ed9c08f1fde8c8498e7f21c029857b3794d2324..bb1e8ee643a986b691bffae1ff3ec82b1cd365af 100644 (file)
@@ -63,9 +63,10 @@ static int ept_set_middle_entry(struct domain *d, ept_entry_t *ept_entry)
 
     pg->count_info = 1;
     pg->u.inuse.type_info = 1 | PGT_validated;
-    list_add_tail(&pg->list, &d->arch.p2m->pages);
+    page_list_add_tail(pg, &d->arch.p2m->pages);
 
     ept_entry->emt = 0;
+    ept_entry->igmt = 0;
     ept_entry->sp_avail = 0;
     ept_entry->avail1 = 0;
     ept_entry->mfn = page_to_mfn(pg);
@@ -114,6 +115,10 @@ static int ept_next_level(struct domain *d, bool_t read_only,
     }
 }
 
+/*
+ * ept_set_entry() computes 'need_modify_vtd_table' for itself,
+ * by observing whether any gfn->mfn translations are modified.
+ */
 static int
 ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
               unsigned int order, p2m_type_t p2mt)
@@ -124,6 +129,9 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
     u32 index;
     int i, rv = 0, ret = 0;
     int walk_level = order / EPT_TABLE_ORDER;
+    int direct_mmio = (p2mt == p2m_mmio_direct);
+    uint8_t igmt = 0;
+    int need_modify_vtd_table = 1;
 
     /* we only support 4k and 2m pages now */
 
@@ -157,22 +165,30 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
     {
         if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
         {
-            /* Track the highest gfn for which we have ever had a valid mapping */
-            if ( gfn > d->arch.p2m->max_mapped_pfn )
-                d->arch.p2m->max_mapped_pfn = gfn;
-            ept_entry->emt = epte_get_entry_emt(d, gfn, mfn_x(mfn));
+            ept_entry->emt = epte_get_entry_emt(d, gfn, mfn_x(mfn),
+                                &igmt, direct_mmio);
+            ept_entry->igmt = igmt;
             ept_entry->sp_avail = walk_level ? 1 : 0;
 
             if ( ret == GUEST_TABLE_SUPER_PAGE )
             {
-                ept_entry->mfn = mfn_x(mfn) - offset;
+                if ( ept_entry->mfn == (mfn_x(mfn) - offset) )
+                    need_modify_vtd_table = 0;  
+                else                  
+                    ept_entry->mfn = mfn_x(mfn) - offset;
+
                 if ( ept_entry->avail1 == p2m_ram_logdirty &&
                   p2mt == p2m_ram_rw )
                     for ( i = 0; i < 512; i++ )
                         paging_mark_dirty(d, mfn_x(mfn)-offset+i);
             }
             else
-                ept_entry->mfn = mfn_x(mfn);
+            {
+                if ( ept_entry->mfn == mfn_x(mfn) )
+                    need_modify_vtd_table = 0;
+                else
+                    ept_entry->mfn = mfn_x(mfn);
+            }
 
             ept_entry->avail1 = p2mt;
             ept_entry->rsvd = 0;
@@ -211,7 +227,10 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
         {
             split_ept_entry = split_table + i;
             split_ept_entry->emt = epte_get_entry_emt(d,
-                                        gfn-offset+i, split_mfn+i);
+                                        gfn-offset+i, split_mfn+i, 
+                                        &igmt, direct_mmio);
+            split_ept_entry->igmt = igmt;
+
             split_ept_entry->sp_avail =  0;
 
             split_ept_entry->mfn = split_mfn+i;
@@ -226,14 +245,25 @@ ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
 
         /* Set the destinated 4k page as normal */
         split_ept_entry = split_table + offset;
-        split_ept_entry->emt = epte_get_entry_emt(d, gfn, mfn_x(mfn));
-        split_ept_entry->mfn = mfn_x(mfn);
+        split_ept_entry->emt = epte_get_entry_emt(d, gfn, mfn_x(mfn), 
+                                                &igmt, direct_mmio);
+        split_ept_entry->igmt = igmt;
+
+        if ( split_ept_entry->mfn == mfn_x(mfn) )
+            need_modify_vtd_table = 0;
+        else
+            split_ept_entry->mfn = mfn_x(mfn);
         split_ept_entry->avail1 = p2mt;
         ept_p2m_type_to_flags(split_ept_entry, p2mt);
 
         unmap_domain_page(split_table);
     }
 
+    /* Track the highest gfn for which we have ever had a valid mapping */
+    if ( mfn_valid(mfn_x(mfn))
+         && (gfn + (1UL << order) - 1 > d->arch.p2m->max_mapped_pfn) )
+        d->arch.p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
+
     /* Success */
     rv = 1;
 
@@ -244,7 +274,8 @@ out:
 
     /* Now the p2m table is not shared with vt-d page table */
 
-    if ( iommu_enabled && is_hvm_domain(d) )
+    if ( iommu_enabled && is_hvm_domain(d)  
+             && need_modify_vtd_table )
     {
         if ( p2mt == p2m_ram_rw )
         {
@@ -272,7 +303,8 @@ out:
 }
 
 /* Read ept p2m entries */
-static mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t)
+static mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t,
+    p2m_query_t q)
 {
     ept_entry_t *table =
         map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
@@ -357,9 +389,25 @@ static uint64_t ept_get_entry_content(struct domain *d, unsigned long gfn)
     return content;
 }
 
-static mfn_t ept_get_entry_current(unsigned long gfn, p2m_type_t *t)
+static mfn_t ept_get_entry_current(unsigned long gfn, p2m_type_t *t,
+                                   p2m_query_t q)
 {
-    return ept_get_entry(current->domain, gfn, t);
+    return ept_get_entry(current->domain, gfn, t, q);
+}
+
+/* Check whether the EMT/IGMT that would now be computed for this
+ * (gfn, mfn) pair match the old values; return 0 if so (the EPT entry
+ * need not be reset), 1 if the entry must be rewritten.
+ */
+static int need_modify_ept_entry(struct domain *d, unsigned long gfn,
+                                    unsigned long mfn, uint8_t o_igmt,
+                                    uint8_t o_emt, p2m_type_t p2mt)
+{
+    uint8_t igmt, emt;
+    emt = epte_get_entry_emt(d, gfn, mfn, &igmt, 
+                                (p2mt == p2m_mmio_direct));
+    if ( (emt == o_emt) && (igmt == o_igmt) )
+        return 0;
+    return 1; 
 }
 
 void ept_change_entry_emt_with_range(struct domain *d, unsigned long start_gfn,
@@ -370,6 +418,7 @@ void ept_change_entry_emt_with_range(struct domain *d, unsigned long start_gfn,
     uint64_t epte;
     int order = 0;
     unsigned long mfn;
+    uint8_t o_igmt, o_emt;
 
     for ( gfn = start_gfn; gfn <= end_gfn; gfn++ )
     {
@@ -379,7 +428,9 @@ void ept_change_entry_emt_with_range(struct domain *d, unsigned long start_gfn,
         mfn = (epte & EPTE_MFN_MASK) >> PAGE_SHIFT;
         if ( !mfn_valid(mfn) )
             continue;
-        p2mt = (epte & EPTE_AVAIL1_MASK) >> 8;
+        p2mt = (epte & EPTE_AVAIL1_MASK) >> EPTE_AVAIL1_SHIFT;
+        o_igmt = (epte & EPTE_IGMT_MASK) >> EPTE_IGMT_SHIFT;
+        o_emt = (epte & EPTE_EMT_MASK) >> EPTE_EMT_SHIFT;
         order = 0;
 
         if ( epte & EPTE_SUPER_PAGE_MASK )
@@ -391,18 +442,26 @@ void ept_change_entry_emt_with_range(struct domain *d, unsigned long start_gfn,
                  * Set emt for super page.
                  */
                 order = EPT_TABLE_ORDER;
-                ept_set_entry(d, gfn, _mfn(mfn), order, p2mt);
+                if ( need_modify_ept_entry(d, gfn, mfn, 
+                                            o_igmt, o_emt, p2mt) )
+                    ept_set_entry(d, gfn, _mfn(mfn), order, p2mt);
                 gfn += 0x1FF;
             }
             else
             {
-                /* change emt for partial entries of the 2m area */
-                ept_set_entry(d, gfn, _mfn(mfn), order, p2mt);
+                /* change emt for partial entries of the 2m area. */
+                if ( need_modify_ept_entry(d, gfn, mfn, 
+                                            o_igmt, o_emt, p2mt) )
+                    ept_set_entry(d, gfn, _mfn(mfn), order, p2mt);
                 gfn = ((gfn >> EPT_TABLE_ORDER) << EPT_TABLE_ORDER) + 0x1FF;
             }
         }
         else /* gfn assigned with 4k */
-            ept_set_entry(d, gfn, _mfn(mfn), order, p2mt);
+        {
+            if ( need_modify_ept_entry(d, gfn, mfn, 
+                                            o_igmt, o_emt, p2mt) )
+                ept_set_entry(d, gfn, _mfn(mfn), order, p2mt);
+        }
     }
 }
 
index 00bed88db47163331c0421174c2000bb2dd1adf7..7b06e7df6340afa8d43028d1f6495db873ef85e4 100644 (file)
@@ -20,9 +20,6 @@
 #ifndef __HAP_PRIVATE_H__
 #define __HAP_PRIVATE_H__
 
-#include <asm/flushtlb.h>
-#include <asm/hvm/support.h>
-
 /********************************************/
 /*          GUEST TRANSLATION FUNCS         */
 /********************************************/
@@ -33,36 +30,5 @@ unsigned long hap_gva_to_gfn_3level(struct vcpu *v, unsigned long gva,
 unsigned long hap_gva_to_gfn_4level(struct vcpu *v, unsigned long gva,
                                     uint32_t *pfec);
 
-/********************************************/
-/*            MISC DEFINITIONS              */
-/********************************************/
-
-/* PT_SHIFT describes the amount by which a virtual address is shifted right 
- * to right justify the portion to be used for indexing into a page 
- * table, given the guest memory model (i.e. number of levels) and the level 
- * of the page table being accessed. The idea is from Virtual Iron's code.
- */
-static const int PT_SHIFT[][5] =
-  {   /*     ------  level ------           nr_levels  */
-    /*         1     2     3     4                   */
-    {    0,    0,    0,    0,    0},   /* 0 not used */
-    {    0,    0,    0,    0,    0},   /* 1 not used */
-    {    0,   12,   22,    0,    0},   /* 2  */
-    {    0,   12,   21,   30,    0},   /* 3  */
-    {    0,   12,   21,   30,   39}    /* 4  */
-  };
-
-/* PT_ENTRIES describes the number of entries in a page table, given the 
- * memory model (i.e. number of levels) and the level of the page table 
- * being considered. This idea from Virtual Iron's shadow code*/
-static const int PT_ENTRIES[][5] =
-  {   /*     ------  level ------           nr_levels  */
-    /*         1     2     3     4                   */
-    {    0,    0,    0,    0,    0},   /* 0 not used */
-    {    0,    0,    0,    0,    0},   /* 1 not used */
-    {    0, 1024, 1024,    0,    0},   /* 2  */
-    {    0,  512,  512,    4,    0},   /* 3  */
-    {    0,  512,  512,  512,  512}    /* 4  */
-  };
 
 #endif /* __SVM_NPT_H__ */
index 15d5297371b79ef9073c15ad09c9bc6662b36236..296eb44e0753b36fb28052d496316aab9cfd5fdc 100644 (file)
 
 /* Override macros from asm/page.h to make them work with mfn_t */
 #undef mfn_to_page
-#define mfn_to_page(_m) (frame_table + mfn_x(_m))
+#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
 #undef mfn_valid
-#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page)
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
 #undef page_to_mfn
-#define page_to_mfn(_pg) (_mfn((_pg) - frame_table))
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
 
 
 /* PTE flags for the various types of p2m entry */
@@ -118,9 +118,16 @@ static unsigned long p2m_type_to_flags(p2m_type_t t)
         return flags;
     case p2m_mmio_direct:
         return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_PCD;
+    case p2m_populate_on_demand:
+        return flags;
     }
 }
 
+#if P2M_AUDIT
+static void audit_p2m(struct domain *d);
+#else
+# define audit_p2m(_d) do { (void)(_d); } while(0)
+#endif /* P2M_AUDIT */
 
 // Find the next level's P2M entry, checking for out-of-range gfn's...
 // Returns NULL on error.
@@ -162,14 +169,15 @@ p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
                                       shift, max)) )
         return 0;
 
-    if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
+    /* PoD: Not present doesn't imply empty. */
+    if ( !l1e_get_flags(*p2m_entry) )
     {
         struct page_info *pg = d->arch.p2m->alloc_page(d);
         if ( pg == NULL )
             return 0;
-        list_add_tail(&pg->list, &d->arch.p2m->pages);
+        page_list_add_tail(pg, &d->arch.p2m->pages);
         pg->u.inuse.type_info = type | 1 | PGT_validated;
-        pg->count_info = 1;
+        pg->count_info |= 1;
 
         new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
                                  __PAGE_HYPERVISOR|_PAGE_USER);
@@ -197,7 +205,7 @@ p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
         }
     }
 
-    ASSERT(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT);
+    ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
 
     /* split single large page into 4KB page in P2M table */
     if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
@@ -206,9 +214,9 @@ p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
         struct page_info *pg = d->arch.p2m->alloc_page(d);
         if ( pg == NULL )
             return 0;
-        list_add_tail(&pg->list, &d->arch.p2m->pages);
+        page_list_add_tail(pg, &d->arch.p2m->pages);
         pg->u.inuse.type_info = PGT_l1_page_table | 1 | PGT_validated;
-        pg->count_info = 1;
+        pg->count_info |= 1;
         
         /* New splintered mappings inherit the flags of the old superpage, 
          * with a little reorganisation for the _PAGE_PSE_PAT bit. */
@@ -242,6 +250,860 @@ p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
     return 1;
 }
 
+/*
+ * Populate-on-demand functionality
+ */
+static
+int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+                  unsigned int page_order, p2m_type_t p2mt);
+
+int
+p2m_pod_cache_add(struct domain *d,
+                  struct page_info *page,
+                  unsigned long order)
+{
+    int i;
+    struct page_info *p;
+    struct p2m_domain *p2md = d->arch.p2m;
+
+#ifndef NDEBUG
+    mfn_t mfn;
+
+    mfn = page_to_mfn(page);
+
+    /* Check to make sure this is a contiguous region */
+    if( mfn_x(mfn) & ((1 << order) - 1) )
+    {
+        printk("%s: mfn %lx not aligned order %lu! (mask %lx)\n",
+               __func__, mfn_x(mfn), order, ((1UL << order) - 1));
+        return -1;
+    }
+    
+    for(i=0; i < 1 << order ; i++) {
+        struct domain * od;
+
+        p = mfn_to_page(_mfn(mfn_x(mfn) + i));
+        od = page_get_owner(p);
+        if(od != d)
+        {
+            printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
+                   __func__, mfn_x(mfn), d->domain_id,
+                   od?od->domain_id:-1);
+            return -1;
+        }
+    }
+#endif
+
+    spin_lock(&d->page_alloc_lock);
+
+    /* First, take all pages off the domain list */
+    for(i=0; i < 1 << order ; i++)
+    {
+        p = page + i;
+        page_list_del(p, &d->page_list);
+    }
+
+    /* Then add the first one to the appropriate populate-on-demand list */
+    switch(order)
+    {
+    case 9:
+        page_list_add_tail(page, &p2md->pod.super); /* lock: page_alloc */
+        p2md->pod.count += 1 << order;
+        break;
+    case 0:
+        page_list_add_tail(page, &p2md->pod.single); /* lock: page_alloc */
+        p2md->pod.count += 1 ;
+        break;
+    default:
+        BUG();
+    }
+
+    spin_unlock(&d->page_alloc_lock);
+
+    return 0;
+}
+
+/* Get a page of size order from the populate-on-demand cache.  Will break
+ * down 2-meg pages into singleton pages automatically.  Returns null if
+ * a superpage is requested and no superpages are available.  Must be called
+ * with the d->page_lock held. */
+static struct page_info * p2m_pod_cache_get(struct domain *d,
+                                            unsigned long order)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    struct page_info *p = NULL;
+    int i;
+
+    if ( order == 9 && page_list_empty(&p2md->pod.super) )
+    {
+        return NULL;
+    }
+    else if ( order == 0 && page_list_empty(&p2md->pod.single) )
+    {
+        unsigned long mfn;
+        struct page_info *q;
+
+        BUG_ON( page_list_empty(&p2md->pod.super) );
+
+        /* Break up a superpage to make single pages. NB count doesn't
+         * need to be adjusted. */
+        printk("%s: Breaking up superpage.\n", __func__);
+        p = page_list_remove_head(&p2md->pod.super);
+        mfn = mfn_x(page_to_mfn(p));
+
+        for ( i=0; i<(1<<9); i++ )
+        {
+            q = mfn_to_page(_mfn(mfn+i));
+            page_list_add_tail(q, &p2md->pod.single);
+        }
+    }
+
+    switch ( order )
+    {
+    case 9:
+        BUG_ON( page_list_empty(&p2md->pod.super) );
+        p = page_list_remove_head(&p2md->pod.super);
+        p2md->pod.count -= 1 << order; /* Lock: page_alloc */
+        break;
+    case 0:
+        BUG_ON( page_list_empty(&p2md->pod.single) );
+        p = page_list_remove_head(&p2md->pod.single);
+        p2md->pod.count -= 1;
+        break;
+    default:
+        BUG();
+    }
+
+    /* Put the pages back on the domain page_list */
+    for ( i = 0 ; i < (1 << order) ; i++ )
+    {
+        BUG_ON(page_get_owner(p + i) != d);
+        page_list_add_tail(p + i, &d->page_list);
+    }
+
+    return p;
+}
+
+/* Set the size of the cache, allocating or freeing as necessary. */
+static int
+p2m_pod_set_cache_target(struct domain *d, unsigned long pod_target)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    int ret = 0;
+
+    /* Increasing the target */
+    while ( pod_target > p2md->pod.count )
+    {
+        struct page_info * page;
+        int order;
+
+        if ( (pod_target - p2md->pod.count) >= (1>>9) )
+            order = 9;
+        else
+            order = 0;
+
+        page = alloc_domheap_pages(d, order, 0);
+        if ( unlikely(page == NULL) )
+            goto out;
+
+        p2m_pod_cache_add(d, page, order);
+    }
+
+    /* Decreasing the target */
+    /* We hold the p2m lock here, so we don't need to worry about
+     * cache disappearing under our feet. */
+    while ( pod_target < p2md->pod.count )
+    {
+        struct page_info * page;
+        int order, i;
+
+        /* Grab the lock before checking that pod.super is empty, or the last
+         * entries may disappear before we grab the lock. */
+        spin_lock(&d->page_alloc_lock);
+
+        if ( (p2md->pod.count - pod_target) > (1>>9)
+             && !page_list_empty(&p2md->pod.super) )
+            order = 9;
+        else
+            order = 0;
+
+        page = p2m_pod_cache_get(d, order);
+
+        ASSERT(page != NULL);
+
+        spin_unlock(&d->page_alloc_lock);
+
+        /* Then free them */
+        for ( i = 0 ; i < (1 << order) ; i++ )
+        {
+            /* Copied from common/memory.c:guest_remove_page() */
+            if ( unlikely(!get_page(page+i, d)) )
+            {
+                gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
+                ret = -EINVAL;
+                goto out;
+            }
+
+            if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
+                put_page_and_type(page+i);
+            
+            if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
+                put_page(page+i);
+
+            put_page(page+i);
+        }
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * The "right behavior" here requires some careful thought.  First, some
+ * definitions:
+ * + M: static_max
+ * + B: number of pages the balloon driver has ballooned down to.
+ * + P: Number of populated pages. 
+ * + T: Old target
+ * + T': New target
+ *
+ * The following equations should hold:
+ *  0 <= P <= T <= B <= M
+ *  d->arch.p2m->pod.entry_count == B - P
+ *  d->tot_pages == P + d->arch.p2m->pod.count
+ *
+ * Now we have the following potential cases to cover:
+ *     B <T': Set the PoD cache size equal to the number of outstanding PoD
+ *   entries.  The balloon driver will deflate the balloon to give back
+ *   the remainder of the ram to the guest OS.
+ *  T <T'<B : Increase PoD cache size.
+ *  T'<T<=B : Here we have a choice.  We can decrease the size of the cache,
+ *   get the memory right away.  However, that means every time we 
+ *   reduce the memory target we risk the guest attempting to populate the 
+ *   memory before the balloon driver has reached its new target.  Safer to
+ *   never reduce the cache size here, but only when the balloon driver frees 
+ *   PoD ranges.
+ *
+ * If there are many zero pages, we could reach the target also by doing
+ * zero sweeps and marking the ranges PoD; but the balloon driver will have
+ * to free this memory eventually anyway, so we don't actually gain that much
+ * by doing so.
+ *
+ * NB that the equation (B<T') may require adjustment to the cache
+ * size as PoD pages are freed as well; i.e., freeing a PoD-backed
+ * entry when pod.entry_count == pod.count requires us to reduce both
+ * pod.entry_count and pod.count.
+ */
+/* Adjust the size of the PoD cache in response to a new memory target
+ * for the domain.  Implements the case analysis in the comment block
+ * above: the cache is only ever grown or capped here; shrinking is
+ * left to the balloon driver.  Returns 0, or the error from
+ * p2m_pod_set_cache_target(). */
+int
+p2m_pod_set_mem_target(struct domain *d, unsigned long target)
+{
+    unsigned pod_target;
+    struct p2m_domain *p2md = d->arch.p2m;
+    int ret = 0;
+    unsigned long populated;
+
+    /* P == B: Nothing to do. */
+    if ( p2md->pod.entry_count == 0 )
+        goto out;
+
+    /* T' < B: Don't reduce the cache size; let the balloon driver
+     * take care of it. */
+    if ( target < d->tot_pages )
+        goto out;
+
+    /* P: pages the guest has actually populated (tot_pages minus the
+     * pages currently sitting in the PoD cache). */
+    populated  = d->tot_pages - p2md->pod.count;
+
+    pod_target = target - populated;
+
+    /* B < T': Set the cache size equal to # of outstanding entries,
+     * let the balloon driver fill in the rest. */
+    if ( pod_target > p2md->pod.entry_count )
+        pod_target = p2md->pod.entry_count;
+
+    /* NOTE(review): strict '>' — if pod_target can legitimately equal
+     * pod.count (i.e. no change needed), this assertion fires; confirm
+     * whether '>=' is intended. */
+    ASSERT( pod_target > p2md->pod.count );
+
+    ret = p2m_pod_set_cache_target(d, pod_target);
+
+out:
+    return ret;
+}
+
+/* Return every page held in the PoD cache (both the 2-meg superpage
+ * list and the singleton list) to the domain's main page list, so they
+ * can be freed through the normal path.  Takes d->page_alloc_lock;
+ * pod.count must reach exactly zero once both lists are drained. */
+void
+p2m_pod_empty_cache(struct domain *d)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    struct page_info *page;
+
+    spin_lock(&d->page_alloc_lock);
+
+    /* Drain the superpage list; each entry covers 2^9 (512) contiguous
+     * 4k pages. */
+    while ( (page = page_list_remove_head(&p2md->pod.super)) )
+    {
+        int i;
+            
+        for ( i = 0 ; i < (1 << 9) ; i++ )
+        {
+            BUG_ON(page_get_owner(page + i) != d);
+            page_list_add_tail(page + i, &d->page_list);
+        }
+
+        p2md->pod.count -= 1<<9;
+    }
+
+    /* Drain the single-page list. */
+    while ( (page = page_list_remove_head(&p2md->pod.single)) )
+    {
+        BUG_ON(page_get_owner(page) != d);
+        page_list_add_tail(page, &d->page_list);
+
+        p2md->pod.count -= 1;
+    }
+
+    /* Accounting must balance exactly once the cache is empty. */
+    BUG_ON(p2md->pod.count != 0);
+
+    spin_unlock(&d->page_alloc_lock);
+}
+
+/* This function is needed for two reasons:
+ * + To properly handle clearing of PoD entries
+ * + To "steal back" memory being freed for the PoD cache, rather than
+ *   releasing it.
+ *
+ * Once both of these functions have been completed, we can return and
+ * allow decrease_reservation() to handle everything else.
+ */
+/* Handle a decrease_reservation over [gpfn, gpfn + 2^order): clear any
+ * PoD entries in the range, and "steal" freed ram pages into the PoD
+ * cache while liabilities (pod.entry_count) exceed assets (pod.count).
+ * Returns 1 if the whole range was dealt with here (caller need do
+ * nothing more), 0 otherwise. */
+int
+p2m_pod_decrease_reservation(struct domain *d,
+                             xen_pfn_t gpfn,
+                             unsigned int order)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    int ret=0;
+    int i;
+
+    int steal_for_cache = 0;
+    int pod = 0, nonpod = 0, ram = 0;
+    
+
+    /* If we don't have any outstanding PoD entries, let things take their
+     * course */
+    if ( p2md->pod.entry_count == 0 )
+        goto out;
+
+    /* Figure out if we need to steal some freed memory for our cache.
+     * NOTE(review): this read happens before taking the p2m lock; it is
+     * recomputed under the lock in the steal loop below — confirm the
+     * unlocked read is only a hint. */
+    steal_for_cache =  ( p2md->pod.entry_count > p2md->pod.count );
+
+    p2m_lock(p2md);
+    audit_p2m(d);
+
+    /* See what's in here: count PoD, non-PoD, and ram entries. */
+    /* FIXME: Add contiguous; query for PSE entries? */
+    for ( i=0; i<(1<<order); i++)
+    {
+        p2m_type_t t;
+
+        gfn_to_mfn_query(d, gpfn + i, &t);
+
+        if ( t == p2m_populate_on_demand )
+            pod++;
+        else
+        {
+            nonpod++;
+            if ( p2m_is_ram(t) )
+                ram++;
+        }
+    }
+
+    /* No populate-on-demand?  Don't need to steal anything?  Then we're done!*/
+    if(!pod && !steal_for_cache)
+        goto out_unlock;
+
+    if ( !nonpod )
+    {
+        /* All PoD: Mark the whole region invalid and tell caller
+         * we're done. */
+        set_p2m_entry(d, gpfn, _mfn(INVALID_MFN), order, p2m_invalid);
+        p2md->pod.entry_count-=(1<<order); /* Lock: p2m */
+        BUG_ON(p2md->pod.entry_count < 0);
+        ret = 1;
+        goto out_unlock;
+    }
+
+    /* FIXME: Steal contig 2-meg regions for cache */
+
+    /* Process as long as:
+     * + There are PoD entries to handle, or
+     * + There is ram left, and we want to steal it
+     */
+    for ( i=0;
+          i<(1<<order) && (pod>0 || (steal_for_cache && ram > 0));
+          i++)
+    {
+        mfn_t mfn;
+        p2m_type_t t;
+
+        mfn = gfn_to_mfn_query(d, gpfn + i, &t);
+        if ( t == p2m_populate_on_demand )
+        {
+            /* Clear this PoD entry; one less liability. */
+            set_p2m_entry(d, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid);
+            p2md->pod.entry_count--; /* Lock: p2m */
+            BUG_ON(p2md->pod.entry_count < 0);
+            pod--;
+        }
+        else if ( steal_for_cache && p2m_is_ram(t) )
+        {
+            /* Steal this ram page into the cache instead of freeing it. */
+            struct page_info *page;
+
+            ASSERT(mfn_valid(mfn));
+
+            page = mfn_to_page(mfn);
+
+            set_p2m_entry(d, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid);
+            set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY);
+
+            p2m_pod_cache_add(d, page, 0);
+
+            /* Re-evaluate: stop stealing once assets cover liabilities. */
+            steal_for_cache =  ( p2md->pod.entry_count > p2md->pod.count );
+
+            nonpod--;
+            ram--;
+        }
+    }    
+
+    /* If we've reduced our "liabilities" beyond our "assets", free some */
+    if ( p2md->pod.entry_count < p2md->pod.count )
+    {
+        /* NOTE(review): leftover debug printk ("b %d") — consider
+         * removing or replacing with a meaningful message. */
+        printk("b %d\n", p2md->pod.entry_count);
+        p2m_pod_set_cache_target(d, p2md->pod.entry_count);
+    }
+
+    /* If there are no more non-PoD entries, tell decrease_reservation() that
+     * there's nothing left to do. */
+    if ( nonpod == 0 )
+        ret = 1;
+
+out_unlock:
+    audit_p2m(d);
+    p2m_unlock(p2md);
+
+out:
+    return ret;
+}
+
+/* Print the domain's current PoD accounting: outstanding PoD entries
+ * ("liabilities") vs. pages held in the cache ("assets"). */
+void
+p2m_pod_dump_data(struct domain *d)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    
+    printk("    PoD entries=%d cachesize=%d\n",
+           p2md->pod.entry_count, p2md->pod.count);
+}
+
+#define superpage_aligned(_x)  (((_x)&((1<<9)-1))==0)
+
+/* Search for all-zero superpages to be reclaimed as superpages for the
+ * PoD cache. Must be called w/ p2m lock held, page_alloc lock not held. */
+static int
+p2m_pod_zero_check_superpage(struct domain *d, unsigned long gfn)
+{
+    mfn_t mfn, mfn0 = _mfn(INVALID_MFN);
+    p2m_type_t type, type0 = 0;
+    unsigned long * map = NULL;
+    /* NOTE(review): ret is never set to anything but 0; callers ignore
+     * the return value.  Consider making this void or reporting
+     * success. */
+    int ret=0, reset = 0;
+    int i, j;
+    int max_ref = 1;
+
+    /* Only whole, aligned 2-meg ranges are candidates. */
+    if ( !superpage_aligned(gfn) )
+        goto out;
+
+    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
+    if ( paging_mode_shadow(d) )
+        max_ref++;
+
+    /* Look up the mfns, checking to make sure they're the same mfn
+     * and aligned, and mapping them. */
+    for ( i=0; i<(1<<9); i++ )
+    {
+        
+        mfn = gfn_to_mfn_query(d, gfn + i, &type);
+
+        if ( i == 0 )
+        {
+            mfn0 = mfn;
+            type0 = type;
+        }
+
+        /* Conditions that must be met for superpage-superpage:
+         * + All gfns are ram types
+         * + All gfns have the same type
+         * + All of the mfns are allocated to a domain
+         * + None of the mfns are used as pagetables
+         * + The first mfn is 2-meg aligned
+         * + All the other mfns are in sequence
+         * Adding for good measure:
+         * + None of the mfns are likely to be mapped elsewhere (refcount
+         *   2 or less for shadow, 1 for hap)
+         */
+        if ( !p2m_is_ram(type)
+             || type != type0
+             || ( (mfn_to_page(mfn)->count_info & PGC_allocated) == 0 )
+             || ( (mfn_to_page(mfn)->count_info & PGC_page_table) != 0 )
+             || ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > max_ref )
+             || !( ( i == 0 && superpage_aligned(mfn_x(mfn0)) )
+                   || ( i != 0 && mfn_x(mfn) == (mfn_x(mfn0) + i) ) ) )
+            goto out;
+    }
+
+    /* Now, do a quick check to see if it may be zero before unmapping.
+     * Only the first 16 words of each page are inspected here; the full
+     * check happens below, after the p2m entry has been removed. */
+    for ( i=0; i<(1<<9); i++ )
+    {
+        /* Quick zero-check */
+        map = map_domain_page(mfn_x(mfn0) + i);
+
+        for ( j=0; j<16; j++ )
+            if( *(map+j) != 0 )
+                break;
+
+        unmap_domain_page(map);
+
+        if ( j < 16 )
+            goto out;
+
+    }
+
+    /* Try to remove the page, restoring old mapping if it fails. */
+    set_p2m_entry(d, gfn,
+                  _mfn(POPULATE_ON_DEMAND_MFN), 9,
+                  p2m_populate_on_demand);
+
+    /* Make sure none of the MFNs are used elsewhere... for example, mapped
+     * via the grant table interface, or by qemu.  Allow one refcount for
+     * being allocated to the domain. */
+    for ( i=0; i < (1<<9); i++ )
+    {
+        mfn = _mfn(mfn_x(mfn0) + i);
+        if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 )
+        {
+            reset = 1;
+            goto out_reset;
+        }
+    }
+
+    /* Finally, do a full zero-check */
+    for ( i=0; i < (1<<9); i++ )
+    {
+        map = map_domain_page(mfn_x(mfn0) + i);
+
+        for ( j=0; j<PAGE_SIZE/sizeof(*map); j++ )
+            if( *(map+j) != 0 )
+            {
+                reset = 1;
+                break;
+            }
+
+        unmap_domain_page(map);
+
+        if ( reset )
+            goto out_reset;
+    }
+
+    /* Finally!  We've passed all the checks, and can add the mfn superpage
+     * back on the PoD cache, and account for the new p2m PoD entries */
+    p2m_pod_cache_add(d, mfn_to_page(mfn0), 9);
+    d->arch.p2m->pod.entry_count += (1<<9);
+
+out_reset:
+    /* A later check failed: put the original superpage mapping back. */
+    if ( reset )
+        set_p2m_entry(d, gfn, mfn0, 9, type0);
+    
+out:
+    return ret;
+}
+
+/* Check a batch of individual gfns for entirely-zero pages and reclaim
+ * them into the PoD cache, replacing each reclaimed mapping with a
+ * singleton PoD entry.  'count' is bounded by callers to
+ * POD_SWEEP_STRIDE, so the VLAs below are small.  Must be called with
+ * the p2m lock held. */
+static void
+p2m_pod_zero_check(struct domain *d, unsigned long *gfns, int count)
+{
+    mfn_t mfns[count];
+    p2m_type_t types[count];
+    unsigned long * map[count];
+
+    int i, j;
+    int max_ref = 1;
+
+    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
+    if ( paging_mode_shadow(d) )
+        max_ref++;
+
+    /* First, get the gfn list, translate to mfns, and map the pages. */
+    for ( i=0; i<count; i++ )
+    {
+        mfns[i] = gfn_to_mfn_query(d, gfns[i], types + i);
+        /* If this is ram, and not a pagetable, and probably not mapped
+           elsewhere, map it; otherwise, skip. */
+        if ( p2m_is_ram(types[i])
+             && ( (mfn_to_page(mfns[i])->count_info & PGC_allocated) != 0 ) 
+             && ( (mfn_to_page(mfns[i])->count_info & PGC_page_table) == 0 ) 
+             && ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) <= max_ref ) )
+            map[i] = map_domain_page(mfn_x(mfns[i]));
+        else
+            map[i] = NULL;
+    }
+
+    /* Then, go through and check for zeroed pages, removing write permission
+     * for those with zeroes. */
+    for ( i=0; i<count; i++ )
+    {
+        if(!map[i])
+            continue;
+
+        /* Quick zero-check: first 16 words only; the full check happens
+         * below once the p2m entry has been removed. */
+        for ( j=0; j<16; j++ )
+            if( *(map[i]+j) != 0 )
+                break;
+
+        if ( j < 16 )
+        {
+            unmap_domain_page(map[i]);
+            map[i] = NULL;
+            continue;
+        }
+
+        /* Try to remove the page, restoring old mapping if it fails. */
+        set_p2m_entry(d, gfns[i],
+                      _mfn(POPULATE_ON_DEMAND_MFN), 0,
+                      p2m_populate_on_demand);
+
+        /* See if the page was successfully unmapped.  (Allow one refcount
+         * for being allocated to a domain.) */
+        if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 )
+        {
+            unmap_domain_page(map[i]);
+            map[i] = NULL;
+
+            set_p2m_entry(d, gfns[i], mfns[i], 0, types[i]);
+
+            continue;
+        }
+    }
+
+    /* Now check each page for real */
+    for ( i=0; i < count; i++ )
+    {
+        if(!map[i])
+            continue;
+
+        for ( j=0; j<PAGE_SIZE/sizeof(*map[i]); j++ )
+            if( *(map[i]+j) != 0 )
+                break;
+
+        /* See comment in p2m_pod_zero_check_superpage() re gnttab
+         * check timing.  */
+        if ( j < PAGE_SIZE/sizeof(*map[i]) )
+        {
+            /* Page wasn't zero after all: restore the old mapping.
+             * Fix: unmap before bailing out -- the original code leaked
+             * this domain-page mapping on the non-zero path. */
+            unmap_domain_page(map[i]);
+            map[i] = NULL;
+            set_p2m_entry(d, gfns[i], mfns[i], 0, types[i]);
+            continue;
+        }
+        else
+        {
+            /* Add to cache, and account for the new p2m PoD entry */
+            p2m_pod_cache_add(d, mfn_to_page(mfns[i]), 0);
+            d->arch.p2m->pod.entry_count++;
+        }
+
+        unmap_domain_page(map[i]);
+        map[i] = NULL;
+    }
+    
+}
+
+#define POD_SWEEP_LIMIT 1024
+/* Sweep backwards through guest superpage-aligned gfns looking for
+ * all-zero 2-meg ranges to reclaim into the PoD cache.  Scans at most
+ * POD_SWEEP_LIMIT gfns past the point where something was found, and
+ * remembers where to resume next time in pod.reclaim_super.  Must be
+ * called with the p2m lock held. */
+static void
+p2m_pod_emergency_sweep_super(struct domain *d)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    unsigned long i, start, limit;
+
+    if ( p2md->pod.reclaim_super == 0 )
+    {
+        /* Start from the highest superpage-aligned gfn below max_guest.
+         * NOTE(review): if max_guest < 512 this subtraction wraps the
+         * unsigned value — confirm max_guest is always large enough by
+         * the time a sweep is triggered. */
+        p2md->pod.reclaim_super = (p2md->pod.max_guest>>9)<<9;
+        p2md->pod.reclaim_super -= (1<<9);
+    }
+    
+    start = p2md->pod.reclaim_super;
+    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
+
+    for ( i=p2md->pod.reclaim_super ; i > 0 ; i-=(1<<9) )
+    {
+        p2m_pod_zero_check_superpage(d, i);
+        /* Stop if we're past our limit and we have found *something*.
+         *
+         * NB that this is a zero-sum game; we're increasing our cache size
+         * by increasing our 'debt'.  Since we hold the p2m lock,
+         * (entry_count - count) must remain the same. */
+        if ( !page_list_empty(&p2md->pod.super) &&  i < limit )
+            break;
+    }
+
+    /* Resume just below where we stopped on the next sweep. */
+    p2md->pod.reclaim_super = i ? i - (1<<9) : 0;
+
+}
+
+#define POD_SWEEP_STRIDE  16
+/* Sweep backwards through single gfns looking for zeroed pages to
+ * reclaim, batching candidates in groups of POD_SWEEP_STRIDE for
+ * p2m_pod_zero_check().  Scans at most POD_SWEEP_LIMIT gfns past the
+ * point where something was reclaimed, and remembers where to resume
+ * in pod.reclaim_single.  Must be called with the p2m lock held. */
+static void
+p2m_pod_emergency_sweep(struct domain *d)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    unsigned long gfns[POD_SWEEP_STRIDE];
+    unsigned long i, j=0, start, limit;
+    p2m_type_t t;
+
+
+    if ( p2md->pod.reclaim_single == 0 )
+        p2md->pod.reclaim_single = p2md->pod.max_guest;
+
+    start = p2md->pod.reclaim_single;
+    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
+
+    /* FIXME: Figure out how to avoid superpages */
+    for ( i=p2md->pod.reclaim_single ; i > 0 ; i-- )
+    {
+        gfn_to_mfn_query(d, i, &t );
+        if ( p2m_is_ram(t) )
+        {
+            /* Batch up ram gfns; flush a full stride to zero_check. */
+            gfns[j] = i;
+            j++;
+            BUG_ON(j > POD_SWEEP_STRIDE);
+            if ( j == POD_SWEEP_STRIDE )
+            {
+                p2m_pod_zero_check(d, gfns, j);
+                j = 0;
+            }
+        }
+        /* Stop if we're past our limit and we have found *something*.
+         *
+         * NB that this is a zero-sum game; we're increasing our cache size
+         * by re-increasing our 'debt'.  Since we hold the p2m lock,
+         * (entry_count - count) must remain the same. */
+        if ( p2md->pod.count > 0 && i < limit )
+            break;
+    }
+
+    /* Flush any partial batch left over. */
+    if ( j )
+        p2m_pod_zero_check(d, gfns, j);
+
+    /* Resume just below where we stopped on the next sweep. */
+    p2md->pod.reclaim_single = i ? i - 1 : i;
+
+}
+
+/* Populate a PoD entry on demand.  Takes the p2m lock, re-checks that
+ * the entry is still PoD, sweeps for reclaimable zero pages if the
+ * cache is low, then fills the entry (order 0 or 9) from the cache.
+ * Returns 0 when the caller should simply retry its p2m lookup (entry
+ * populated, no longer PoD, or a 2-meg request remapped as 512
+ * singleton PoD entries); returns -1 after crashing the domain on true
+ * cache exhaustion. */
+static int
+p2m_pod_demand_populate(struct domain *d, unsigned long gfn,
+                        mfn_t table_mfn,
+                        l1_pgentry_t *p2m_entry,
+                        unsigned int order,
+                        p2m_query_t q)
+{
+    struct page_info *p = NULL; /* Compiler warnings */
+    unsigned long gfn_aligned;
+    mfn_t mfn;
+    l1_pgentry_t entry_content = l1e_empty();
+    struct p2m_domain *p2md = d->arch.p2m;
+    int i;
+
+    /* We need to grab the p2m lock here and re-check the entry to make
+     * sure that someone else hasn't populated it for us, then hold it
+     * until we're done. */
+    p2m_lock(p2md);
+    audit_p2m(d);
+
+    /* Check to make sure this is still PoD */
+    if ( p2m_flags_to_type(l1e_get_flags(*p2m_entry)) != p2m_populate_on_demand )
+    {
+        p2m_unlock(p2md);
+        return 0;
+    }
+
+    /* If we're low, start a sweep */
+    if ( order == 9 && page_list_empty(&p2md->pod.super) )
+        p2m_pod_emergency_sweep_super(d);
+
+    if ( page_list_empty(&p2md->pod.single) &&
+         ( ( order == 0 )
+           || (order == 9 && page_list_empty(&p2md->pod.super) ) ) )
+        p2m_pod_emergency_sweep(d);
+
+    /* Keep track of the highest gfn demand-populated by a guest fault */
+    if ( q == p2m_guest && gfn > p2md->pod.max_guest )
+        p2md->pod.max_guest = gfn;
+
+    spin_lock(&d->page_alloc_lock);
+
+    /* Cache empty even after sweeping: genuine exhaustion. */
+    if ( p2md->pod.count == 0 )
+        goto out_of_memory;
+
+    /* Get a page f/ the cache.  A NULL return value indicates that the
+     * 2-meg range should be marked singleton PoD, and retried */
+    if ( (p = p2m_pod_cache_get(d, order)) == NULL )
+        goto remap_and_retry;
+
+    mfn = page_to_mfn(p);
+
+    /* Cache pages of this order must be order-aligned. */
+    BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0);
+
+    spin_unlock(&d->page_alloc_lock);
+
+    /* Fill in the entry in the p2m: a PSE l2e for order 9, a plain l1e
+     * for order 0. */
+    switch ( order )
+    {
+    case 9:
+    {
+        l2_pgentry_t l2e_content;
+        
+        l2e_content = l2e_from_pfn(mfn_x(mfn),
+                                   p2m_type_to_flags(p2m_ram_rw) | _PAGE_PSE);
+
+        entry_content.l1 = l2e_content.l2;
+    }
+    break;
+    case 0:
+        entry_content = l1e_from_pfn(mfn_x(mfn),
+                                     p2m_type_to_flags(p2m_ram_rw));
+        break;
+        
+    }
+
+    gfn_aligned = (gfn >> order) << order;
+
+    paging_write_p2m_entry(d, gfn_aligned, p2m_entry, table_mfn,
+                           entry_content, (order==9)?2:1);
+
+    /* Keep the m2p in sync with the new mapping. */
+    for( i = 0 ; i < (1UL << order) ; i++ )
+        set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_aligned + i);
+    
+    p2md->pod.entry_count -= (1 << order); /* Lock: p2m */
+    BUG_ON(p2md->pod.entry_count < 0);
+    audit_p2m(d);
+    p2m_unlock(p2md);
+
+    return 0;
+out_of_memory:
+    spin_unlock(&d->page_alloc_lock);
+    audit_p2m(d);
+    p2m_unlock(p2md);
+    printk("%s: Out of populate-on-demand memory!\n", __func__);
+    domain_crash(d);
+    return -1;
+remap_and_retry:
+    /* Only a 2-meg request can fail with a non-empty cache. */
+    BUG_ON(order != 9);
+    spin_unlock(&d->page_alloc_lock);
+
+    /* Remap this 2-meg region in singleton chunks */
+    gfn_aligned = (gfn>>order)<<order;
+    for(i=0; i<(1<<order); i++)
+        set_p2m_entry(d, gfn_aligned+i, _mfn(POPULATE_ON_DEMAND_MFN), 0,
+                      p2m_populate_on_demand);
+    audit_p2m(d);
+    p2m_unlock(p2md);
+    return 0;
+}
+
 // Returns 0 on error (out of memory)
 static int
 p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
@@ -303,6 +1165,7 @@ p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
                                    L2_PAGETABLE_ENTRIES);
         ASSERT(p2m_entry);
         
+        /* FIXME: Deal with 4k replaced by 2meg pages */
         if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
              !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
         {
@@ -311,7 +1174,7 @@ p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
             goto out;
         }
         
-        if ( mfn_valid(mfn) )
+        if ( mfn_valid(mfn) || p2m_is_magic(p2mt) )
             l2e_content = l2e_from_pfn(mfn_x(mfn),
                                        p2m_type_to_flags(p2mt) | _PAGE_PSE);
         else
@@ -322,7 +1185,8 @@ p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
     }
 
     /* Track the highest gfn for which we have ever had a valid mapping */
-    if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) )
+    if ( mfn_valid(mfn) 
+         && (gfn + (1UL << page_order) - 1 > d->arch.p2m->max_mapped_pfn) )
         d->arch.p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
 
     if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) )
@@ -344,7 +1208,8 @@ p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
 }
 
 static mfn_t
-p2m_gfn_to_mfn(struct domain *d, unsigned long gfn, p2m_type_t *t)
+p2m_gfn_to_mfn(struct domain *d, unsigned long gfn, p2m_type_t *t,
+               p2m_query_t q)
 {
     mfn_t mfn;
     paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
@@ -401,8 +1266,21 @@ p2m_gfn_to_mfn(struct domain *d, unsigned long gfn, p2m_type_t *t)
 
     l2e = map_domain_page(mfn_x(mfn));
     l2e += l2_table_offset(addr);
+
+pod_retry_l2:
     if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
     {
+        /* PoD: Try to populate a 2-meg chunk */
+        if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
+        {
+            if ( q != p2m_query ) {
+                if( !p2m_pod_demand_populate(d, gfn, mfn,
+                                             (l1_pgentry_t *)l2e, 9, q) )
+                    goto pod_retry_l2;
+            } else
+                *t = p2m_populate_on_demand;
+        }
+    
         unmap_domain_page(l2e);
         return _mfn(INVALID_MFN);
     }
@@ -421,8 +1299,20 @@ p2m_gfn_to_mfn(struct domain *d, unsigned long gfn, p2m_type_t *t)
 
     l1e = map_domain_page(mfn_x(mfn));
     l1e += l1_table_offset(addr);
+pod_retry_l1:
     if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
     {
+        /* PoD: Try to populate */
+        if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand )
+        {
+            if ( q != p2m_query ) {
+                if( !p2m_pod_demand_populate(d, gfn, mfn,
+                                             (l1_pgentry_t *)l1e, 0, q) )
+                    goto pod_retry_l1;
+            } else
+                *t = p2m_populate_on_demand;
+        }
+    
         unmap_domain_page(l1e);
         return _mfn(INVALID_MFN);
     }
@@ -435,7 +1325,8 @@ p2m_gfn_to_mfn(struct domain *d, unsigned long gfn, p2m_type_t *t)
 }
 
 /* Read the current domain's p2m table (through the linear mapping). */
-static mfn_t p2m_gfn_to_mfn_current(unsigned long gfn, p2m_type_t *t)
+static mfn_t p2m_gfn_to_mfn_current(unsigned long gfn, p2m_type_t *t,
+                                    p2m_query_t q)
 {
     mfn_t mfn = _mfn(INVALID_MFN);
     p2m_type_t p2mt = p2m_mmio_dm;
@@ -447,48 +1338,114 @@ static mfn_t p2m_gfn_to_mfn_current(unsigned long gfn, p2m_type_t *t)
 
     if ( gfn <= current->domain->arch.p2m->max_mapped_pfn )
     {
-        l1_pgentry_t l1e = l1e_empty();
+        l1_pgentry_t l1e = l1e_empty(), *p2m_entry;
         l2_pgentry_t l2e = l2e_empty();
         int ret;
 
         ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
                / sizeof(l1_pgentry_t));
 
+        /*
+         * Read & process L2
+         */
+        p2m_entry = &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START)
+                                       + l2_linear_offset(addr)];
+
+    pod_retry_l2:
         ret = __copy_from_user(&l2e,
-                               &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START) + l2_linear_offset(addr)],
+                               p2m_entry,
                                sizeof(l2e));
+        if ( ret != 0
+             || !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+        {
+            if( (l2e_get_flags(l2e) & _PAGE_PSE)
+                && ( p2m_flags_to_type(l2e_get_flags(l2e))
+                     == p2m_populate_on_demand ) )
+            {
+                /* The read has succeeded, so we know that the mapping
+                 * exists at this point.  */
+                if ( q != p2m_query )
+                {
+                    if( !p2m_pod_demand_populate(current->domain, gfn, mfn,
+                                                 p2m_entry, 9, q) )
+                        goto pod_retry_l2;
+
+                    /* Allocate failed. */
+                    p2mt = p2m_invalid;
+                    printk("%s: Allocate failed!\n", __func__);
+                    goto out;
+                }
+                else
+                {
+                    p2mt = p2m_populate_on_demand;
+                    goto out;
+                }
+            }
+
+            goto pod_retry_l1;
+        }
         
-        if ( (ret == 0) && (l2e_get_flags(l2e) & _PAGE_PRESENT) && 
-             (l2e_get_flags(l2e) & _PAGE_PSE) ) 
+        if (l2e_get_flags(l2e) & _PAGE_PSE)
         {
             p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
             ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
+
             if ( p2m_is_valid(p2mt) )
                 mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
             else
                 p2mt = p2m_mmio_dm;
+
+            goto out;
         }
-        else
-        {
-        
-            /* Need to __copy_from_user because the p2m is sparse and this
-             * part might not exist */
-            ret = __copy_from_user(&l1e,
-                                   &phys_to_machine_mapping[gfn],
-                                   sizeof(l1e));
+
+        /*
+         * Read and process L1
+         */
+
+        /* Need to __copy_from_user because the p2m is sparse and this
+         * part might not exist */
+    pod_retry_l1:
+        p2m_entry = &phys_to_machine_mapping[gfn];
+
+        ret = __copy_from_user(&l1e,
+                               p2m_entry,
+                               sizeof(l1e));
             
-            if ( ret == 0 ) {
-                p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
-                ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
-                if ( p2m_is_valid(p2mt) )
-                    mfn = _mfn(l1e_get_pfn(l1e));
-                else 
-                    /* XXX see above */
-                    p2mt = p2m_mmio_dm;
+        if ( ret == 0 ) {
+            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
+            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+
+            if ( p2m_flags_to_type(l1e_get_flags(l1e))
+                 == p2m_populate_on_demand )
+            {
+                /* The read has succeeded, so we know that the mapping
+                 * exists at this point.  */
+                if ( q != p2m_query )
+                {
+                    if( !p2m_pod_demand_populate(current->domain, gfn, mfn,
+                                                 (l1_pgentry_t *)p2m_entry, 0,
+                                                 q) )
+                        goto pod_retry_l1;
+
+                    /* Allocate failed. */
+                    p2mt = p2m_invalid;
+                    goto out;
+                }
+                else
+                {
+                    p2mt = p2m_populate_on_demand;
+                    goto out;
+                }
             }
+
+            if ( p2m_is_valid(p2mt) )
+                mfn = _mfn(l1e_get_pfn(l1e));
+            else 
+                /* XXX see above */
+                p2mt = p2m_mmio_dm;
         }
     }
-
+out:
     *t = p2mt;
     return mfn;
 }
@@ -506,7 +1463,9 @@ int p2m_init(struct domain *d)
 
     memset(p2m, 0, sizeof(*p2m));
     p2m_lock_init(p2m);
-    INIT_LIST_HEAD(&p2m->pages);
+    INIT_PAGE_LIST_HEAD(&p2m->pages);
+    INIT_PAGE_LIST_HEAD(&p2m->pod.super);
+    INIT_PAGE_LIST_HEAD(&p2m->pod.single);
 
     p2m->set_entry = p2m_set_entry;
     p2m->get_entry = p2m_gfn_to_mfn;
@@ -567,7 +1526,6 @@ int p2m_alloc_table(struct domain *d,
 
 {
     mfn_t mfn = _mfn(INVALID_MFN);
-    struct list_head *entry;
     struct page_info *page, *p2m_top;
     unsigned int page_count = 0;
     unsigned long gfn = -1UL;
@@ -593,7 +1551,7 @@ int p2m_alloc_table(struct domain *d,
         p2m_unlock(p2m);
         return -ENOMEM;
     }
-    list_add_tail(&p2m_top->list, &p2m->pages);
+    page_list_add_tail(p2m_top, &p2m->pages);
 
     p2m_top->count_info = 1;
     p2m_top->u.inuse.type_info =
@@ -614,11 +1572,8 @@ int p2m_alloc_table(struct domain *d,
         goto error;
 
     /* Copy all existing mappings from the page list and m2p */
-    for ( entry = d->page_list.next;
-          entry != &d->page_list;
-          entry = entry->next )
+    page_list_for_each(page, &d->page_list)
     {
-        page = list_entry(entry, struct page_info, list);
         mfn = page_to_mfn(page);
         gfn = get_gpfn_from_mfn(mfn_x(mfn));
         page_count++;
@@ -648,19 +1603,14 @@ void p2m_teardown(struct domain *d)
 /* Return all the p2m pages to Xen.
  * We know we don't have any extra mappings to these pages */
 {
-    struct list_head *entry, *n;
     struct page_info *pg;
     struct p2m_domain *p2m = d->arch.p2m;
 
     p2m_lock(p2m);
     d->arch.phys_table = pagetable_null();
 
-    list_for_each_safe(entry, n, &p2m->pages)
-    {
-        pg = list_entry(entry, struct page_info, list);
-        list_del(entry);
+    while ( (pg = page_list_remove_head(&p2m->pages)) )
         p2m->free_page(d, pg);
-    }
     p2m_unlock(p2m);
 }
 
@@ -677,6 +1627,7 @@ static void audit_p2m(struct domain *d)
     struct page_info *page;
     struct domain *od;
     unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
+    int entry_count = 0;
     mfn_t p2mfn;
     unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
     int test_linear;
@@ -692,6 +1643,8 @@ static void audit_p2m(struct domain *d)
     if ( test_linear )
         flush_tlb_local();
 
+    spin_lock(&d->page_alloc_lock);
+
     /* Audit part one: walk the domain's page allocation list, checking
      * the m2p entries. */
     for ( entry = d->page_list.next;
@@ -729,7 +1682,7 @@ static void audit_p2m(struct domain *d)
             continue;
         }
 
-        p2mfn = gfn_to_mfn_foreign(d, gfn, &type);
+        p2mfn = gfn_to_mfn_type_foreign(d, gfn, &type, p2m_query);
         if ( mfn_x(p2mfn) != mfn )
         {
             mpbad++;
@@ -747,7 +1700,7 @@ static void audit_p2m(struct domain *d)
 
         if ( test_linear && (gfn <= d->arch.p2m->max_mapped_pfn) )
         {
-            lp2mfn = mfn_x(gfn_to_mfn_current(gfn, &type));
+            lp2mfn = mfn_x(gfn_to_mfn_query(d, gfn, &type));
             if ( lp2mfn != mfn_x(p2mfn) )
             {
                 P2M_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
@@ -759,6 +1712,8 @@ static void audit_p2m(struct domain *d)
         //                mfn, gfn, p2mfn, lp2mfn);
     }
 
+    spin_unlock(&d->page_alloc_lock);
+
     /* Audit part two: walk the domain's p2m table, checking the entries. */
     if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
     {
@@ -802,6 +1757,10 @@ static void audit_p2m(struct domain *d)
                 {
                     if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
                     {
+                        if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE)
+                             && ( p2m_flags_to_type(l2e_get_flags(l2e[i2]))
+                                  == p2m_populate_on_demand ) )
+                            entry_count+=(1<<9);
                         gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
                         continue;
                     }
@@ -814,7 +1773,7 @@ static void audit_p2m(struct domain *d)
                         for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++)
                         {
                             m2pfn = get_gpfn_from_mfn(mfn+i1);
-                            if ( m2pfn != (gfn + i) )
+                            if ( m2pfn != (gfn + i1) )
                             {
                                 pmbad++;
                                 P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
@@ -832,13 +1791,20 @@ static void audit_p2m(struct domain *d)
                     for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
                     {
                         if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
+                        {
+                            if ( p2m_flags_to_type(l1e_get_flags(l1e[i1]))
+                                 == p2m_populate_on_demand )
+                            entry_count++;
                             continue;
+                        }
                         mfn = l1e_get_pfn(l1e[i1]);
                         ASSERT(mfn_valid(_mfn(mfn)));
                         m2pfn = get_gpfn_from_mfn(mfn);
                         if ( m2pfn != gfn )
                         {
                             pmbad++;
+                            printk("mismatch: gfn %#lx -> mfn %#lx"
+                                   " -> gfn %#lx\n", gfn, mfn, m2pfn);
                             P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
                                        " -> gfn %#lx\n", gfn, mfn, m2pfn);
                             BUG();
@@ -861,6 +1827,15 @@ static void audit_p2m(struct domain *d)
 
     }
 
+    if ( entry_count != d->arch.p2m->pod.entry_count )
+    {
+        printk("%s: refcounted entry count %d, audit count %d!\n",
+               __func__,
+               d->arch.p2m->pod.entry_count,
+               entry_count);
+        BUG();
+    }
+        
     //P2M_PRINTK("p2m audit complete\n");
     //if ( orphans_i | orphans_d | mpbad | pmbad )
     //    P2M_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
@@ -869,8 +1844,6 @@ static void audit_p2m(struct domain *d)
         P2M_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
                    pmbad, mpbad);
 }
-#else
-#define audit_p2m(_d) do { (void)(_d); } while(0)
 #endif /* P2M_AUDIT */
 
 
@@ -907,6 +1880,89 @@ guest_physmap_remove_page(struct domain *d, unsigned long gfn,
     p2m_unlock(d->arch.p2m);
 }
 
+#if CONFIG_PAGING_LEVELS == 3
+static int gfn_check_limit(
+    struct domain *d, unsigned long gfn, unsigned int order)
+{
+    /*
+     * 32bit AMD nested paging does not support over 4GB guest due to 
+     * hardware translation limit. This limitation is checked by comparing
+     * gfn with 0xfffffUL.
+     */
+    if ( !paging_mode_hap(d) || ((gfn + (1ul << order)) <= 0x100000UL) ||
+         (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
+        return 0;
+
+    if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
+        dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
+                " 4GB: specify 'hap=0' domain config option.\n",
+                d->domain_id);
+
+    return -EINVAL;
+}
+#else
+#define gfn_check_limit(d, g, o) 0
+#endif
+
+int
+guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
+                                      unsigned int order)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    unsigned long i;
+    p2m_type_t ot;
+    mfn_t omfn;
+    int pod_count = 0;
+    int rc = 0;
+
+    BUG_ON(!paging_mode_translate(d));
+
+    rc = gfn_check_limit(d, gfn, order);
+    if ( rc != 0 )
+        return rc;
+
+    p2m_lock(p2md);
+    audit_p2m(d);
+
+    P2M_DEBUG("adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+    /* Make sure all gpfns are unused */
+    for ( i = 0; i < (1UL << order); i++ )
+    {
+        omfn = gfn_to_mfn_query(d, gfn + i, &ot);
+        if ( p2m_is_ram(ot) )
+        {
+            printk("%s: gfn_to_mfn returned type %d!\n",
+                   __func__, ot);
+            rc = -EBUSY;
+            goto out;
+        }
+        else if ( ot == p2m_populate_on_demand )
+        {
+            /* Count how man PoD entries we'll be replacing if successful */
+            pod_count++;
+        }
+    }
+
+    /* Now, actually do the two-way mapping */
+    if ( !set_p2m_entry(d, gfn, _mfn(POPULATE_ON_DEMAND_MFN), order,
+                        p2m_populate_on_demand) )
+        rc = -EINVAL;
+    else
+    {
+        p2md->pod.entry_count += 1 << order; /* Lock: p2m */
+        p2md->pod.entry_count -= pod_count;
+        BUG_ON(p2md->pod.entry_count < 0);
+    }
+
+    audit_p2m(d);
+    p2m_unlock(p2md);
+
+out:
+    return rc;
+
+}
+
 int
 guest_physmap_add_entry(struct domain *d, unsigned long gfn,
                         unsigned long mfn, unsigned int page_order, 
@@ -915,6 +1971,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
     unsigned long i, ogfn;
     p2m_type_t ot;
     mfn_t omfn;
+    int pod_count = 0;
     int rc = 0;
 
     if ( !paging_mode_translate(d) )
@@ -932,21 +1989,9 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
         return 0;
     }
 
-#if CONFIG_PAGING_LEVELS == 3
-    /*
-     * 32bit PAE nested paging does not support over 4GB guest due to 
-     * hardware translation limit. This limitation is checked by comparing
-     * gfn with 0xfffffUL.
-     */
-    if ( paging_mode_hap(d) && (gfn > 0xfffffUL) )
-    {
-        if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
-            dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
-                    " 4GB: specify 'hap=0' domain config option.\n",
-                    d->domain_id);
-        return -EINVAL;
-    }
-#endif
+    rc = gfn_check_limit(d, gfn, page_order);
+    if ( rc != 0 )
+        return rc;
 
     p2m_lock(d->arch.p2m);
     audit_p2m(d);
@@ -956,18 +2001,23 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
     /* First, remove m->p mappings for existing p->m mappings */
     for ( i = 0; i < (1UL << page_order); i++ )
     {
-        omfn = gfn_to_mfn(d, gfn, &ot);
+        omfn = gfn_to_mfn_query(d, gfn + i, &ot);
         if ( p2m_is_ram(ot) )
         {
             ASSERT(mfn_valid(omfn));
-            set_gpfn_from_mfn(mfn_x(omfn)+i, INVALID_M2P_ENTRY);
+            set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+        }
+        else if ( ot == p2m_populate_on_demand )
+        {
+            /* Count how man PoD entries we'll be replacing if successful */
+            pod_count++;
         }
     }
 
     /* Then, look for m->p mappings for this range and deal with them */
     for ( i = 0; i < (1UL << page_order); i++ )
     {
-        ogfn = mfn_to_gfn(d, _mfn(mfn));
+        ogfn = mfn_to_gfn(d, _mfn(mfn+i));
         if (
 #ifdef __x86_64__
             (ogfn != 0x5555555555555555L)
@@ -975,20 +2025,20 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
             (ogfn != 0x55555555L)
 #endif
             && (ogfn != INVALID_M2P_ENTRY)
-            && (ogfn != gfn) )
+            && (ogfn != gfn + i) )
         {
             /* This machine frame is already mapped at another physical
              * address */
             P2M_DEBUG("aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
-                      mfn, ogfn, gfn);
-            omfn = gfn_to_mfn(d, ogfn, &ot);
+                      mfn + i, ogfn, gfn + i);
+            omfn = gfn_to_mfn_query(d, ogfn, &ot);
             if ( p2m_is_ram(ot) )
             {
                 ASSERT(mfn_valid(omfn));
                 P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n",
                           ogfn , mfn_x(omfn));
-                if ( mfn_x(omfn) == mfn )
-                    p2m_remove_page(d, ogfn, mfn, 0);
+                if ( mfn_x(omfn) == (mfn + i) )
+                    p2m_remove_page(d, ogfn, mfn + i, 0);
             }
         }
     }
@@ -1008,6 +2058,11 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
         if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order, 
                             p2m_invalid) )
             rc = -EINVAL;
+        else
+        {
+            d->arch.p2m->pod.entry_count -= pod_count; /* Lock: p2m */
+            BUG_ON(d->arch.p2m->pod.entry_count < 0);
+        }
     }
 
     audit_p2m(d);
@@ -1150,7 +2205,7 @@ set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
     if ( !paging_mode_translate(d) )
         return 0;
 
-    omfn = gfn_to_mfn(d, gfn, &ot);
+    omfn = gfn_to_mfn_query(d, gfn, &ot);
     if ( p2m_is_ram(ot) )
     {
         ASSERT(mfn_valid(omfn));
index 2247d8dd68752a1236c7d13618776db55236becc..2b898dd73b6c18633e4bfcdbc62046af826f34ff 100644 (file)
 /************************************************/
 /* Override macros from asm/page.h to make them work with mfn_t */
 #undef mfn_to_page
-#define mfn_to_page(_m) (frame_table + mfn_x(_m))
+#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
 #undef mfn_valid
-#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page)
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
 #undef page_to_mfn
-#define page_to_mfn(_pg) (_mfn((_pg) - frame_table))
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
 
 /* The log-dirty lock.  This protects the log-dirty bitmap from
  * concurrent accesses (and teardowns, etc).
@@ -585,6 +585,9 @@ void paging_teardown(struct domain *d)
 
     /* clean up log dirty resources. */
     paging_log_dirty_teardown(d);
+
+    /* Move populate-on-demand cache back to domain_list for destruction */
+    p2m_pod_empty_cache(d);
 }
 
 /* Call once all of the references to the domain have gone away */
index 76c879ee815c2cfb955f9172704dfab00cb2c067..3333d43679f075f0edf48e60cc1897120054e132 100644 (file)
@@ -1,5 +1,5 @@
 obj-$(x86_32) += common.o guest_2.o guest_3.o
 obj-$(x86_64) += common.o guest_2.o guest_3.o guest_4.o
 
-guest_%.o: multi.c $(HDRS) Makefile
+guest_%.o: multi.c Makefile
        $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@
index dd391cc6759d08c45950e73a19d0a6658950e456..ca4cb13ac4831027743c128dddcb243bac54ccea 100644 (file)
@@ -48,9 +48,9 @@ void shadow_domain_init(struct domain *d)
     int i;
     shadow_lock_init(d);
     for ( i = 0; i <= SHADOW_MAX_ORDER; i++ )
-        INIT_LIST_HEAD(&d->arch.paging.shadow.freelists[i]);
-    INIT_LIST_HEAD(&d->arch.paging.shadow.p2m_freelist);
-    INIT_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows);
+        INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.freelists[i]);
+    INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.p2m_freelist);
+    INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows);
 
     /* Use shadow pagetables for log-dirty support */
     paging_log_dirty_init(d, shadow_enable_log_dirty, 
@@ -627,6 +627,15 @@ void oos_fixup_add(struct vcpu *v, mfn_t gmfn,
             idx = (idx + 1) % SHADOW_OOS_PAGES;
         if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
         {
+            int i;
+            for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ )
+            {
+                if ( mfn_valid(oos_fixup[idx].smfn[i])
+                     && (mfn_x(oos_fixup[idx].smfn[i]) == mfn_x(smfn))
+                     && (oos_fixup[idx].off[i] == off) )
+                    return;
+            }
+
             next = oos_fixup[idx].next;
 
             if ( mfn_x(oos_fixup[idx].smfn[next]) != INVALID_MFN )
@@ -1282,9 +1291,9 @@ static inline int space_is_available(
     for ( ; order <= shadow_max_order(d); ++order )
     {
         unsigned int n = count;
-        const struct list_head *p;
+        const struct page_info *sp;
 
-        list_for_each ( p, &d->arch.paging.shadow.freelists[order] )
+        page_list_for_each ( sp, &d->arch.paging.shadow.freelists[order] )
             if ( --n == 0 )
                 return 1;
         count = (count + 1) >> 1;
@@ -1297,8 +1306,8 @@ static inline int space_is_available(
  * non-Xen mappings in this top-level shadow mfn */
 static void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
 {
-    struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
-    switch ( sp->type )
+    struct page_info *sp = mfn_to_page(smfn);
+    switch ( sp->u.sh.type )
     {
     case SH_type_l2_32_shadow:
         SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, 2)(v,smfn);
@@ -1313,7 +1322,7 @@ static void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn)
         break;
 #endif
     default:
-        SHADOW_ERROR("top-level shadow has bad type %08x\n", sp->type);
+        SHADOW_ERROR("top-level shadow has bad type %08x\n", sp->u.sh.type);
         BUG();
     }
 }
@@ -1325,7 +1334,7 @@ static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn)
         /* Convert smfn to gfn */
         unsigned long gfn;
         ASSERT(mfn_valid(smfn));
-        gfn = mfn_to_gfn(d, _mfn(mfn_to_shadow_page(smfn)->backpointer));
+        gfn = mfn_to_gfn(d, _mfn(mfn_to_page(smfn)->v.sh.back));
         __trace_var(TRC_SHADOW_PREALLOC_UNPIN, 0/*!tsc*/,
                     sizeof(gfn), (unsigned char*)&gfn);
     }
@@ -1341,8 +1350,7 @@ static void _shadow_prealloc(
     /* Need a vpcu for calling unpins; for now, since we don't have
      * per-vcpu shadows, any will do */
     struct vcpu *v, *v2;
-    struct list_head *l, *t;
-    struct shadow_page_info *sp;
+    struct page_info *sp, *t;
     mfn_t smfn;
     int i;
 
@@ -1356,10 +1364,9 @@ static void _shadow_prealloc(
 
     /* Stage one: walk the list of pinned pages, unpinning them */
     perfc_incr(shadow_prealloc_1);
-    list_for_each_backwards_safe(l, t, &d->arch.paging.shadow.pinned_shadows)
+    page_list_for_each_safe_reverse(sp, t, &d->arch.paging.shadow.pinned_shadows)
     {
-        sp = list_entry(l, struct shadow_page_info, list);
-        smfn = shadow_page_to_mfn(sp);
+        smfn = page_to_mfn(sp);
 
         /* Unpin this top-level shadow */
         trace_shadow_prealloc_unpin(d, smfn);
@@ -1418,8 +1425,7 @@ void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
  * this domain's shadows */
 static void shadow_blow_tables(struct domain *d) 
 {
-    struct list_head *l, *t;
-    struct shadow_page_info *sp;
+    struct page_info *sp, *t;
     struct vcpu *v = d->vcpu[0];
     mfn_t smfn;
     int i;
@@ -1427,10 +1433,9 @@ static void shadow_blow_tables(struct domain *d)
     ASSERT(v != NULL);
 
     /* Pass one: unpin all pinned pages */
-    list_for_each_backwards_safe(l,t, &d->arch.paging.shadow.pinned_shadows)
+    page_list_for_each_safe_reverse(sp, t, &d->arch.paging.shadow.pinned_shadows)
     {
-        sp = list_entry(l, struct shadow_page_info, list);
-        smfn = shadow_page_to_mfn(sp);
+        smfn = page_to_mfn(sp);
         sh_unpin(v, smfn);
     }
         
@@ -1484,6 +1489,18 @@ static __init int shadow_blow_tables_keyhandler_init(void)
 __initcall(shadow_blow_tables_keyhandler_init);
 #endif /* !NDEBUG */
 
+static inline struct page_info *
+next_shadow(const struct page_info *sp)
+{
+    return sp->next_shadow ? mfn_to_page(_mfn(sp->next_shadow)) : NULL;
+}
+
+static inline void
+set_next_shadow(struct page_info *sp, struct page_info *next)
+{
+    sp->next_shadow = next ? mfn_x(page_to_mfn(next)) : 0;
+}
+
 /* Allocate another shadow's worth of (contiguous, aligned) pages,
  * and fill in the type and backpointer fields of their page_infos. 
  * Never fails to allocate. */
@@ -1491,7 +1508,7 @@ mfn_t shadow_alloc(struct domain *d,
                     u32 shadow_type,
                     unsigned long backpointer)
 {
-    struct shadow_page_info *sp = NULL;
+    struct page_info *sp = NULL;
     unsigned int order = shadow_order(shadow_type);
     cpumask_t mask;
     void *p;
@@ -1506,7 +1523,7 @@ mfn_t shadow_alloc(struct domain *d,
 
     /* Find smallest order which can satisfy the request. */
     for ( i = order; i <= SHADOW_MAX_ORDER; i++ )
-        if ( !list_empty(&d->arch.paging.shadow.freelists[i]) )
+        if ( (sp = page_list_remove_head(&d->arch.paging.shadow.freelists[i])) )
             goto found;
     
     /* If we get here, we failed to allocate. This should never happen.
@@ -1517,16 +1534,12 @@ mfn_t shadow_alloc(struct domain *d,
     BUG();
 
  found:
-    sp = list_entry(d->arch.paging.shadow.freelists[i].next, 
-                    struct shadow_page_info, list);
-    list_del(&sp->list);
-            
     /* We may have to halve the chunk a number of times. */
     while ( i != order )
     {
         i--;
-        sp->order = i;
-        list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[i]);
+        sp->v.free.order = i;
+        page_list_add_tail(sp, &d->arch.paging.shadow.freelists[i]);
         sp += 1 << i;
     }
     d->arch.paging.shadow.free_pages -= 1 << order;
@@ -1544,26 +1557,26 @@ mfn_t shadow_alloc(struct domain *d,
             flush_tlb_mask(mask);
         }
         /* Now safe to clear the page for reuse */
-        p = sh_map_domain_page(shadow_page_to_mfn(sp+i));
+        p = sh_map_domain_page(page_to_mfn(sp+i));
         ASSERT(p != NULL);
         clear_page(p);
         sh_unmap_domain_page(p);
-        INIT_LIST_HEAD(&sp[i].list);
-        sp[i].type = shadow_type;
-        sp[i].pinned = 0;
-        sp[i].count = 0;
-        sp[i].backpointer = backpointer;
-        sp[i].next_shadow = NULL;
+        INIT_PAGE_LIST_ENTRY(&sp[i].list);
+        sp[i].u.sh.type = shadow_type;
+        sp[i].u.sh.pinned = 0;
+        sp[i].u.sh.count = 0;
+        sp[i].v.sh.back = backpointer;
+        set_next_shadow(&sp[i], NULL);
         perfc_incr(shadow_alloc_count);
     }
-    return shadow_page_to_mfn(sp);
+    return page_to_mfn(sp);
 }
 
 
 /* Return some shadow pages to the pool. */
 void shadow_free(struct domain *d, mfn_t smfn)
 {
-    struct shadow_page_info *sp = mfn_to_shadow_page(smfn); 
+    struct page_info *sp = mfn_to_page(smfn); 
     u32 shadow_type;
     unsigned long order;
     unsigned long mask;
@@ -1572,7 +1585,7 @@ void shadow_free(struct domain *d, mfn_t smfn)
     ASSERT(shadow_locked_by_me(d));
     perfc_incr(shadow_free);
 
-    shadow_type = sp->type;
+    shadow_type = sp->u.sh.type;
     ASSERT(shadow_type != SH_type_none);
     ASSERT(shadow_type != SH_type_p2m_table);
     order = shadow_order(shadow_type);
@@ -1596,7 +1609,7 @@ void shadow_free(struct domain *d, mfn_t smfn)
         }
 #endif
         /* Strip out the type: this is now a free shadow page */
-        sp[i].type = 0;
+        sp[i].u.sh.type = 0;
         /* Remember the TLB timestamp so we will know whether to flush 
          * TLBs when we reuse the page.  Because the destructors leave the
          * contents of the pages in place, we can delay TLB flushes until
@@ -1609,22 +1622,24 @@ void shadow_free(struct domain *d, mfn_t smfn)
     for ( ; order < shadow_max_order(d); ++order )
     {
         mask = 1 << order;
-        if ( (mfn_x(shadow_page_to_mfn(sp)) & mask) ) {
+        if ( (mfn_x(page_to_mfn(sp)) & mask) ) {
             /* Merge with predecessor block? */
-            if ( ((sp-mask)->type != PGT_none) || ((sp-mask)->order != order) )
+            if ( ((sp-mask)->u.sh.type != PGT_none) ||
+                 ((sp-mask)->v.free.order != order) )
                 break;
-            list_del(&(sp-mask)->list);
             sp -= mask;
+            page_list_del(sp, &d->arch.paging.shadow.freelists[order]);
         } else {
             /* Merge with successor block? */
-            if ( ((sp+mask)->type != PGT_none) || ((sp+mask)->order != order) )
+            if ( ((sp+mask)->u.sh.type != PGT_none) ||
+                 ((sp+mask)->v.free.order != order) )
                 break;
-            list_del(&(sp+mask)->list);
+            page_list_del(sp + mask, &d->arch.paging.shadow.freelists[order]);
         }
     }
 
-    sp->order = order;
-    list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
+    sp->v.free.order = order;
+    page_list_add_tail(sp, &d->arch.paging.shadow.freelists[order]);
 }
 
 /* Divert some memory from the pool to be used by the p2m mapping.
@@ -1662,8 +1677,8 @@ sh_alloc_p2m_pages(struct domain *d)
          * believed to be a concern.
          */
         page_set_owner(&pg[i], d);
-        pg[i].count_info = 1;
-        list_add_tail(&pg[i].list, &d->arch.paging.shadow.p2m_freelist);
+        pg[i].count_info |= 1;
+        page_list_add_tail(&pg[i], &d->arch.paging.shadow.p2m_freelist);
     }
     return 1;
 }
@@ -1672,25 +1687,22 @@ sh_alloc_p2m_pages(struct domain *d)
 static struct page_info *
 shadow_alloc_p2m_page(struct domain *d)
 {
-    struct list_head *entry;
     struct page_info *pg;
     mfn_t mfn;
     void *p;
     
     shadow_lock(d);
 
-    if ( list_empty(&d->arch.paging.shadow.p2m_freelist) &&
+    if ( page_list_empty(&d->arch.paging.shadow.p2m_freelist) &&
          !sh_alloc_p2m_pages(d) )
     {
         shadow_unlock(d);
         return NULL;
     }
-    entry = d->arch.paging.shadow.p2m_freelist.next;
-    list_del(entry);
+    pg = page_list_remove_head(&d->arch.paging.shadow.p2m_freelist);
 
     shadow_unlock(d);
 
-    pg = list_entry(entry, struct page_info, list);
     mfn = page_to_mfn(pg);
     p = sh_map_domain_page(mfn);
     clear_page(p);
@@ -1706,16 +1718,13 @@ shadow_free_p2m_page(struct domain *d, struct page_info *pg)
     /* Should have just the one ref we gave it in alloc_p2m_page() */
     if ( (pg->count_info & PGC_count_mask) != 1 )
     {
-        SHADOW_ERROR("Odd p2m page count c=%#x t=%"PRtype_info"\n",
+        SHADOW_ERROR("Odd p2m page count c=%#lx t=%"PRtype_info"\n",
                      pg->count_info, pg->u.inuse.type_info);
     }
-    pg->count_info = 0;
+    pg->count_info &= ~PGC_count_mask;
     /* Free should not decrement domain's total allocation, since 
      * these pages were allocated without an owner. */
     page_set_owner(pg, NULL); 
-#if defined(__x86_64__)
-    spin_lock_init(&pg->lock);
-#endif
     free_domheap_pages(pg, 0);
     d->arch.paging.shadow.p2m_pages--;
     perfc_decr(shadow_alloc_count);
@@ -1774,7 +1783,7 @@ static unsigned int sh_set_allocation(struct domain *d,
                                       unsigned int pages,
                                       int *preempted)
 {
-    struct shadow_page_info *sp;
+    struct page_info *sp;
     unsigned int lower_bound;
     unsigned int j, order = shadow_max_order(d);
 
@@ -1796,7 +1805,7 @@ static unsigned int sh_set_allocation(struct domain *d,
         if ( d->arch.paging.shadow.total_pages < pages ) 
         {
             /* Need to allocate more memory from domheap */
-            sp = (struct shadow_page_info *)
+            sp = (struct page_info *)
                 alloc_domheap_pages(NULL, order, MEMF_node(domain_to_node(d)));
             if ( sp == NULL ) 
             { 
@@ -1807,31 +1816,26 @@ static unsigned int sh_set_allocation(struct domain *d,
             d->arch.paging.shadow.total_pages += 1 << order;
             for ( j = 0; j < 1U << order; j++ )
             {
-                sp[j].type = 0;  
-                sp[j].pinned = 0;
-                sp[j].count = 0;
-                sp[j].mbz = 0;
+                sp[j].u.sh.type = 0;
+                sp[j].u.sh.pinned = 0;
+                sp[j].u.sh.count = 0;
                 sp[j].tlbflush_timestamp = 0; /* Not in any TLB */
             }
-            sp->order = order;
-            list_add_tail(&sp->list, &d->arch.paging.shadow.freelists[order]);
+            sp->v.free.order = order;
+            page_list_add_tail(sp, &d->arch.paging.shadow.freelists[order]);
         } 
         else if ( d->arch.paging.shadow.total_pages > pages ) 
         {
             /* Need to return memory to domheap */
             _shadow_prealloc(d, order, 1);
-            ASSERT(!list_empty(&d->arch.paging.shadow.freelists[order]));
-            sp = list_entry(d->arch.paging.shadow.freelists[order].next,
-                            struct shadow_page_info, list);
-            list_del(&sp->list);
-#if defined(__x86_64__)
+            sp = page_list_remove_head(&d->arch.paging.shadow.freelists[order]);
+            ASSERT(sp);
             /*
-             * Re-instate lock field which we overwrite with shadow_page_info.
-             * This was safe, since the lock is only used on guest pages.
+             * The pages were allocated anonymously, but the owner field
+             * gets overwritten normally, so need to clear it here.
              */
             for ( j = 0; j < 1U << order; j++ )
-                spin_lock_init(&((struct page_info *)sp)[j].lock);
-#endif
+                page_set_owner(&((struct page_info *)sp)[j], NULL);
             d->arch.paging.shadow.free_pages -= 1 << order;
             d->arch.paging.shadow.total_pages -= 1 << order;
             free_domheap_pages((struct page_info *)sp, order);
@@ -1882,7 +1886,7 @@ static inline key_t sh_hash(unsigned long n, unsigned int t)
 static void sh_hash_audit_bucket(struct domain *d, int bucket)
 /* Audit one bucket of the hash table */
 {
-    struct shadow_page_info *sp, *x;
+    struct page_info *sp, *x;
 
     if ( !(SHADOW_AUDIT_ENABLE) )
         return;
@@ -1891,38 +1895,39 @@ static void sh_hash_audit_bucket(struct domain *d, int bucket)
     while ( sp )
     {
         /* Not a shadow? */
-        BUG_ON( sp->mbz != 0 );
+        BUG_ON( (sp->count_info & PGC_count_mask )!= 0 ) ;
         /* Bogus type? */
-        BUG_ON( sp->type == 0 ); 
-        BUG_ON( sp->type > SH_type_max_shadow );
+        BUG_ON( sp->u.sh.type == 0 );
+        BUG_ON( sp->u.sh.type > SH_type_max_shadow );
         /* Wrong bucket? */
-        BUG_ON( sh_hash(sp->backpointer, sp->type) != bucket ); 
+        BUG_ON( sh_hash(sp->v.sh.back, sp->u.sh.type) != bucket );
         /* Duplicate entry? */
-        for ( x = sp->next_shadow; x; x = x->next_shadow )
-            BUG_ON( x->backpointer == sp->backpointer && x->type == sp->type );
+        for ( x = next_shadow(sp); x; x = next_shadow(x) )
+            BUG_ON( x->v.sh.back == sp->v.sh.back &&
+                    x->u.sh.type == sp->u.sh.type );
         /* Follow the backpointer to the guest pagetable */
-        if ( sp->type != SH_type_fl1_32_shadow
-             && sp->type != SH_type_fl1_pae_shadow
-             && sp->type != SH_type_fl1_64_shadow )
+        if ( sp->u.sh.type != SH_type_fl1_32_shadow
+             && sp->u.sh.type != SH_type_fl1_pae_shadow
+             && sp->u.sh.type != SH_type_fl1_64_shadow )
         {
-            struct page_info *gpg = mfn_to_page(_mfn(sp->backpointer));
+            struct page_info *gpg = mfn_to_page(_mfn(sp->v.sh.back));
             /* Bad shadow flags on guest page? */
-            BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
+            BUG_ON( !(gpg->shadow_flags & (1<<sp->u.sh.type)) );
             /* Bad type count on guest page? */
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
-            if ( sp->type == SH_type_l1_32_shadow
-                 || sp->type == SH_type_l1_pae_shadow
-                 || sp->type == SH_type_l1_64_shadow )
+            if ( sp->u.sh.type == SH_type_l1_32_shadow
+                 || sp->u.sh.type == SH_type_l1_pae_shadow
+                 || sp->u.sh.type == SH_type_l1_64_shadow )
             {
                 if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
                      && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
                 {
                     if ( !page_is_out_of_sync(gpg) )
                     {
-                        SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
+                        SHADOW_ERROR("MFN %#"PRpgmfn" shadowed (by %#"PRI_mfn")"
                                      " and not OOS but has typecount %#lx\n",
-                                     sp->backpointer, 
-                                     mfn_x(shadow_page_to_mfn(sp)), 
+                                     sp->v.sh.back,
+                                     mfn_x(page_to_mfn(sp)), 
                                      gpg->u.inuse.type_info);
                         BUG();
                     }
@@ -1933,15 +1938,15 @@ static void sh_hash_audit_bucket(struct domain *d, int bucket)
             if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page 
                  && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
             {
-                SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
+                SHADOW_ERROR("MFN %#"PRpgmfn" shadowed (by %#"PRI_mfn")"
                              " but has typecount %#lx\n",
-                             sp->backpointer, mfn_x(shadow_page_to_mfn(sp)), 
+                             sp->v.sh.back, mfn_x(page_to_mfn(sp)),
                              gpg->u.inuse.type_info);
                 BUG();
             }
         }
         /* That entry was OK; on we go */
-        sp = sp->next_shadow;
+        sp = next_shadow(sp);
     }
 }
 
@@ -1974,15 +1979,15 @@ static void sh_hash_audit(struct domain *d)
  * Returns 0 for success, 1 for error. */
 static int shadow_hash_alloc(struct domain *d)
 {
-    struct shadow_page_info **table;
+    struct page_info **table;
 
     ASSERT(shadow_locked_by_me(d));
     ASSERT(!d->arch.paging.shadow.hash_table);
 
-    table = xmalloc_array(struct shadow_page_info *, SHADOW_HASH_BUCKETS);
+    table = xmalloc_array(struct page_info *, SHADOW_HASH_BUCKETS);
     if ( !table ) return 1;
     memset(table, 0, 
-           SHADOW_HASH_BUCKETS * sizeof (struct shadow_page_info *));
+           SHADOW_HASH_BUCKETS * sizeof (struct page_info *));
     d->arch.paging.shadow.hash_table = table;
     return 0;
 }
@@ -2004,7 +2009,7 @@ mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
  * or INVALID_MFN if it doesn't exist */
 {
     struct domain *d = v->domain;
-    struct shadow_page_info *sp, *prev;
+    struct page_info *sp, *prev;
     key_t key;
 
     ASSERT(shadow_locked_by_me(d));
@@ -2021,21 +2026,21 @@ mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
     prev = NULL;
     while(sp)
     {
-        if ( sp->backpointer == n && sp->type == t )
+        if ( sp->v.sh.back == n && sp->u.sh.type == t )
         {
             /* Pull-to-front if 'sp' isn't already the head item */
             if ( unlikely(sp != d->arch.paging.shadow.hash_table[key]) )
             {
                 if ( unlikely(d->arch.paging.shadow.hash_walking != 0) )
                     /* Can't reorder: someone is walking the hash chains */
-                    return shadow_page_to_mfn(sp);
+                    return page_to_mfn(sp);
                 else 
                 {
                     ASSERT(prev);
                     /* Delete sp from the list */
                     prev->next_shadow = sp->next_shadow;                    
                     /* Re-insert it at the head of the list */
-                    sp->next_shadow = d->arch.paging.shadow.hash_table[key];
+                    set_next_shadow(sp, d->arch.paging.shadow.hash_table[key]);
                     d->arch.paging.shadow.hash_table[key] = sp;
                 }
             }
@@ -2043,10 +2048,10 @@ mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t)
             {
                 perfc_incr(shadow_hash_lookup_head);
             }
-            return shadow_page_to_mfn(sp);
+            return page_to_mfn(sp);
         }
         prev = sp;
-        sp = sp->next_shadow;
+        sp = next_shadow(sp);
     }
 
     perfc_incr(shadow_hash_lookup_miss);
@@ -2058,7 +2063,7 @@ void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
 /* Put a mapping (n,t)->smfn into the hash table */
 {
     struct domain *d = v->domain;
-    struct shadow_page_info *sp;
+    struct page_info *sp;
     key_t key;
     
     ASSERT(shadow_locked_by_me(d));
@@ -2072,8 +2077,8 @@ void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t,
     sh_hash_audit_bucket(d, key);
     
     /* Insert this shadow at the top of the bucket */
-    sp = mfn_to_shadow_page(smfn);
-    sp->next_shadow = d->arch.paging.shadow.hash_table[key];
+    sp = mfn_to_page(smfn);
+    set_next_shadow(sp, d->arch.paging.shadow.hash_table[key]);
     d->arch.paging.shadow.hash_table[key] = sp;
     
     sh_hash_audit_bucket(d, key);
@@ -2084,7 +2089,7 @@ void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
 /* Excise the mapping (n,t)->smfn from the hash table */
 {
     struct domain *d = v->domain;
-    struct shadow_page_info *sp, *x;
+    struct page_info *sp, *x;
     key_t key;
 
     ASSERT(shadow_locked_by_me(d));
@@ -2097,10 +2102,10 @@ void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
     key = sh_hash(n, t);
     sh_hash_audit_bucket(d, key);
     
-    sp = mfn_to_shadow_page(smfn);
+    sp = mfn_to_page(smfn);
     if ( d->arch.paging.shadow.hash_table[key] == sp ) 
         /* Easy case: we're deleting the head item. */
-        d->arch.paging.shadow.hash_table[key] = sp->next_shadow;
+        d->arch.paging.shadow.hash_table[key] = next_shadow(sp);
     else 
     {
         /* Need to search for the one we want */
@@ -2109,15 +2114,15 @@ void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t,
         {
             ASSERT(x); /* We can't have hit the end, since our target is
                         * still in the chain somehwere... */
-            if ( x->next_shadow == sp ) 
+            if ( next_shadow(x) == sp )
             {
                 x->next_shadow = sp->next_shadow;
                 break;
             }
-            x = x->next_shadow;
+            x = next_shadow(x);
         }
     }
-    sp->next_shadow = NULL;
+    set_next_shadow(sp, NULL);
 
     sh_hash_audit_bucket(d, key);
 }
@@ -2139,7 +2144,7 @@ static void hash_foreach(struct vcpu *v,
 {
     int i, done = 0;
     struct domain *d = v->domain;
-    struct shadow_page_info *x;
+    struct page_info *x;
 
     /* Say we're here, to stop hash-lookups reordering the chains */
     ASSERT(shadow_locked_by_me(d));
@@ -2151,14 +2156,14 @@ static void hash_foreach(struct vcpu *v,
         /* WARNING: This is not safe against changes to the hash table.
          * The callback *must* return non-zero if it has inserted or
          * deleted anything from the hash (lookups are OK, though). */
-        for ( x = d->arch.paging.shadow.hash_table[i]; x; x = x->next_shadow )
+        for ( x = d->arch.paging.shadow.hash_table[i]; x; x = next_shadow(x) )
         {
-            if ( callback_mask & (1 << x->type) ) 
+            if ( callback_mask & (1 << x->u.sh.type) )
             {
-                ASSERT(x->type <= 15);
-                ASSERT(callbacks[x->type] != NULL);
-                done = callbacks[x->type](v, shadow_page_to_mfn(x), 
-                                          callback_mfn);
+                ASSERT(x->u.sh.type <= 15);
+                ASSERT(callbacks[x->u.sh.type] != NULL);
+                done = callbacks[x->u.sh.type](v, page_to_mfn(x),
+                                               callback_mfn);
                 if ( done ) break;
             }
         }
@@ -2175,8 +2180,8 @@ static void hash_foreach(struct vcpu *v,
 
 void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
 {
-    struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
-    unsigned int t = sp->type;
+    struct page_info *sp = mfn_to_page(smfn);
+    unsigned int t = sp->u.sh.type;
 
 
     SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn));
@@ -2188,7 +2193,7 @@ void sh_destroy_shadow(struct vcpu *v, mfn_t smfn)
            t == SH_type_fl1_64_shadow  || 
            t == SH_type_monitor_table  || 
            (is_pv_32on64_vcpu(v) && t == SH_type_l4_64_shadow) ||
-           (page_get_owner(mfn_to_page(_mfn(sp->backpointer))) 
+           (page_get_owner(mfn_to_page(_mfn(sp->v.sh.back)))
             == v->domain)); 
 
     /* The down-shifts here are so that the switch statement is on nice
@@ -2440,7 +2445,7 @@ int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
     {
         unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask);
         mfn_t last_smfn = _mfn(v->arch.paging.shadow.last_writeable_pte_smfn);
-        int shtype = mfn_to_shadow_page(last_smfn)->type;
+        int shtype = mfn_to_page(last_smfn)->u.sh.type;
 
         if ( callbacks[shtype] ) 
             callbacks[shtype](v, last_smfn, gmfn);
@@ -2483,25 +2488,25 @@ int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
 int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
                                      mfn_t smfn, unsigned long off)
 {
-    struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
+    struct page_info *sp = mfn_to_page(smfn);
     
     ASSERT(mfn_valid(smfn));
     ASSERT(mfn_valid(gmfn));
     
-    if ( sp->type == SH_type_l1_32_shadow
-         || sp->type == SH_type_fl1_32_shadow )
+    if ( sp->u.sh.type == SH_type_l1_32_shadow
+         || sp->u.sh.type == SH_type_fl1_32_shadow )
     {
         return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2)
             (v, gmfn, smfn, off);
     }
 #if CONFIG_PAGING_LEVELS >= 3
-    else if ( sp->type == SH_type_l1_pae_shadow
-              || sp->type == SH_type_fl1_pae_shadow )
+    else if ( sp->u.sh.type == SH_type_l1_pae_shadow
+              || sp->u.sh.type == SH_type_fl1_pae_shadow )
         return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3)
             (v, gmfn, smfn, off);
 #if CONFIG_PAGING_LEVELS >= 4
-    else if ( sp->type == SH_type_l1_64_shadow
-              || sp->type == SH_type_fl1_64_shadow )
+    else if ( sp->u.sh.type == SH_type_l1_64_shadow
+              || sp->u.sh.type == SH_type_fl1_64_shadow )
         return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4)
             (v, gmfn, smfn, off);
 #endif
@@ -2584,7 +2589,7 @@ int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
                && (page->u.inuse.type_info & PGT_count_mask) == 0) )
         {
             SHADOW_ERROR("can't find all mappings of mfn %lx: "
-                          "c=%08x t=%08lx\n", mfn_x(gmfn), 
+                          "c=%08lx t=%08lx\n", mfn_x(gmfn), 
                           page->count_info, page->u.inuse.type_info);
         }
     }
@@ -2603,17 +2608,17 @@ static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
 /* Follow this shadow's up-pointer, if it has one, and remove the reference
  * found there.  Returns 1 if that was the only reference to this shadow */
 {
-    struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
+    struct page_info *sp = mfn_to_page(smfn);
     mfn_t pmfn;
     void *vaddr;
     int rc;
 
-    ASSERT(sp->type > 0);
-    ASSERT(sp->type < SH_type_max_shadow);
-    ASSERT(sp->type != SH_type_l2_32_shadow);
-    ASSERT(sp->type != SH_type_l2_pae_shadow);
-    ASSERT(sp->type != SH_type_l2h_pae_shadow);
-    ASSERT(sp->type != SH_type_l4_64_shadow);
+    ASSERT(sp->u.sh.type > 0);
+    ASSERT(sp->u.sh.type < SH_type_max_shadow);
+    ASSERT(sp->u.sh.type != SH_type_l2_32_shadow);
+    ASSERT(sp->u.sh.type != SH_type_l2_pae_shadow);
+    ASSERT(sp->u.sh.type != SH_type_l2h_pae_shadow);
+    ASSERT(sp->u.sh.type != SH_type_l4_64_shadow);
     
     if (sp->up == 0) return 0;
     pmfn = _mfn(sp->up >> PAGE_SHIFT);
@@ -2624,10 +2629,10 @@ static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
     ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
     
     /* Is this the only reference to this shadow? */
-    rc = (sp->count == 1) ? 1 : 0;
+    rc = (sp->u.sh.count == 1) ? 1 : 0;
 
     /* Blank the offending entry */
-    switch (sp->type) 
+    switch (sp->u.sh.type)
     {
     case SH_type_l1_32_shadow:
     case SH_type_l2_32_shadow:
@@ -3158,7 +3163,6 @@ void shadow_teardown(struct domain *d)
 {
     struct vcpu *v;
     mfn_t mfn;
-    struct list_head *entry, *n;
     struct page_info *pg;
 
     ASSERT(d->is_dying);
@@ -3210,12 +3214,8 @@ void shadow_teardown(struct domain *d)
     }
 #endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */
 
-    list_for_each_safe(entry, n, &d->arch.paging.shadow.p2m_freelist)
-    {
-        list_del(entry);
-        pg = list_entry(entry, struct page_info, list);
+    while ( (pg = page_list_remove_head(&d->arch.paging.shadow.p2m_freelist)) )
         shadow_free_p2m_page(d, pg);
-    }
 
     if ( d->arch.paging.shadow.total_pages != 0 )
     {
@@ -3659,7 +3659,6 @@ int shadow_track_dirty_vram(struct domain *d,
         for ( i = 0; i < nr; i++ ) {
             mfn_t mfn = gfn_to_mfn(d, begin_pfn + i, &t);
             struct page_info *page;
-            u32 count_info;
             int dirty = 0;
             paddr_t sl1ma = d->dirty_vram->sl1ma[i];
 
@@ -3670,8 +3669,7 @@ int shadow_track_dirty_vram(struct domain *d,
             else
             {
                 page = mfn_to_page(mfn);
-                count_info = page->u.inuse.type_info & PGT_count_mask;
-                switch (count_info)
+                switch (page->u.inuse.type_info & PGT_count_mask)
                 {
                 case 0:
                     /* No guest reference, nothing to track. */
index f3ac8bfc3922f033267b08483ca1ed8510504246..4e1fe8a7c040d18eb760dd0114aff939028159f4 100644 (file)
@@ -35,6 +35,7 @@
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/cacheattr.h>
 #include <asm/mtrr.h>
+#include <asm/guest_pt.h>
 #include "private.h"
 #include "types.h"
 
@@ -156,95 +157,23 @@ delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
         put_page(mfn_to_page(gmfn));
 }
 
-/**************************************************************************/
-/* CPU feature support querying */
-
-static inline int
-guest_supports_superpages(struct vcpu *v)
-{
-    /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
-     * CR4.PSE is set or the guest is in PAE or long mode. 
-     * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
-    return (is_hvm_vcpu(v) && 
-            (GUEST_PAGING_LEVELS != 2 
-             || !hvm_paging_enabled(v)
-             || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
-}
-
-static inline int
-guest_supports_nx(struct vcpu *v)
-{
-    if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
-        return 0;
-    if ( !is_hvm_vcpu(v) )
-        return cpu_has_nx;
-    return hvm_nx_enabled(v);
-}
-
 
 /**************************************************************************/
 /* Functions for walking the guest page tables */
 
-/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
-static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) 
-{
-    static uint32_t flags[] = {
-        /* I/F -  Usr Wr */
-        /* 0   0   0   0 */ _PAGE_PRESENT, 
-        /* 0   0   0   1 */ _PAGE_PRESENT|_PAGE_RW,
-        /* 0   0   1   0 */ _PAGE_PRESENT|_PAGE_USER,
-        /* 0   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
-        /* 0   1   0   0 */ _PAGE_PRESENT, 
-        /* 0   1   0   1 */ _PAGE_PRESENT|_PAGE_RW,
-        /* 0   1   1   0 */ _PAGE_PRESENT|_PAGE_USER,
-        /* 0   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
-        /* 1   0   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
-        /* 1   0   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
-        /* 1   0   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
-        /* 1   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
-        /* 1   1   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
-        /* 1   1   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
-        /* 1   1   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
-        /* 1   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
-    };
-
-    /* Don't demand not-NX if the CPU wouldn't enforce it. */
-    if ( !guest_supports_nx(v) )
-        pfec &= ~PFEC_insn_fetch;
-
-    /* Don't demand R/W if the CPU wouldn't enforce it. */
-    if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) 
-         && !(pfec & PFEC_user_mode) )
-        pfec &= ~PFEC_write_access;
-
-    return flags[(pfec & 0x1f) >> 1];
-}
-
-/* Modify a guest pagetable entry to set the Accessed and Dirty bits.
- * Returns non-zero if it actually writes to guest memory. */
-static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
+static inline uint32_t
+sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
+                     uint32_t pfec)
 {
-    guest_intpte_t old, new;
-    int ret = 0;
-
-    old = *(guest_intpte_t *)walk_p;
-    new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
-    if ( old != new ) 
-    {
-        /* Write the new entry into the walk, and try to write it back
-         * into the guest table as well.  If the guest table has changed
-         * under out feet then leave it alone. */
-        *(guest_intpte_t *)walk_p = new;
-        if( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) 
-            ret = 1;
-
-        /* FIXME -- this code is longer than necessary */
-        if(set_dirty)
-            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_AD);
-        else
-            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_A);
-    }
-    return ret;
+    return guest_walk_tables(v, va, gw, pfec, 
+#if GUEST_PAGING_LEVELS == 3 /* PAE */
+                             _mfn(INVALID_MFN),
+                             v->arch.paging.shadow.gl3e
+#else /* 32 or 64 */
+                             pagetable_get_mfn(v->arch.guest_table),
+                             v->arch.paging.shadow.guest_vtable
+#endif
+                             );
 }
 
 /* This validation is called with lock held, and after write permission
@@ -254,7 +183,7 @@ static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
  * Return 1 to indicate success and 0 for inconsistency
  */
 static inline uint32_t
-shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw)
+shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version)
 {
     struct domain *d = v->domain;
     guest_l1e_t *l1p;
@@ -267,9 +196,8 @@ shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw)
 
     ASSERT(shadow_locked_by_me(d));
 
-    if ( gw->version ==
-         atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
-        return 1;
+    if ( version == atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
+         return 1;
 
     /* We may consider caching guest page mapping from last
      * guest table walk. However considering this check happens
@@ -364,239 +292,6 @@ gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
     return rc;
 }
 
-/* Walk the guest pagetables, after the manner of a hardware walker. 
- *
- * Inputs: a vcpu, a virtual address, a walk_t to fill, a 
- *         pointer to a pagefault code
- * 
- * We walk the vcpu's guest pagetables, filling the walk_t with what we
- * see and adding any Accessed and Dirty bits that are needed in the
- * guest entries.  Using the pagefault code, we check the permissions as
- * we go.  For the purposes of reading pagetables we treat all non-RAM
- * memory as contining zeroes.
- * 
- * The walk is done in a lock-free style, with some sanity check postponed
- * after grabbing shadow lock later. Those delayed checks will make sure
- * no inconsistent mapping being translated into shadow page table.
- * 
- * Returns 0 for success, or the set of permission bits that we failed on 
- * if the walk did not complete.
- * N.B. This is different from the old return code but almost no callers
- * checked the old return code anyway.
- */
-static uint32_t
-guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
-{
-    struct domain *d = v->domain;
-    p2m_type_t p2mt;
-    guest_l1e_t *l1p = NULL;
-    guest_l2e_t *l2p = NULL;
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-    guest_l3e_t *l3p = NULL;
-    guest_l4e_t *l4p;
-#endif
-    uint32_t gflags, mflags, rc = 0;
-    int pse;
-
-    perfc_incr(shadow_guest_walk);
-    memset(gw, 0, sizeof(*gw));
-    gw->va = va;
-
-    gw->version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
-    rmb();
-
-    /* Mandatory bits that must be set in every entry.  We invert NX, to
-     * calculate as if there were an "X" bit that allowed access. 
-     * We will accumulate, in rc, the set of flags that are missing. */
-    mflags = mandatory_flags(v, pfec);
-
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-
-    /* Get the l4e from the top level table and check its flags*/
-    gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
-    l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable);
-    gw->l4e = l4p[guest_l4_table_offset(va)];
-    gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
-    rc |= ((gflags & mflags) ^ mflags);
-    if ( rc & _PAGE_PRESENT ) goto out;
-
-    /* Map the l3 table */
-    gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
-    if ( !p2m_is_ram(p2mt) ) 
-    {
-        rc |= _PAGE_PRESENT;
-        goto out;
-    }
-    ASSERT(mfn_valid(gw->l3mfn));
-
-    /* Get the l3e and check its flags*/
-    l3p = sh_map_domain_page(gw->l3mfn);
-    gw->l3e = l3p[guest_l3_table_offset(va)];
-    gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
-    rc |= ((gflags & mflags) ^ mflags);
-    if ( rc & _PAGE_PRESENT )
-        goto out;
-
-#else /* PAE only... */
-
-    /* Get l3e from the cache of the top level table and check its flag */
-    gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
-    if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) 
-    {
-        rc |= _PAGE_PRESENT;
-        goto out;
-    }
-
-#endif /* PAE or 64... */
-
-    /* Map the l2 table */
-    gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
-    if ( !p2m_is_ram(p2mt) )
-    {
-        rc |= _PAGE_PRESENT;
-        goto out;
-    }
-    ASSERT(mfn_valid(gw->l2mfn));
-
-    /* Get the l2e */
-    l2p = sh_map_domain_page(gw->l2mfn);
-    gw->l2e = l2p[guest_l2_table_offset(va)];
-
-#else /* 32-bit only... */
-
-    /* Get l2e from the top level table */
-    gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
-    l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable);
-    gw->l2e = l2p[guest_l2_table_offset(va)];
-
-#endif /* All levels... */
-
-    gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
-    rc |= ((gflags & mflags) ^ mflags);
-    if ( rc & _PAGE_PRESENT )
-        goto out;
-
-    pse = (guest_supports_superpages(v) && 
-           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)); 
-
-    if ( pse )
-    {
-        /* Special case: this guest VA is in a PSE superpage, so there's
-         * no guest l1e.  We make one up so that the propagation code
-         * can generate a shadow l1 table.  Start with the gfn of the 
-         * first 4k-page of the superpage. */
-        gfn_t start = guest_l2e_get_gfn(gw->l2e);
-        /* Grant full access in the l1e, since all the guest entry's 
-         * access controls are enforced in the shadow l2e. */
-        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
-                     _PAGE_ACCESSED|_PAGE_DIRTY);
-        /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
-         * of the level 1. */
-        if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) ) 
-            flags |= _PAGE_PAT;
-        /* Copy the cache-control bits to the l1 as well, because we
-         * can't represent PAT in the (non-PSE) shadow l2e. :(
-         * This could cause problems if a guest ever maps an area of
-         * memory with superpages using more than one caching mode. */
-        flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
-        /* Increment the pfn by the right number of 4k pages.  
-         * The ~0x1 is to mask out the PAT bit mentioned above. */
-        start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
-        gw->l1e = guest_l1e_from_gfn(start, flags);
-        gw->l1mfn = _mfn(INVALID_MFN);
-    } 
-    else 
-    {
-        /* Not a superpage: carry on and find the l1e. */
-        gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
-        if ( !p2m_is_ram(p2mt) )
-        {
-            rc |= _PAGE_PRESENT;
-            goto out;
-        }
-        ASSERT(mfn_valid(gw->l1mfn));
-        l1p = sh_map_domain_page(gw->l1mfn);
-        gw->l1e = l1p[guest_l1_table_offset(va)];
-        gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
-        rc |= ((gflags & mflags) ^ mflags);
-    }
-
-    /* Go back and set accessed and dirty bits only if the walk was a
-     * success.  Although the PRMs say higher-level _PAGE_ACCESSED bits
-     * get set whenever a lower-level PT is used, at least some hardware
-     * walkers behave this way. */
-    if ( rc == 0 ) 
-    {
-#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
-        if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
-            paging_mark_dirty(d, mfn_x(gw->l4mfn));
-        if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
-            paging_mark_dirty(d, mfn_x(gw->l3mfn));
-#endif
-        if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
-                         (pse && (pfec & PFEC_write_access))) )
-            paging_mark_dirty(d, mfn_x(gw->l2mfn));            
-        if ( !pse ) 
-        {
-            if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, 
-                             (pfec & PFEC_write_access)) )
-                paging_mark_dirty(d, mfn_x(gw->l1mfn));
-        }
-    }
-
- out:
-#if GUEST_PAGING_LEVELS == 4
-    if ( l3p ) sh_unmap_domain_page(l3p);
-#endif
-#if GUEST_PAGING_LEVELS >= 3
-    if ( l2p ) sh_unmap_domain_page(l2p);
-#endif
-    if ( l1p ) sh_unmap_domain_page(l1p);
-
-    return rc;
-}
-
-/* Given a walk_t, translate the gw->va into the guest's notion of the
- * corresponding frame number. */
-static inline gfn_t
-guest_walk_to_gfn(walk_t *gw)
-{
-    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
-        return _gfn(INVALID_GFN);
-    return guest_l1e_get_gfn(gw->l1e);
-}
-
-/* Given a walk_t, translate the gw->va into the guest's notion of the
- * corresponding physical address. */
-static inline paddr_t
-guest_walk_to_gpa(walk_t *gw)
-{
-    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
-        return 0;
-    return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
-}
-
-#if 0 /* Keep for debugging */
-/* Pretty-print the contents of a guest-walk */
-static inline void print_gw(walk_t *gw)
-{
-    SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-    SHADOW_PRINTK("   l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
-    SHADOW_PRINTK("   l4e=%" SH_PRI_gpte "\n", gw->l4e.l4);
-    SHADOW_PRINTK("   l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
-#endif /* PAE or 64... */
-    SHADOW_PRINTK("   l3e=%" SH_PRI_gpte "\n", gw->l3e.l3);
-#endif /* All levels... */
-    SHADOW_PRINTK("   l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
-    SHADOW_PRINTK("   l2e=%" SH_PRI_gpte "\n", gw->l2e.l2);
-    SHADOW_PRINTK("   l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
-    SHADOW_PRINTK("   l1e=%" SH_PRI_gpte "\n", gw->l1e.l1);
-}
-#endif /* 0 */
-
 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
 /* Lightweight audit: pass all the shadows associated with this guest walk
  * through the audit mechanisms */
@@ -657,7 +352,7 @@ sh_guest_map_l1e(struct vcpu *v, unsigned long addr,
     // XXX -- this is expensive, but it's easy to cobble together...
     // FIXME!
 
-    if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0 
+    if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0 
          && mfn_valid(gw.l1mfn) )
     {
         if ( gl1mfn )
@@ -679,7 +374,7 @@ sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e)
     // XXX -- this is expensive, but it's easy to cobble together...
     // FIXME!
 
-    (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
+    (void) sh_walk_guest_tables(v, addr, &gw, PFEC_page_present);
     *(guest_l1e_t *)eff_l1e = gw.l1e;
 }
 #endif /* CONFIG == GUEST (== SHADOW) */
@@ -851,15 +546,32 @@ _sh_propagate(struct vcpu *v,
          !is_xen_heap_mfn(mfn_x(target_mfn)) )
     {
         unsigned int type;
+
+        /* compute the PAT index for shadow page entry when VT-d is enabled
+         * and device assigned. 
+         * 1) direct MMIO: compute the PAT index with gMTRR=UC and gPAT.
+         * 2) if enables snoop control, compute the PAT index as WB.
+         * 3) if disables snoop control, compute the PAT index with
+         *    gMTRR and gPAT.
+         */
         if ( hvm_get_mem_pinned_cacheattr(d, gfn_x(target_gfn), &type) )
             sflags |= pat_type_2_pte_flags(type);
         else if ( d->arch.hvm_domain.is_in_uc_mode )
             sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
+        else if ( p2mt == p2m_mmio_direct )
+            sflags |= get_pat_flags(v,
+                                    gflags,
+                                    gfn_to_paddr(target_gfn),
+                                    ((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT,
+                                    MTRR_TYPE_UNCACHABLE); 
+        else if ( iommu_snoop )
+            sflags |= pat_type_2_pte_flags(PAT_TYPE_WRBACK);
         else
             sflags |= get_pat_flags(v,
                                     gflags,
                                     gfn_to_paddr(target_gfn),
-                                    ((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT);
+                                    ((paddr_t)mfn_x(target_mfn)) << PAGE_SHIFT,
+                                    NO_HARDCODE_MEM_TYPE);
     }
 
     // Set the A&D bits for higher level shadows.
@@ -1171,9 +883,6 @@ static int shadow_set_l4e(struct vcpu *v,
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
         }
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
-        shadow_resync_all(v, 0);
-#endif
     }
 
     /* Write the new entry */
@@ -1219,9 +928,6 @@ static int shadow_set_l3e(struct vcpu *v,
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
         }
-#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
-        shadow_resync_all(v, 0);
-#endif
     }
 
     /* Write the new entry */
@@ -1284,13 +990,13 @@ static int shadow_set_l2e(struct vcpu *v,
         }
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
         {
-            struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
-            mfn_t gl1mfn = _mfn(sp->backpointer);
+            struct page_info *sp = mfn_to_page(sl1mfn);
+            mfn_t gl1mfn = _mfn(sp->v.sh.back);
 
             /* If the shadow is a fl1 then the backpointer contains
                the GFN instead of the GMFN, and it's definitely not
                OOS. */
-            if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
+            if ( (sp->u.sh.type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
                  && mfn_is_out_of_sync(gl1mfn) )
                 sh_resync(v, gl1mfn);
         }
@@ -1333,23 +1039,23 @@ static inline void shadow_vram_get_l1e(shadow_l1e_t new_sl1e,
                                        mfn_t sl1mfn,
                                        struct domain *d)
 { 
-    mfn_t mfn;
+    mfn_t mfn = shadow_l1e_get_mfn(new_sl1e);
+    int flags = shadow_l1e_get_flags(new_sl1e);
     unsigned long gfn;
 
-    if ( !d->dirty_vram ) return;
-
-    mfn = shadow_l1e_get_mfn(new_sl1e);
-
-    if ( !mfn_valid(mfn) ) return; /* m2p for mmio_direct may not exist */
+    if ( !d->dirty_vram         /* tracking disabled? */
+         || !(flags & _PAGE_RW) /* read-only mapping? */
+         || !mfn_valid(mfn) )   /* mfn can be invalid in mmio_direct */
+        return;
 
     gfn = mfn_to_gfn(d, mfn);
 
-    if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
+    if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) )
+    {
         unsigned long i = gfn - d->dirty_vram->begin_pfn;
         struct page_info *page = mfn_to_page(mfn);
-        u32 count_info = page->u.inuse.type_info & PGT_count_mask;
         
-        if ( count_info == 1 )
+        if ( (page->u.inuse.type_info & PGT_count_mask) == 1 )
             /* Initial guest reference, record it */
             d->dirty_vram->sl1ma[i] = pfn_to_paddr(mfn_x(sl1mfn))
                 | ((unsigned long)sl1e & ~PAGE_MASK);
@@ -1361,49 +1067,58 @@ static inline void shadow_vram_put_l1e(shadow_l1e_t old_sl1e,
                                        mfn_t sl1mfn,
                                        struct domain *d)
 {
-    mfn_t mfn;
+    mfn_t mfn = shadow_l1e_get_mfn(old_sl1e);
+    int flags = shadow_l1e_get_flags(old_sl1e);
     unsigned long gfn;
 
-    if ( !d->dirty_vram ) return;
-
-    mfn = shadow_l1e_get_mfn(old_sl1e);
-
-    if ( !mfn_valid(mfn) ) return;
+    if ( !d->dirty_vram         /* tracking disabled? */
+         || !(flags & _PAGE_RW) /* read-only mapping? */
+         || !mfn_valid(mfn) )   /* mfn can be invalid in mmio_direct */
+        return;
 
     gfn = mfn_to_gfn(d, mfn);
 
-    if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) ) {
+    if ( (gfn >= d->dirty_vram->begin_pfn) && (gfn < d->dirty_vram->end_pfn) )
+    {
         unsigned long i = gfn - d->dirty_vram->begin_pfn;
         struct page_info *page = mfn_to_page(mfn);
-        u32 count_info = page->u.inuse.type_info & PGT_count_mask;
         int dirty = 0;
         paddr_t sl1ma = pfn_to_paddr(mfn_x(sl1mfn))
             | ((unsigned long)sl1e & ~PAGE_MASK);
 
-        if ( count_info == 1 ) {
+        if ( (page->u.inuse.type_info & PGT_count_mask) == 1 )
+        {
             /* Last reference */
             if ( d->dirty_vram->sl1ma[i] == INVALID_PADDR ) {
                 /* We didn't know it was that one, let's say it is dirty */
                 dirty = 1;
-            } else {
+            }
+            else
+            {
                 ASSERT(d->dirty_vram->sl1ma[i] == sl1ma);
                 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
-                if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_DIRTY )
+                if ( flags & _PAGE_DIRTY )
                     dirty = 1;
             }
-        } else {
+        }
+        else
+        {
             /* We had more than one reference, just consider the page dirty. */
             dirty = 1;
             /* Check that it's not the one we recorded. */
-            if ( d->dirty_vram->sl1ma[i] == sl1ma ) {
+            if ( d->dirty_vram->sl1ma[i] == sl1ma )
+            {
                 /* Too bad, we remembered the wrong one... */
                 d->dirty_vram->sl1ma[i] = INVALID_PADDR;
-            } else {
+            }
+            else
+            {
                 /* Ok, our recorded sl1e is still pointing to this page, let's
                  * just hope it will remain. */
             }
         }
-        if ( dirty ) {
+        if ( dirty )
+        {
             d->dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8);
             d->dirty_vram->last_dirty = NOW();
         }
@@ -1505,8 +1220,8 @@ static inline void increment_ptr_to_guest_entry(void *ptr)
 do {                                                                    \
     int _i;                                                             \
     shadow_l1e_t *_sp = sh_map_domain_page((_sl1mfn));                  \
-    ASSERT(mfn_to_shadow_page(_sl1mfn)->type == SH_type_l1_shadow       \
-           || mfn_to_shadow_page(_sl1mfn)->type == SH_type_fl1_shadow); \
+    ASSERT(mfn_to_page(_sl1mfn)->u.sh.type == SH_type_l1_shadow  \
+           || mfn_to_page(_sl1mfn)->u.sh.type == SH_type_fl1_shadow);\
     for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ )              \
     {                                                                   \
         (_sl1e) = _sp + _i;                                             \
@@ -1543,7 +1258,7 @@ do {                                                                    \
 do {                                                                      \
     int _i, _j, __done = 0;                                               \
     int _xen = !shadow_mode_external(_dom);                               \
-    ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_32_shadow);    \
+    ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_32_shadow);\
     for ( _j = 0; _j < 4 && !__done; _j++ )                               \
     {                                                                     \
         shadow_l2e_t *_sp = sh_map_domain_page(_sl2mfn);                  \
@@ -1571,11 +1286,11 @@ do {                                                                       \
     int _i;                                                                \
     int _xen = !shadow_mode_external(_dom);                                \
     shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn));                     \
-    ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_pae_shadow      \
-           || mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_pae_shadow);\
+    ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_pae_shadow \
+           || mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2h_pae_shadow);\
     for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ )                 \
         if ( (!(_xen))                                                     \
-             || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_pae_shadow\
+             || mfn_to_page(_sl2mfn)->u.sh.type != SH_type_l2h_pae_shadow\
              || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES))                  \
                  < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
         {                                                                  \
@@ -1596,13 +1311,13 @@ do {                                                                        \
     int _i;                                                                 \
     int _xen = !shadow_mode_external(_dom);                                 \
     shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn));                      \
-    ASSERT(mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2_64_shadow ||     \
-           mfn_to_shadow_page(_sl2mfn)->type == SH_type_l2h_64_shadow);     \
+    ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_64_shadow ||\
+           mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2h_64_shadow);\
     for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ )                  \
     {                                                                       \
         if ( (!(_xen))                                                      \
              || !is_pv_32on64_domain(_dom)                                  \
-             || mfn_to_shadow_page(_sl2mfn)->type != SH_type_l2h_64_shadow  \
+             || mfn_to_page(_sl2mfn)->u.sh.type != SH_type_l2h_64_shadow\
              || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) )           \
         {                                                                   \
             (_sl2e) = _sp + _i;                                             \
@@ -1624,7 +1339,7 @@ do {                                                                        \
 do {                                                                    \
     int _i;                                                             \
     shadow_l3e_t *_sp = sh_map_domain_page((_sl3mfn));                  \
-    ASSERT(mfn_to_shadow_page(_sl3mfn)->type == SH_type_l3_64_shadow);  \
+    ASSERT(mfn_to_page(_sl3mfn)->u.sh.type == SH_type_l3_64_shadow);\
     for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ )              \
     {                                                                   \
         (_sl3e) = _sp + _i;                                             \
@@ -1642,7 +1357,7 @@ do {                                                                    \
     shadow_l4e_t *_sp = sh_map_domain_page((_sl4mfn));                  \
     int _xen = !shadow_mode_external(_dom);                             \
     int _i;                                                             \
-    ASSERT(mfn_to_shadow_page(_sl4mfn)->type == SH_type_l4_64_shadow);  \
+    ASSERT(mfn_to_page(_sl4mfn)->u.sh.type == SH_type_l4_64_shadow);\
     for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ )              \
     {                                                                   \
         if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) )                  \
@@ -1817,7 +1532,7 @@ sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
          && shadow_type != SH_type_l2h_pae_shadow 
          && shadow_type != SH_type_l4_64_shadow )
         /* Lower-level shadow, not yet linked form a higher level */
-        mfn_to_shadow_page(smfn)->up = 0;
+        mfn_to_page(smfn)->up = 0;
 
 #if GUEST_PAGING_LEVELS == 4
 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL) 
@@ -1830,14 +1545,12 @@ sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
          * of them, decide that this isn't an old linux guest, and stop
          * pinning l3es.  This is not very quick but it doesn't happen
          * very often. */
-        struct list_head *l, *t;
-        struct shadow_page_info *sp;
+        struct page_info *sp, *t;
         struct vcpu *v2;
         int l4count = 0, vcpus = 0;
-        list_for_each(l, &v->domain->arch.paging.shadow.pinned_shadows)
+        page_list_for_each(sp, &v->domain->arch.paging.shadow.pinned_shadows)
         {
-            sp = list_entry(l, struct shadow_page_info, list);
-            if ( sp->type == SH_type_l4_64_shadow )
+            if ( sp->u.sh.type == SH_type_l4_64_shadow )
                 l4count++;
         }
         for_each_vcpu ( v->domain, v2 ) 
@@ -1845,11 +1558,10 @@ sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
         if ( l4count > 2 * vcpus ) 
         {
             /* Unpin all the pinned l3 tables, and don't pin any more. */
-            list_for_each_safe(l, t, &v->domain->arch.paging.shadow.pinned_shadows)
+            page_list_for_each_safe(sp, t, &v->domain->arch.paging.shadow.pinned_shadows)
             {
-                sp = list_entry(l, struct shadow_page_info, list);
-                if ( sp->type == SH_type_l3_64_shadow )
-                    sh_unpin(v, shadow_page_to_mfn(sp));
+                if ( sp->u.sh.type == SH_type_l3_64_shadow )
+                    sh_unpin(v, page_to_mfn(sp));
             }
             v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL;
         }
@@ -2021,7 +1733,8 @@ static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v,
 static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v, 
                                                 walk_t *gw, 
                                                 mfn_t *sl3mfn,
-                                                fetch_type_t ft)
+                                                fetch_type_t ft,
+                                                int *resync)
 {
     mfn_t sl4mfn;
     shadow_l4e_t *sl4e;
@@ -2051,6 +1764,11 @@ static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
         ASSERT((r & SHADOW_SET_FLUSH) == 0);
         if ( r & SHADOW_SET_ERROR )
             return NULL;
+
+#if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
+        *resync |= 1;
+#endif
+
     }
     /* Now follow it down a level.  Guaranteed to succeed. */
     return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
@@ -2061,14 +1779,15 @@ static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v,
 static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v, 
                                                 walk_t *gw, 
                                                 mfn_t *sl2mfn,
-                                                fetch_type_t ft)
+                                                fetch_type_t ft,
+                                                int *resync)
 {
 #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
     mfn_t sl3mfn = _mfn(INVALID_MFN);
     shadow_l3e_t *sl3e;
     if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */
     /* Get the l3e */
-    sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
+    sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft, resync);
     if ( sl3e == NULL ) return NULL; 
     if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) 
     {
@@ -2100,6 +1819,11 @@ static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v,
         ASSERT((r & SHADOW_SET_FLUSH) == 0);
         if ( r & SHADOW_SET_ERROR )
             return NULL;        
+
+#if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
+        *resync |= 1;
+#endif
+
     }
     /* Now follow it down a level.  Guaranteed to succeed. */
     return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
@@ -2132,11 +1856,13 @@ static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
                                                 fetch_type_t ft)
 {
     mfn_t sl2mfn;
+    int resync = 0;
     shadow_l2e_t *sl2e;
 
     /* Get the l2e */
-    sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
+    sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft, &resync);
     if ( sl2e == NULL ) return NULL;
+
     /* Install the sl1 in the l2e if it wasn't there or if we need to
      * re-do it to fix a PSE dirty bit. */
     if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT 
@@ -2182,6 +1908,7 @@ static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
         ASSERT((r & SHADOW_SET_FLUSH) == 0);        
         if ( r & SHADOW_SET_ERROR )
             return NULL;
+
         /* This next line is important: in 32-on-PAE and 32-on-64 modes,
          * the guest l1 table has an 8k shadow, and we need to return
          * the right mfn of the pair. This call will set it for us as a
@@ -2189,6 +1916,14 @@ static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
          * compiled out.) */
         (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
     }
+
+#if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
+    /* All pages walked are now pagetables. Safe to resync pages
+       in case level 4 or 3 shadows were set. */
+    if ( resync )
+        shadow_resync_all(v, 0);
+#endif
+
     /* Now follow it down a level.  Guaranteed to succeed. */
     return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
 }
@@ -2209,7 +1944,7 @@ static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v,
 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
 {
     shadow_l4e_t *sl4e;
-    u32 t = mfn_to_shadow_page(smfn)->type;
+    u32 t = mfn_to_page(smfn)->u.sh.type;
     mfn_t gmfn, sl4mfn;
 
     SHADOW_DEBUG(DESTROY_SHADOW,
@@ -2217,7 +1952,7 @@ void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
     ASSERT(t == SH_type_l4_shadow);
 
     /* Record that the guest page isn't shadowed any more (in this type) */
-    gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
+    gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
     delete_shadow_status(v, gmfn, t, smfn);
     shadow_demote(v, gmfn, t);
     /* Decrement refcounts of all the old entries */
@@ -2238,7 +1973,7 @@ void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
 void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
 {
     shadow_l3e_t *sl3e;
-    u32 t = mfn_to_shadow_page(smfn)->type;
+    u32 t = mfn_to_page(smfn)->u.sh.type;
     mfn_t gmfn, sl3mfn;
 
     SHADOW_DEBUG(DESTROY_SHADOW,
@@ -2246,7 +1981,7 @@ void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
     ASSERT(t == SH_type_l3_shadow);
 
     /* Record that the guest page isn't shadowed any more (in this type) */
-    gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
+    gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
     delete_shadow_status(v, gmfn, t, smfn);
     shadow_demote(v, gmfn, t);
 
@@ -2268,7 +2003,7 @@ void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
 void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
 {
     shadow_l2e_t *sl2e;
-    u32 t = mfn_to_shadow_page(smfn)->type;
+    u32 t = mfn_to_page(smfn)->u.sh.type;
     mfn_t gmfn, sl2mfn;
 
     SHADOW_DEBUG(DESTROY_SHADOW,
@@ -2281,7 +2016,7 @@ void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
 #endif
 
     /* Record that the guest page isn't shadowed any more (in this type) */
-    gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
+    gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
     delete_shadow_status(v, gmfn, t, smfn);
     shadow_demote(v, gmfn, t);
 
@@ -2302,7 +2037,7 @@ void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
 {
     struct domain *d = v->domain;
     shadow_l1e_t *sl1e;
-    u32 t = mfn_to_shadow_page(smfn)->type;
+    u32 t = mfn_to_page(smfn)->u.sh.type;
 
     SHADOW_DEBUG(DESTROY_SHADOW,
                   "%s(%05lx)\n", __func__, mfn_x(smfn));
@@ -2311,12 +2046,12 @@ void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
     /* Record that the guest page isn't shadowed any more (in this type) */
     if ( t == SH_type_fl1_shadow )
     {
-        gfn_t gfn = _gfn(mfn_to_shadow_page(smfn)->backpointer);
+        gfn_t gfn = _gfn(mfn_to_page(smfn)->v.sh.back);
         delete_fl1_shadow_status(v, gfn, smfn);
     }
     else 
     {
-        mfn_t gmfn = _mfn(mfn_to_shadow_page(smfn)->backpointer);
+        mfn_t gmfn = _mfn(mfn_to_page(smfn)->v.sh.back);
         delete_shadow_status(v, gmfn, t, smfn);
         shadow_demote(v, gmfn, t);
     }
@@ -2342,7 +2077,7 @@ void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
 void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
 {
     struct domain *d = v->domain;
-    ASSERT(mfn_to_shadow_page(mmfn)->type == SH_type_monitor_table);
+    ASSERT(mfn_to_page(mmfn)->u.sh.type == SH_type_monitor_table);
 
 #if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
     {
@@ -2458,11 +2193,16 @@ static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
     if ( guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT )
     {
         gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e);
-        mfn_t gl3mfn = gfn_to_mfn(d, gl3gfn, &p2mt);
+        mfn_t gl3mfn = gfn_to_mfn_query(d, gl3gfn, &p2mt);
         if ( p2m_is_ram(p2mt) )
             sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
-        else
+        else if ( p2mt != p2m_populate_on_demand )
             result |= SHADOW_SET_ERROR;
+
+#if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
+        if ( mfn_valid(sl3mfn) )
+            shadow_resync_all(v, 0);
+#endif
     }
     l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch);
 
@@ -2510,11 +2250,16 @@ static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
     if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT )
     {
         gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e);
-        mfn_t gl2mfn = gfn_to_mfn(v->domain, gl2gfn, &p2mt);
+        mfn_t gl2mfn = gfn_to_mfn_query(v->domain, gl2gfn, &p2mt);
         if ( p2m_is_ram(p2mt) )
             sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
-        else
+        else if ( p2mt != p2m_populate_on_demand )
             result |= SHADOW_SET_ERROR;
+
+#if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
+        if ( mfn_valid(sl2mfn) )
+            shadow_resync_all(v, 0);
+#endif
     }
     l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch);
     result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
@@ -2554,10 +2299,10 @@ static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
         }
         else
         {
-            mfn_t gl1mfn = gfn_to_mfn(v->domain, gl1gfn, &p2mt);
+            mfn_t gl1mfn = gfn_to_mfn_query(v->domain, gl1gfn, &p2mt);
             if ( p2m_is_ram(p2mt) )
-                sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
-            else
+                sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow); 
+            else if ( p2mt != p2m_populate_on_demand )
                 result |= SHADOW_SET_ERROR;
         }
     }
@@ -2576,7 +2321,7 @@ static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
 
 #if SHADOW_PAGING_LEVELS == 3
         reserved_xen_slot = 
-            ((mfn_to_shadow_page(sl2mfn)->type == SH_type_l2h_pae_shadow) &&
+            ((mfn_to_page(sl2mfn)->u.sh.type == SH_type_l2h_pae_shadow) &&
              (shadow_index 
               >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))));
 #else /* SHADOW_PAGING_LEVELS == 2 */
@@ -2624,13 +2369,13 @@ static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
     perfc_incr(shadow_validate_gl1e_calls);
 
     gfn = guest_l1e_get_gfn(new_gl1e);
-    gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
+    gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);
 
     l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
     result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
-    gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+    gl1mfn = _mfn(mfn_to_page(sl1mfn)->v.sh.back);
     if ( mfn_valid(gl1mfn) 
          && mfn_is_out_of_sync(gl1mfn) )
     {
@@ -2684,7 +2429,7 @@ void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
             shadow_l1e_t nsl1e;
 
             gfn = guest_l1e_get_gfn(gl1e);
-            gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
+            gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);
             l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
             rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn);
             
@@ -2707,42 +2452,41 @@ void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
  *      called in the *mode* of the vcpu that unsynced it.  Clear?  Good. */
 int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
 {
-    struct shadow_page_info *sp;
+    struct page_info *sp;
     mfn_t smfn;
 
     smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
     ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
     
     /* Up to l2 */
-    sp = mfn_to_shadow_page(smfn);
-    if ( sp->count != 1 || !sp->up )
+    sp = mfn_to_page(smfn);
+    if ( sp->u.sh.count != 1 || !sp->up )
         return 0;
     smfn = _mfn(sp->up >> PAGE_SHIFT);
     ASSERT(mfn_valid(smfn));
 
 #if (SHADOW_PAGING_LEVELS == 4) 
     /* up to l3 */
-    sp = mfn_to_shadow_page(smfn);
-    if ( sp->count != 1 || !sp->up )
+    sp = mfn_to_page(smfn);
+    if ( sp->u.sh.count != 1 || !sp->up )
         return 0;
     smfn = _mfn(sp->up >> PAGE_SHIFT);
     ASSERT(mfn_valid(smfn));
 
     /* up to l4 */
-    sp = mfn_to_shadow_page(smfn);
-    if ( sp->count != 1 
+    sp = mfn_to_page(smfn);
+    if ( sp->u.sh.count != 1
          || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
         return 0;
     smfn = _mfn(sp->up >> PAGE_SHIFT);
     ASSERT(mfn_valid(smfn));
+#endif
 
-#if (GUEST_PAGING_LEVELS == 2)
+#if (GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS == 3)
     /* In 2-on-3 shadow mode the up pointer contains the link to the
      * shadow page, but the shadow_table contains only the first of the
      * four pages that makes the PAE top shadow tables. */
     smfn = _mfn(mfn_x(smfn) & ~0x3UL);
-#endif
-
 #endif
 
     if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
@@ -3001,7 +2745,7 @@ static void sh_prefetch(struct vcpu *v, walk_t *gw,
 
         /* Look at the gfn that the l1e is pointing at */
         gfn = guest_l1e_get_gfn(gl1e);
-        gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
+        gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);
 
         /* Propagate the entry.  */
         l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
@@ -3173,6 +2917,7 @@ static int sh_page_fault(struct vcpu *v,
     fetch_type_t ft = 0;
     p2m_type_t p2mt;
     uint32_t rc;
+    int version;
 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
     int fast_emul = 0;
 #endif
@@ -3204,6 +2949,7 @@ static int sh_page_fault(struct vcpu *v,
                writes to an out of sync page. */
             if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
             {
+                fast_emul = 0;
                 v->arch.paging.last_write_emul_ok = 0;
                 goto page_fault_slow_path;
             }
@@ -3246,8 +2992,8 @@ static int sh_page_fault(struct vcpu *v,
                                         + shadow_l2_linear_offset(va)),
                                        sizeof(sl2e)) != 0)
                      || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
-                     || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
-                                      shadow_l2e_get_mfn(sl2e))->backpointer))
+                     || !mfn_valid(gl1mfn = _mfn(mfn_to_page(
+                                      shadow_l2e_get_mfn(sl2e))->v.sh.back))
                      || unlikely(mfn_is_out_of_sync(gl1mfn)) )
                {
                    /* Hit the slow path as if there had been no 
@@ -3316,7 +3062,14 @@ static int sh_page_fault(struct vcpu *v,
     }
 
  rewalk:
-    rc = guest_walk_tables(v, va, &gw, regs->error_code);
+
+    /* The walk is done in a lock-free style, with some sanity check
+     * postponed after grabbing shadow lock later. Those delayed checks
+     * will make sure no inconsistent mapping being translated into
+     * shadow page table. */ 
+    version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
+    rmb();
+    rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
     regs->error_code &= ~PFEC_page_present;
@@ -3348,7 +3101,7 @@ static int sh_page_fault(struct vcpu *v,
 
     /* What mfn is the guest trying to access? */
     gfn = guest_l1e_get_gfn(gw.l1e);
-    gmfn = gfn_to_mfn(d, gfn, &p2mt);
+    gmfn = gfn_to_mfn_guest(d, gfn, &p2mt);
 
     if ( shadow_mode_refcounts(d) && 
          (!p2m_is_valid(p2mt) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) )
@@ -3369,6 +3122,19 @@ static int sh_page_fault(struct vcpu *v,
     shadow_lock(d);
 
     TRACE_CLEAR_PATH_FLAGS;
+
+    /* Make sure there is enough free shadow memory to build a chain of
+     * shadow tables. (We never allocate a top-level shadow on this path,
+     * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
+     * SH_type_l1_shadow isn't correct in the latter case, all page
+     * tables are the same size there.)
+     *
+     * Preallocate shadow pages *before* removing writable accesses
+     * otherwhise an OOS L1 might be demoted and promoted again with
+     * writable mappings. */
+    shadow_prealloc(d,
+                    SH_type_l1_shadow,
+                    GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
     
     rc = gw_remove_write_accesses(v, va, &gw);
 
@@ -3392,7 +3158,7 @@ static int sh_page_fault(struct vcpu *v,
     }
 #endif /* OOS */
 
-    if ( !shadow_check_gwalk(v, va, &gw) )
+    if ( !shadow_check_gwalk(v, va, &gw, version) )
     {
         perfc_incr(shadow_inconsistent_gwalk);
         shadow_unlock(d);
@@ -3402,15 +3168,6 @@ static int sh_page_fault(struct vcpu *v,
     shadow_audit_tables(v);
     sh_audit_gw(v, &gw);
 
-    /* Make sure there is enough free shadow memory to build a chain of
-     * shadow tables. (We never allocate a top-level shadow on this path,
-     * only a 32b l1, pae l1, or 64b l3+2+1. Note that while
-     * SH_type_l1_shadow isn't correct in the latter case, all page
-     * tables are the same size there.) */
-    shadow_prealloc(d,
-                    SH_type_l1_shadow,
-                    GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
-
     /* Acquire the shadow.  This must happen before we figure out the rights 
      * for the shadow entry, since we might promote a page here. */
     ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
@@ -3497,10 +3254,10 @@ static int sh_page_fault(struct vcpu *v,
         goto mmio;
     }
 
-    /* Log attempts to write to read-only memory */
+    /* Ignore attempts to write to read-only memory. */
     if ( (p2mt == p2m_ram_ro) && (ft == ft_demand_write) )
     {
-        static unsigned long lastpage = 0;
+        static unsigned long lastpage;
         if ( xchg(&lastpage, va & PAGE_MASK) != (va & PAGE_MASK) )
             gdprintk(XENLOG_DEBUG, "guest attempted write to read-only memory"
                      " page. va page=%#lx, mfn=%#lx\n",
@@ -3792,7 +3549,7 @@ sh_invlpg(struct vcpu *v, unsigned long va)
     // easier than invalidating all of the individual 4K pages).
     //
     sl1mfn = shadow_l2e_get_mfn(sl2e);
-    if ( mfn_to_shadow_page(sl1mfn)->type
+    if ( mfn_to_page(sl1mfn)->u.sh.type
          == SH_type_fl1_shadow )
     {
         flush_tlb_local();
@@ -3802,7 +3559,7 @@ sh_invlpg(struct vcpu *v, unsigned long va)
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
     /* Check to see if the SL1 is out of sync. */
     {
-        mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+        mfn_t gl1mfn = _mfn(mfn_to_page(sl1mfn)->v.sh.back);
         struct page_info *pg = mfn_to_page(gl1mfn);
         if ( mfn_valid(gl1mfn) 
              && page_is_out_of_sync(pg) )
@@ -3832,7 +3589,7 @@ sh_invlpg(struct vcpu *v, unsigned long va)
             }
 
             sl1mfn = shadow_l2e_get_mfn(sl2e);
-            gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+            gl1mfn = _mfn(mfn_to_page(sl1mfn)->v.sh.back);
             pg = mfn_to_page(gl1mfn);
             
             if ( likely(sh_mfn_is_a_page_table(gl1mfn)
@@ -3869,7 +3626,7 @@ sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
         return vtlb_gfn;
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
 
-    if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
+    if ( sh_walk_guest_tables(v, va, &gw, pfec[0]) != 0 )
     {
         if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
             pfec[0] &= ~PFEC_page_present;
@@ -4237,7 +3994,7 @@ sh_set_toplevel_shadow(struct vcpu *v,
         /* Need to repin the old toplevel shadow if it's been unpinned
          * by shadow_prealloc(): in PV mode we're still running on this
          * shadow and it's not safe to free it yet. */
-        if ( !mfn_to_shadow_page(old_smfn)->pinned && !sh_pin(v, old_smfn) )
+        if ( !mfn_to_page(old_smfn)->u.sh.pinned && !sh_pin(v, old_smfn) )
         {
             SHADOW_ERROR("can't re-pin %#lx\n", mfn_x(old_smfn));
             domain_crash(v->domain);
@@ -4395,7 +4152,7 @@ sh_update_cr3(struct vcpu *v, int do_locking)
             if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
             {
                 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
-                gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
+                gl2mfn = gfn_to_mfn_query(d, gl2gfn, &p2mt);
                 if ( p2m_is_ram(p2mt) )
                     flush |= sh_remove_write_access(v, gl2mfn, 2, 0);
             }
@@ -4408,7 +4165,7 @@ sh_update_cr3(struct vcpu *v, int do_locking)
             if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT )
             {
                 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
-                gl2mfn = gfn_to_mfn(d, gl2gfn, &p2mt);
+                gl2mfn = gfn_to_mfn_query(d, gl2gfn, &p2mt);
                 if ( p2m_is_ram(p2mt) )
                     sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3) 
                                            ? SH_type_l2h_shadow 
@@ -4531,16 +4288,16 @@ int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
 {
     int r;
     shadow_l1e_t *sl1p, sl1e;
-    struct shadow_page_info *sp;
+    struct page_info *sp;
 
     ASSERT(mfn_valid(gmfn));
     ASSERT(mfn_valid(smfn));
 
-    sp = mfn_to_shadow_page(smfn);
+    sp = mfn_to_page(smfn);
 
-    if ( sp->mbz != 0
-         || (sp->type != SH_type_l1_shadow
-             && sp->type != SH_type_fl1_shadow) )
+    if ( ((sp->count_info & PGC_count_mask) != 0)
+         || (sp->u.sh.type != SH_type_l1_shadow
+             && sp->u.sh.type != SH_type_fl1_shadow) )
         goto fail;
 
     sl1p = sh_map_domain_page(smfn);
@@ -4679,7 +4436,7 @@ int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
 void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
 /* Blank out a single shadow entry */
 {
-    switch ( mfn_to_shadow_page(smfn)->type )
+    switch ( mfn_to_page(smfn)->u.sh.type )
     {
     case SH_type_l1_shadow:
         (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
@@ -4712,7 +4469,7 @@ int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
              && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
         {
             (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
-            if ( mfn_to_shadow_page(sl1mfn)->type == 0 )
+            if ( mfn_to_page(sl1mfn)->u.sh.type == 0 )
                 /* This breaks us cleanly out of the FOREACH macro */
                 done = 1;
         }
@@ -4735,7 +4492,7 @@ int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
              && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
         {
             (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
-            if ( mfn_to_shadow_page(sl2mfn)->type == 0 )
+            if ( mfn_to_page(sl2mfn)->u.sh.type == 0 )
                 /* This breaks us cleanly out of the FOREACH macro */
                 done = 1;
         }
@@ -4757,7 +4514,7 @@ int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
              && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
         {
             (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
-            if ( mfn_to_shadow_page(sl3mfn)->type == 0 )
+            if ( mfn_to_page(sl3mfn)->u.sh.type == 0 )
                 /* This breaks us cleanly out of the FOREACH macro */
                 done = 1;
         }
@@ -4794,7 +4551,12 @@ static mfn_t emulate_gva_to_mfn(struct vcpu *v,
     }
 
     /* Translate the GFN to an MFN */
-    mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
+    /* PoD: query only if shadow lock is held (to avoid deadlock) */
+    if ( shadow_locked_by_me(v->domain) )
+        mfn = gfn_to_mfn_query(v->domain, _gfn(gfn), &p2mt);
+    else
+        mfn = gfn_to_mfn(v->domain, _gfn(gfn), &p2mt);
+        
     if ( p2mt == p2m_ram_ro )
         return _mfn(READONLY_GFN);
     if ( !p2m_is_ram(p2mt) )
@@ -5154,7 +4916,7 @@ int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
     int done = 0;
     
     /* Follow the backpointer */
-    gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+    gl1mfn = _mfn(mfn_to_page(sl1mfn)->v.sh.back);
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
     /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
@@ -5198,7 +4960,7 @@ int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
             {
                 gfn = guest_l1e_get_gfn(*gl1e);
                 mfn = shadow_l1e_get_mfn(*sl1e);
-                gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
+                gmfn = gfn_to_mfn_query(v->domain, gfn, &p2mt);
                 if ( mfn_x(gmfn) != mfn_x(mfn) )
                     AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn
                                " --> %" PRI_mfn " != mfn %" PRI_mfn,
@@ -5244,7 +5006,7 @@ int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
     int done = 0;
 
     /* Follow the backpointer */
-    gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
+    gl2mfn = _mfn(mfn_to_page(sl2mfn)->v.sh.back);
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
     /* Only L1's may be out of sync. */
@@ -5265,7 +5027,7 @@ int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
             mfn = shadow_l2e_get_mfn(*sl2e);
             gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)  
                 ? get_fl1_shadow_status(v, gfn)
-                : get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt), 
+                : get_shadow_status(v, gfn_to_mfn_query(v->domain, gfn, &p2mt), 
                                     SH_type_l1_shadow);
             if ( mfn_x(gmfn) != mfn_x(mfn) )
                 AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn
@@ -5273,7 +5035,7 @@ int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
                            " --> %" PRI_mfn " != mfn %" PRI_mfn,
                            gfn_x(gfn), 
                            (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
-                           : mfn_x(gfn_to_mfn(v->domain, gfn, &p2mt)),
+                           : mfn_x(gfn_to_mfn_query(v->domain, gfn, &p2mt)),
                            mfn_x(gmfn), mfn_x(mfn));
         }
     });
@@ -5293,7 +5055,7 @@ int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
     int done = 0;
 
     /* Follow the backpointer */
-    gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
+    gl3mfn = _mfn(mfn_to_page(sl3mfn)->v.sh.back);
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
     /* Only L1's may be out of sync. */
@@ -5312,7 +5074,7 @@ int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
         {
             gfn = guest_l3e_get_gfn(*gl3e);
             mfn = shadow_l3e_get_mfn(*sl3e);
-            gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt), 
+            gmfn = get_shadow_status(v, gfn_to_mfn_query(v->domain, gfn, &p2mt), 
                                      ((GUEST_PAGING_LEVELS == 3 ||
                                        is_pv_32on64_vcpu(v))
                                       && !shadow_mode_external(v->domain)
@@ -5340,7 +5102,7 @@ int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
     int done = 0;
 
     /* Follow the backpointer */
-    gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
+    gl4mfn = _mfn(mfn_to_page(sl4mfn)->v.sh.back);
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
     /* Only L1's may be out of sync. */
@@ -5359,7 +5121,7 @@ int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
         {
             gfn = guest_l4e_get_gfn(*gl4e);
             mfn = shadow_l4e_get_mfn(*sl4e);
-            gmfn = get_shadow_status(v, gfn_to_mfn(v->domain, gfn, &p2mt), 
+            gmfn = get_shadow_status(v, gfn_to_mfn_query(v->domain, gfn, &p2mt), 
                                      SH_type_l3_shadow);
             if ( mfn_x(gmfn) != mfn_x(mfn) )
                 AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn
index bc9e5176b56d18d6e9e380bf390190529f9ad4e2..a831afd5f5863b1206cd54d9ee2c0da7d4428c53 100644 (file)
@@ -220,52 +220,6 @@ extern void shadow_audit_tables(struct vcpu *v);
 #undef GUEST_LEVELS
 #endif /* CONFIG_PAGING_LEVELS == 4 */
 
-/******************************************************************************
- * Page metadata for shadow pages.
- */
-
-struct shadow_page_info
-{
-    union {
-        /* When in use, guest page we're a shadow of */
-        unsigned long backpointer;
-        /* When free, order of the freelist we're on */
-        unsigned int order;
-    };
-    union {
-        /* When in use, next shadow in this hash chain */
-        struct shadow_page_info *next_shadow;
-        /* When free, TLB flush time when freed */
-        u32 tlbflush_timestamp;
-    };
-    struct {
-        unsigned int type:5;      /* What kind of shadow is this? */
-        unsigned int pinned:1;    /* Is the shadow pinned? */
-        unsigned int count:26;    /* Reference count */
-        u32 mbz;                  /* Must be zero: this is where the owner 
-                                   * field lives in a non-shadow page */
-    } __attribute__((packed));
-    union {
-        /* For unused shadow pages, a list of pages of this order; 
-         * for pinnable shadows, if pinned, a list of other pinned shadows
-         * (see sh_type_is_pinnable() below for the definition of 
-         * "pinnable" shadow types). */
-        struct list_head list;
-        /* For non-pinnable shadows, a higher entry that points at us */
-        paddr_t up;
-    };
-};
-
-/* The structure above *must* be no larger than a struct page_info
- * from mm.h, since we'll be using the same space in the frametable. 
- * Also, the mbz field must line up with the owner field of normal 
- * pages, so they look properly like anonymous/xen pages. */
-static inline void shadow_check_page_struct_offsets(void) {
-    BUILD_BUG_ON(sizeof (struct shadow_page_info) > sizeof (struct page_info));
-    BUILD_BUG_ON(offsetof(struct shadow_page_info, mbz) !=
-                 offsetof(struct page_info, u.inuse._domain));
-};
-
 /* Shadow type codes */
 #define SH_type_none           (0U) /* on the shadow free list */
 #define SH_type_min_shadow     (1U)
@@ -520,22 +474,13 @@ mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn);
  * MFN/page-info handling 
  */
 
-// Override mfn_to_page from asm/page.h, which was #include'd above,
-// in order to make it work with our mfn type.
+/* Override macros from asm/page.h to make them work with mfn_t */
 #undef mfn_to_page
-#define mfn_to_page(_m) (frame_table + mfn_x(_m))
-#define mfn_to_shadow_page(_m) ((struct shadow_page_info *)mfn_to_page(_m))
-
-// Override page_to_mfn from asm/page.h, which was #include'd above,
-// in order to make it work with our mfn type.
-#undef page_to_mfn
-#define page_to_mfn(_pg) (_mfn((_pg) - frame_table))
-#define shadow_page_to_mfn(_spg) (page_to_mfn((struct page_info *)_spg))
-
-// Override mfn_valid from asm/page.h, which was #include'd above,
-// in order to make it work with our mfn type.
+#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
 #undef mfn_valid
-#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page)
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
 
 /* Override pagetable_t <-> struct page_info conversions to work with mfn_t */
 #undef pagetable_get_page
@@ -667,26 +612,26 @@ void sh_destroy_shadow(struct vcpu *v, mfn_t smfn);
 static inline int sh_get_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa)
 {
     u32 x, nx;
-    struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
+    struct page_info *sp = mfn_to_page(smfn);
 
     ASSERT(mfn_valid(smfn));
 
-    x = sp->count;
+    x = sp->u.sh.count;
     nx = x + 1;
 
     if ( unlikely(nx >= 1U<<26) )
     {
-        SHADOW_PRINTK("shadow ref overflow, gmfn=%" PRtype_info " smfn=%lx\n",
-                       sp->backpointer, mfn_x(smfn));
+        SHADOW_PRINTK("shadow ref overflow, gmfn=%" PRpgmfn " smfn=%lx\n",
+                       sp->v.sh.back, mfn_x(smfn));
         return 0;
     }
     
     /* Guarded by the shadow lock, so no need for atomic update */
-    sp->count = nx;
+    sp->u.sh.count = nx;
 
     /* We remember the first shadow entry that points to each shadow. */
     if ( entry_pa != 0 
-         && !sh_type_is_pinnable(v, sp->type) 
+         && !sh_type_is_pinnable(v, sp->u.sh.type)
          && sp->up == 0 ) 
         sp->up = entry_pa;
     
@@ -699,29 +644,29 @@ static inline int sh_get_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa)
 static inline void sh_put_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa)
 {
     u32 x, nx;
-    struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
+    struct page_info *sp = mfn_to_page(smfn);
 
     ASSERT(mfn_valid(smfn));
-    ASSERT(sp->mbz == 0);
+    ASSERT(!(sp->count_info & PGC_count_mask));
 
     /* If this is the entry in the up-pointer, remove it */
     if ( entry_pa != 0 
-         && !sh_type_is_pinnable(v, sp->type) 
+         && !sh_type_is_pinnable(v, sp->u.sh.type)
          && sp->up == entry_pa ) 
         sp->up = 0;
 
-    x = sp->count;
+    x = sp->u.sh.count;
     nx = x - 1;
 
     if ( unlikely(x == 0) ) 
     {
         SHADOW_ERROR("shadow ref underflow, smfn=%lx oc=%08x t=%#x\n",
-                     mfn_x(smfn), sp->count, sp->type);
+                     mfn_x(smfn), sp->u.sh.count, sp->u.sh.type);
         BUG();
     }
 
     /* Guarded by the shadow lock, so no need for atomic update */
-    sp->count = nx;
+    sp->u.sh.count = nx;
 
     if ( unlikely(nx == 0) ) 
         sh_destroy_shadow(v, smfn);
@@ -733,26 +678,26 @@ static inline void sh_put_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa)
  * Returns 0 for failure, 1 for success. */
 static inline int sh_pin(struct vcpu *v, mfn_t smfn)
 {
-    struct shadow_page_info *sp;
+    struct page_info *sp;
     
     ASSERT(mfn_valid(smfn));
-    sp = mfn_to_shadow_page(smfn);
-    ASSERT(sh_type_is_pinnable(v, sp->type));
-    if ( sp->pinned ) 
+    sp = mfn_to_page(smfn);
+    ASSERT(sh_type_is_pinnable(v, sp->u.sh.type));
+    if ( sp->u.sh.pinned )
     {
         /* Already pinned: take it out of the pinned-list so it can go 
          * at the front */
-        list_del(&sp->list);
+        page_list_del(sp, &v->domain->arch.paging.shadow.pinned_shadows);
     }
     else
     {
         /* Not pinned: pin it! */
         if ( !sh_get_ref(v, smfn, 0) )
             return 0;
-        sp->pinned = 1;
+        sp->u.sh.pinned = 1;
     }
     /* Put it at the head of the list of pinned shadows */
-    list_add(&sp->list, &v->domain->arch.paging.shadow.pinned_shadows);
+    page_list_add(sp, &v->domain->arch.paging.shadow.pinned_shadows);
     return 1;
 }
 
@@ -760,15 +705,15 @@ static inline int sh_pin(struct vcpu *v, mfn_t smfn)
  * of pinned shadows, and release the extra ref. */
 static inline void sh_unpin(struct vcpu *v, mfn_t smfn)
 {
-    struct shadow_page_info *sp;
+    struct page_info *sp;
     
     ASSERT(mfn_valid(smfn));
-    sp = mfn_to_shadow_page(smfn);
-    ASSERT(sh_type_is_pinnable(v, sp->type));
-    if ( sp->pinned )
+    sp = mfn_to_page(smfn);
+    ASSERT(sh_type_is_pinnable(v, sp->u.sh.type));
+    if ( sp->u.sh.pinned )
     {
-        sp->pinned = 0;
-        list_del(&sp->list);
+        sp->u.sh.pinned = 0;
+        page_list_del(sp, &v->domain->arch.paging.shadow.pinned_shadows);
         sp->up = 0; /* in case this stops being a pinnable type in future */
         sh_put_ref(v, smfn, 0);
     }
index 440d2d31fb59a94432a0cf0e9df8854ae4656b06..83d9e90d5ceaa50364cc516ca74cf7fcc781f8b0 100644 (file)
@@ -191,169 +191,19 @@ static inline shadow_l4e_t shadow_l4e_from_mfn(mfn_t mfn, u32 flags)
 })
 #endif
 
+ /* Override gfn_to_mfn to work with gfn_t */
+#undef gfn_to_mfn_query
+#define gfn_to_mfn_query(d, g, t) _gfn_to_mfn_type((d), gfn_x(g), (t), p2m_query)
+#undef gfn_to_mfn_guest
+#define gfn_to_mfn_guest(d, g, t) _gfn_to_mfn_type((d), gfn_x(g), (t), p2m_guest)
 
-/* Type of the guest's frame numbers */
-TYPE_SAFE(unsigned long,gfn)
-#define SH_PRI_gfn "05lx"
-
-#define VALID_GFN(m) (m != INVALID_GFN)
-
-static inline int
-valid_gfn(gfn_t m)
-{
-    return VALID_GFN(gfn_x(m));
-}
-
-static inline paddr_t
-gfn_to_paddr(gfn_t gfn)
-{
-    return ((paddr_t)gfn_x(gfn)) << PAGE_SHIFT;
-}
-
-/* Override gfn_to_mfn to work with gfn_t */
-#undef gfn_to_mfn
-#define gfn_to_mfn(d, g, t) _gfn_to_mfn((d), gfn_x(g), (t))
+/* The shadow types needed for the various levels. */
 
 #if GUEST_PAGING_LEVELS == 2
-
-#include "../page-guest32.h"
-
-#define GUEST_L1_PAGETABLE_ENTRIES     1024
-#define GUEST_L2_PAGETABLE_ENTRIES     1024
-#define GUEST_L1_PAGETABLE_SHIFT         12
-#define GUEST_L2_PAGETABLE_SHIFT         22
-
-/* Types of the guest's page tables */
-typedef l1_pgentry_32_t guest_l1e_t;
-typedef l2_pgentry_32_t guest_l2e_t;
-typedef intpte_32_t guest_intpte_t;
-
-/* Access functions for them */
-static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
-{ return l1e_get_paddr_32(gl1e); }
-static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
-{ return l2e_get_paddr_32(gl2e); }
-
-static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
-{ return _gfn(l1e_get_paddr_32(gl1e) >> PAGE_SHIFT); }
-static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
-{ return _gfn(l2e_get_paddr_32(gl2e) >> PAGE_SHIFT); }
-
-static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
-{ return l1e_get_flags_32(gl1e); }
-static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
-{ return l2e_get_flags_32(gl2e); }
-
-static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags)
-{ l1e_add_flags_32(gl1e, flags); return gl1e; }
-static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags)
-{ l2e_add_flags_32(gl2e, flags); return gl2e; }
-
-static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
-{ return l1e_from_pfn_32(gfn_x(gfn), flags); }
-static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
-{ return l2e_from_pfn_32(gfn_x(gfn), flags); }
-
-#define guest_l1_table_offset(a) l1_table_offset_32(a)
-#define guest_l2_table_offset(a) l2_table_offset_32(a)
-
-/* The shadow types needed for the various levels. */
 #define SH_type_l1_shadow  SH_type_l1_32_shadow
 #define SH_type_l2_shadow  SH_type_l2_32_shadow
 #define SH_type_fl1_shadow SH_type_fl1_32_shadow
-
-#else /* GUEST_PAGING_LEVELS != 2 */
-
-#if GUEST_PAGING_LEVELS == 3
-#define GUEST_L1_PAGETABLE_ENTRIES      512
-#define GUEST_L2_PAGETABLE_ENTRIES      512
-#define GUEST_L3_PAGETABLE_ENTRIES        4
-#define GUEST_L1_PAGETABLE_SHIFT         12
-#define GUEST_L2_PAGETABLE_SHIFT         21
-#define GUEST_L3_PAGETABLE_SHIFT         30
-#else /* GUEST_PAGING_LEVELS == 4 */
-#define GUEST_L1_PAGETABLE_ENTRIES      512
-#define GUEST_L2_PAGETABLE_ENTRIES      512
-#define GUEST_L3_PAGETABLE_ENTRIES      512
-#define GUEST_L4_PAGETABLE_ENTRIES      512
-#define GUEST_L1_PAGETABLE_SHIFT         12
-#define GUEST_L2_PAGETABLE_SHIFT         21
-#define GUEST_L3_PAGETABLE_SHIFT         30
-#define GUEST_L4_PAGETABLE_SHIFT         39
-#endif
-
-/* Types of the guest's page tables */
-typedef l1_pgentry_t guest_l1e_t;
-typedef l2_pgentry_t guest_l2e_t;
-typedef l3_pgentry_t guest_l3e_t;
-#if GUEST_PAGING_LEVELS >= 4
-typedef l4_pgentry_t guest_l4e_t;
-#endif
-typedef intpte_t guest_intpte_t;
-
-/* Access functions for them */
-static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
-{ return l1e_get_paddr(gl1e); }
-static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
-{ return l2e_get_paddr(gl2e); }
-static inline paddr_t guest_l3e_get_paddr(guest_l3e_t gl3e)
-{ return l3e_get_paddr(gl3e); }
-#if GUEST_PAGING_LEVELS >= 4
-static inline paddr_t guest_l4e_get_paddr(guest_l4e_t gl4e)
-{ return l4e_get_paddr(gl4e); }
-#endif
-
-static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
-{ return _gfn(l1e_get_paddr(gl1e) >> PAGE_SHIFT); }
-static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
-{ return _gfn(l2e_get_paddr(gl2e) >> PAGE_SHIFT); }
-static inline gfn_t guest_l3e_get_gfn(guest_l3e_t gl3e)
-{ return _gfn(l3e_get_paddr(gl3e) >> PAGE_SHIFT); }
-#if GUEST_PAGING_LEVELS >= 4
-static inline gfn_t guest_l4e_get_gfn(guest_l4e_t gl4e)
-{ return _gfn(l4e_get_paddr(gl4e) >> PAGE_SHIFT); }
-#endif
-
-static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
-{ return l1e_get_flags(gl1e); }
-static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
-{ return l2e_get_flags(gl2e); }
-static inline u32 guest_l3e_get_flags(guest_l3e_t gl3e)
-{ return l3e_get_flags(gl3e); }
-#if GUEST_PAGING_LEVELS >= 4
-static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e)
-{ return l4e_get_flags(gl4e); }
-#endif
-
-static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags)
-{ l1e_add_flags(gl1e, flags); return gl1e; }
-static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags)
-{ l2e_add_flags(gl2e, flags); return gl2e; }
-static inline guest_l3e_t guest_l3e_add_flags(guest_l3e_t gl3e, u32 flags)
-{ l3e_add_flags(gl3e, flags); return gl3e; }
-#if GUEST_PAGING_LEVELS >= 4
-static inline guest_l4e_t guest_l4e_add_flags(guest_l4e_t gl4e, u32 flags)
-{ l4e_add_flags(gl4e, flags); return gl4e; }
-#endif
-
-static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
-{ return l1e_from_pfn(gfn_x(gfn), flags); }
-static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
-{ return l2e_from_pfn(gfn_x(gfn), flags); }
-static inline guest_l3e_t guest_l3e_from_gfn(gfn_t gfn, u32 flags)
-{ return l3e_from_pfn(gfn_x(gfn), flags); }
-#if GUEST_PAGING_LEVELS >= 4
-static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags)
-{ return l4e_from_pfn(gfn_x(gfn), flags); }
-#endif
-
-#define guest_l1_table_offset(a) l1_table_offset(a)
-#define guest_l2_table_offset(a) l2_table_offset(a)
-#define guest_l3_table_offset(a) l3_table_offset(a)
-#define guest_l4_table_offset(a) l4_table_offset(a)
-
-/* The shadow types needed for the various levels. */
-#if GUEST_PAGING_LEVELS == 3
+#elif GUEST_PAGING_LEVELS == 3
 #define SH_type_l1_shadow  SH_type_l1_pae_shadow
 #define SH_type_fl1_shadow SH_type_fl1_pae_shadow
 #define SH_type_l2_shadow  SH_type_l2_pae_shadow
@@ -367,35 +217,6 @@ static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags)
 #define SH_type_l4_shadow  SH_type_l4_64_shadow
 #endif
 
-#endif /* GUEST_PAGING_LEVELS != 2 */
-
-
-/* Type used for recording a walk through guest pagetables.  It is
- * filled in by the pagetable walk function, and also used as a cache
- * for later walks.  When we encounter a suporpage l2e, we fabricate an
- * l1e for propagation to the shadow (for splintering guest superpages
- * into many shadow l1 entries).  */
-typedef struct shadow_walk_t walk_t;
-struct shadow_walk_t 
-{
-    unsigned long va;           /* Address we were looking for */
-#if GUEST_PAGING_LEVELS >= 3
-#if GUEST_PAGING_LEVELS >= 4
-    guest_l4e_t l4e;            /* Guest's level 4 entry */
-#endif
-    guest_l3e_t l3e;            /* Guest's level 3 entry */
-#endif
-    guest_l2e_t l2e;            /* Guest's level 2 entry */
-    guest_l1e_t l1e;            /* Guest's level 1 entry (or fabrication) */
-#if GUEST_PAGING_LEVELS >= 4
-    mfn_t l4mfn;                /* MFN that the level 4 entry was in */
-    mfn_t l3mfn;                /* MFN that the level 3 entry was in */
-#endif
-    mfn_t l2mfn;                /* MFN that the level 2 entry was in */
-    mfn_t l1mfn;                /* MFN that the level 1 entry was in */
-    int version;                /* Saved guest dirty version */
-};
-
 /* macros for dealing with the naming of the internal function names of the
  * shadow code's external entry points.
  */
@@ -460,17 +281,9 @@ struct shadow_walk_t
 #define MFN_FITS_IN_HVM_CR3(_MFN) !(mfn_x(_MFN) >> 20)
 #endif
 
-#define SH_PRI_pte PRIpte
-
-#if GUEST_PAGING_LEVELS == 2
-#define SH_PRI_gpte "08x"
-#else /* GUEST_PAGING_LEVELS >= 3 */
-#ifndef __x86_64__
-#define SH_PRI_gpte "016llx"
-#else
-#define SH_PRI_gpte "016lx"
-#endif
-#endif /* GUEST_PAGING_LEVELS >= 3 */
+#define SH_PRI_pte  PRIpte
+#define SH_PRI_gpte PRI_gpte
+#define SH_PRI_gfn  PRI_gfn
 
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH)
index 75d45d2eefffe6f84d04e6af09c2bad7753e1344..1587df385a64ee9d6ace8102ec3cb398ad05b0c7 100644 (file)
 
 /* bitmap indicate which fixed map is free */
 DEFINE_SPINLOCK(msix_fixmap_lock);
-DECLARE_BITMAP(msix_fixmap_pages, MAX_MSIX_PAGES);
+DECLARE_BITMAP(msix_fixmap_pages, FIX_MSIX_MAX_PAGES);
 
 static int msix_fixmap_alloc(void)
 {
-    int i;
-    int rc = -1;
+    int i, rc = -ENOMEM;
 
     spin_lock(&msix_fixmap_lock);
-    for ( i = 0; i < MAX_MSIX_PAGES; i++ )
+    for ( i = 0; i < FIX_MSIX_MAX_PAGES; i++ )
         if ( !test_bit(i, &msix_fixmap_pages) )
             break;
-    if ( i == MAX_MSIX_PAGES )
+    if ( i == FIX_MSIX_MAX_PAGES )
         goto out;
     rc = FIX_MSIX_IO_RESERV_BASE + i;
     set_bit(i, &msix_fixmap_pages);
@@ -52,18 +51,72 @@ static int msix_fixmap_alloc(void)
 
 static void msix_fixmap_free(int idx)
 {
-    if ( idx < FIX_MSIX_IO_RESERV_BASE )
-        return;
-
     spin_lock(&msix_fixmap_lock);
-    clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages);
+    if ( idx >= FIX_MSIX_IO_RESERV_BASE )
+        clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages);
     spin_unlock(&msix_fixmap_lock);
 }
 
+static int msix_get_fixmap(struct pci_dev *dev, unsigned long table_paddr,
+                           unsigned long entry_paddr)
+{
+    int nr_page, idx;
+
+    nr_page = (entry_paddr >> PAGE_SHIFT) - (table_paddr >> PAGE_SHIFT);
+
+    if ( nr_page < 0 || nr_page >= MAX_MSIX_TABLE_PAGES )
+        return -EINVAL;
+
+    spin_lock(&dev->msix_table_lock);
+    if ( dev->msix_table_refcnt[nr_page]++ == 0 )
+    {
+        idx = msix_fixmap_alloc();
+        if ( idx < 0 )
+        {
+            dev->msix_table_refcnt[nr_page]--;
+            goto out;
+        }
+        set_fixmap_nocache(idx, entry_paddr);
+        dev->msix_table_idx[nr_page] = idx;
+    }
+    else
+        idx = dev->msix_table_idx[nr_page];
+
+ out:
+    spin_unlock(&dev->msix_table_lock);
+    return idx;
+}
+
+static void msix_put_fixmap(struct pci_dev *dev, int idx)
+{
+    int i;
+    unsigned long start;
+
+    spin_lock(&dev->msix_table_lock);
+    for ( i = 0; i < MAX_MSIX_TABLE_PAGES; i++ )
+    {
+        if ( dev->msix_table_idx[i] == idx )
+            break;
+    }
+    if ( i == MAX_MSIX_TABLE_PAGES )
+        goto out;
+
+    if ( --dev->msix_table_refcnt[i] == 0 )
+    {
+        start = fix_to_virt(idx);
+        destroy_xen_mappings(start, start + PAGE_SIZE);
+        msix_fixmap_free(idx);
+        dev->msix_table_idx[i] = 0;
+    }
+
+ out:
+    spin_unlock(&dev->msix_table_lock);
+}
+
 /*
  * MSI message composition
  */
-static void msi_compose_msg(struct pci_dev *pdev, int vector,
+void msi_compose_msg(struct pci_dev *pdev, int vector,
                             struct msi_msg *msg)
 {
     unsigned dest;
@@ -78,19 +131,19 @@ static void msi_compose_msg(struct pci_dev *pdev, int vector,
         msg->address_lo =
             MSI_ADDR_BASE_LO |
             ((INT_DEST_MODE == 0) ?
-                MSI_ADDR_DESTMODE_PHYS:
-                MSI_ADDR_DESTMODE_LOGIC) |
+             MSI_ADDR_DESTMODE_PHYS:
+             MSI_ADDR_DESTMODE_LOGIC) |
             ((INT_DELIVERY_MODE != dest_LowestPrio) ?
-                MSI_ADDR_REDIRECTION_CPU:
-                MSI_ADDR_REDIRECTION_LOWPRI) |
+             MSI_ADDR_REDIRECTION_CPU:
+             MSI_ADDR_REDIRECTION_LOWPRI) |
             MSI_ADDR_DEST_ID(dest);
 
         msg->data =
             MSI_DATA_TRIGGER_EDGE |
             MSI_DATA_LEVEL_ASSERT |
             ((INT_DELIVERY_MODE != dest_LowestPrio) ?
-                MSI_DATA_DELIVERY_FIXED:
-                MSI_DATA_DELIVERY_LOWPRI) |
+             MSI_DATA_DELIVERY_FIXED:
+             MSI_DATA_DELIVERY_LOWPRI) |
             MSI_DATA_VECTOR(vector);
     }
 }
@@ -127,8 +180,7 @@ static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
     case PCI_CAP_ID_MSIX:
     {
         void __iomem *base;
-        base = entry->mask_base +
-           entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+        base = entry->mask_base;
 
         msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
         msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
@@ -158,6 +210,8 @@ static int set_vector_msi(struct msi_desc *entry)
 
 static int unset_vector_msi(int vector)
 {
+    ASSERT(spin_is_locked(&irq_desc[vector].lock));
+
     if ( vector >= NR_VECTORS )
     {
         dprintk(XENLOG_ERR, "Trying to uninstall msi data for Vector %d\n",
@@ -166,6 +220,7 @@ static int unset_vector_msi(int vector)
     }
 
     irq_desc[vector].msi_desc = NULL;
+
     return 0;
 }
 
@@ -201,13 +256,12 @@ static void write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
     case PCI_CAP_ID_MSIX:
     {
         void __iomem *base;
-        base = entry->mask_base +
-            entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+        base = entry->mask_base;
 
         writel(msg->address_lo,
-            base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+               base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
         writel(msg->address_hi,
-            base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+               base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
         writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
         break;
     }
@@ -217,9 +271,9 @@ static void write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
     entry->msg = *msg;
 }
 
-void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+void set_msi_affinity(unsigned int vector, cpumask_t mask)
 {
-    struct msi_desc *desc = irq_desc[irq].msi_desc;
+    struct msi_desc *desc = irq_desc[vector].msi_desc;
     struct msi_msg msg;
     unsigned int dest;
 
@@ -230,17 +284,15 @@ void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
     dest = cpu_mask_to_apicid(mask);
 
     if ( !desc )
-       return;
+        return;
 
-    ASSERT(spin_is_locked(&irq_desc[irq].lock));
-    spin_lock(&desc->dev->lock);
+    ASSERT(spin_is_locked(&irq_desc[vector].lock));
     read_msi_msg(desc, &msg);
 
     msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
     msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
     write_msi_msg(desc, &msg);
-    spin_unlock(&desc->dev->lock);
 }
 
 static void msi_set_enable(struct pci_dev *dev, int enable)
@@ -281,9 +333,9 @@ static void msix_set_enable(struct pci_dev *dev, int enable)
     }
 }
 
-static void msix_flush_writes(unsigned int irq)
+static void msix_flush_writes(unsigned int vector)
 {
-    struct msi_desc *entry = irq_desc[irq].msi_desc;
+    struct msi_desc *entry = irq_desc[vector].msi_desc;
 
     BUG_ON(!entry || !entry->dev);
     switch (entry->msi_attrib.type) {
@@ -292,8 +344,7 @@ static void msix_flush_writes(unsigned int irq)
         break;
     case PCI_CAP_ID_MSIX:
     {
-        int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
-            PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+        int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
         readl(entry->mask_base + offset);
         break;
     }
@@ -303,11 +354,18 @@ static void msix_flush_writes(unsigned int irq)
     }
 }
 
-static void msi_set_mask_bit(unsigned int irq, int flag)
+int msi_maskable_irq(const struct msi_desc *entry)
+{
+    BUG_ON(!entry);
+    return entry->msi_attrib.type != PCI_CAP_ID_MSI
+           || entry->msi_attrib.maskbit;
+}
+
+static void msi_set_mask_bit(unsigned int vector, int flag)
 {
-    struct msi_desc *entry = irq_desc[irq].msi_desc;
+    struct msi_desc *entry = irq_desc[vector].msi_desc;
 
-    ASSERT(spin_is_locked(&irq_desc[irq].lock));
+    ASSERT(spin_is_locked(&irq_desc[vector].lock));
     BUG_ON(!entry || !entry->dev);
     switch (entry->msi_attrib.type) {
     case PCI_CAP_ID_MSI:
@@ -323,14 +381,11 @@ static void msi_set_mask_bit(unsigned int irq, int flag)
             mask_bits &= ~(1);
             mask_bits |= flag;
             pci_conf_write32(bus, slot, func, pos, mask_bits);
-        } else {
-            msi_set_enable(entry->dev, !flag);
         }
         break;
     case PCI_CAP_ID_MSIX:
     {
-        int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
-            PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
+        int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
         writel(flag, entry->mask_base + offset);
         readl(entry->mask_base + offset);
         break;
@@ -342,16 +397,16 @@ static void msi_set_mask_bit(unsigned int irq, int flag)
     entry->msi_attrib.masked = !!flag;
 }
 
-void mask_msi_irq(unsigned int irq)
+void mask_msi_vector(unsigned int vector)
 {
-    msi_set_mask_bit(irq, 1);
-    msix_flush_writes(irq);
+    msi_set_mask_bit(vector, 1);
+    msix_flush_writes(vector);
 }
 
-void unmask_msi_irq(unsigned int irq)
+void unmask_msi_vector(unsigned int vector)
 {
-    msi_set_mask_bit(irq, 0);
-    msix_flush_writes(irq);
+    msi_set_mask_bit(vector, 0);
+    msix_flush_writes(vector);
 }
 
 static struct msi_desc* alloc_msi_entry(void)
@@ -369,7 +424,7 @@ static struct msi_desc* alloc_msi_entry(void)
     return entry;
 }
 
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
 {
     struct msi_msg msg;
 
@@ -380,33 +435,25 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
     return 0;
 }
 
-static void teardown_msi_vector(int vector)
+void teardown_msi_vector(int vector)
 {
     unset_vector_msi(vector);
 }
 
-static void msi_free_vector(int vector)
+int msi_free_vector(struct msi_desc *entry)
 {
-    struct msi_desc *entry;
-
-    ASSERT(spin_is_locked(&irq_desc[vector].lock));
-    entry = irq_desc[vector].msi_desc;
-    teardown_msi_vector(vector);
-
     if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
     {
         unsigned long start;
 
-        writel(1, entry->mask_base + entry->msi_attrib.entry_nr
-              * PCI_MSIX_ENTRY_SIZE
-              + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+        writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
 
         start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1);
-        msix_fixmap_free(virt_to_fix(start));
-        destroy_xen_mappings(start, start + PAGE_SIZE);
+        msix_put_fixmap(entry->dev, virt_to_fix(start));
     }
     list_del(&entry->list);
     xfree(entry);
+    return 0;
 }
 
 static struct msi_desc *find_msi_entry(struct pci_dev *dev,
@@ -433,15 +480,18 @@ static struct msi_desc *find_msi_entry(struct pci_dev *dev,
  * multiple messages. A return of zero indicates the successful setup
  * of an entry zero with the new MSI irq or non-zero for otherwise.
  **/
-static int msi_capability_init(struct pci_dev *dev, int vector)
+static int msi_capability_init(struct pci_dev *dev,
+                               int vector,
+                               struct msi_desc **desc)
 {
     struct msi_desc *entry;
-    int pos, ret;
+    int pos;
     u16 control;
     u8 bus = dev->bus;
     u8 slot = PCI_SLOT(dev->devfn);
     u8 func = PCI_FUNC(dev->devfn);
 
+    ASSERT(spin_is_locked(&pcidevs_lock));
     pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSI);
     control = pci_conf_read16(bus, slot, func, msi_control_reg(pos));
     /* MSI Entry Initialization */
@@ -460,31 +510,24 @@ static int msi_capability_init(struct pci_dev *dev, int vector)
     entry->vector = vector;
     if ( is_mask_bit_support(control) )
         entry->mask_base = (void __iomem *)(long)msi_mask_bits_reg(pos,
-                is_64bit_address(control));
+                                                                   is_64bit_address(control));
     entry->dev = dev;
     if ( entry->msi_attrib.maskbit )
     {
         unsigned int maskbits, temp;
         /* All MSIs are unmasked by default, Mask them all */
         maskbits = pci_conf_read32(bus, slot, func,
-                       msi_mask_bits_reg(pos, is_64bit_address(control)));
+                                   msi_mask_bits_reg(pos, is_64bit_address(control)));
         temp = (1 << multi_msi_capable(control));
         temp = ((temp - 1) & ~temp);
         maskbits |= temp;
         pci_conf_write32(bus, slot, func,
-            msi_mask_bits_reg(pos, is_64bit_address(control)),
-            maskbits);
+                         msi_mask_bits_reg(pos, is_64bit_address(control)),
+                         maskbits);
     }
     list_add_tail(&entry->list, &dev->msi_list);
 
-    /* Configure MSI capability structure */
-    ret = setup_msi_irq(dev, entry);
-    if ( ret )
-    {
-        msi_free_vector(vector);
-        return ret;
-    }
-
+    *desc = entry;
     /* Restore the original MSI enabled bits  */
     pci_conf_write16(bus, slot, func, msi_control_reg(pos), control);
 
@@ -501,13 +544,15 @@ static int msi_capability_init(struct pci_dev *dev, int vector)
  * single MSI-X irq. A return of zero indicates the successful setup of
  * requested MSI-X entries with allocated irqs or non-zero for otherwise.
  **/
-static int msix_capability_init(struct pci_dev *dev, struct msi_info *msi)
+static int msix_capability_init(struct pci_dev *dev,
+                                struct msi_info *msi,
+                                struct msi_desc **desc)
 {
     struct msi_desc *entry;
     int pos;
     u16 control;
-    unsigned long phys_addr;
-    u32 table_offset;
+    unsigned long table_paddr, entry_paddr;
+    u32 table_offset, entry_offset;
     u8 bir;
     void __iomem *base;
     int idx;
@@ -515,6 +560,9 @@ static int msix_capability_init(struct pci_dev *dev, struct msi_info *msi)
     u8 slot = PCI_SLOT(dev->devfn);
     u8 func = PCI_FUNC(dev->devfn);
 
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    ASSERT(desc);
+
     pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX);
     control = pci_conf_read16(bus, slot, func, msix_control_reg(pos));
     msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
@@ -528,15 +576,17 @@ static int msix_capability_init(struct pci_dev *dev, struct msi_info *msi)
     table_offset = pci_conf_read32(bus, slot, func, msix_table_offset_reg(pos));
     bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
     table_offset &= ~PCI_MSIX_FLAGS_BIRMASK;
-    phys_addr = msi->table_base + table_offset;
-    idx = msix_fixmap_alloc();
+    entry_offset = msi->entry_nr * PCI_MSIX_ENTRY_SIZE;
+
+    table_paddr = msi->table_base + table_offset;
+    entry_paddr = table_paddr + entry_offset;
+    idx = msix_get_fixmap(dev, table_paddr, entry_paddr);
     if ( idx < 0 )
     {
         xfree(entry);
-        return -ENOMEM;
+        return idx;
     }
-    set_fixmap_nocache(idx, phys_addr);
-    base = (void *)(fix_to_virt(idx) + (phys_addr & ((1UL << PAGE_SHIFT) - 1)));
+    base = (void *)(fix_to_virt(idx) + (entry_paddr & ((1UL << PAGE_SHIFT) - 1)));
 
     entry->msi_attrib.type = PCI_CAP_ID_MSIX;
     entry->msi_attrib.is_64 = 1;
@@ -550,9 +600,11 @@ static int msix_capability_init(struct pci_dev *dev, struct msi_info *msi)
 
     list_add_tail(&entry->list, &dev->msi_list);
 
-    setup_msi_irq(dev, entry);
+    /* Mask interrupt here */
+    writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
 
-    /* Set MSI-X enabled bits */
+    *desc = entry;
+    /* Restore MSI-X enabled bits */
     pci_conf_write16(bus, slot, func, msix_control_reg(pos), control);
 
     return 0;
@@ -568,45 +620,35 @@ static int msix_capability_init(struct pci_dev *dev, struct msi_info *msi)
  * indicates the successful setup of an entry zero with the new MSI
  * irq or non-zero for otherwise.
  **/
-static int __pci_enable_msi(struct msi_info *msi)
+static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc)
 {
     int status;
     struct pci_dev *pdev;
 
-    pdev = pci_lock_pdev(msi->bus, msi->devfn);
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev(msi->bus, msi->devfn);
     if ( !pdev )
-       return -ENODEV;
+        return -ENODEV;
 
     if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSI) )
     {
-       spin_unlock(&pdev->lock);
         dprintk(XENLOG_WARNING, "vector %d has already mapped to MSI on "
-            "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
-            PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+                "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
+                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
         return 0;
     }
 
-    status = msi_capability_init(pdev, msi->vector);
-    spin_unlock(&pdev->lock);
+    status = msi_capability_init(pdev, msi->vector, desc);
     return status;
 }
 
-static void __pci_disable_msi(int vector)
+static void __pci_disable_msi(struct msi_desc *entry)
 {
-    struct msi_desc *entry;
     struct pci_dev *dev;
     int pos;
     u16 control;
     u8 bus, slot, func;
 
-    entry = irq_desc[vector].msi_desc;
-    if ( !entry )
-       return;
-    /*
-     * Lock here is safe.  msi_desc can not be removed without holding
-     * both irq_desc[].lock (which we do) and pdev->lock.
-     */
-    spin_lock(&entry->dev->lock);
     dev = entry->dev;
     bus = dev->bus;
     slot = PCI_SLOT(dev->devfn);
@@ -618,10 +660,6 @@ static void __pci_disable_msi(int vector)
 
     BUG_ON(list_empty(&dev->msi_list));
 
-    msi_free_vector(vector);
-
-    pci_conf_write16(bus, slot, func, msi_control_reg(pos), control);
-    spin_unlock(&dev->lock);
 }
 
 /**
@@ -639,7 +677,7 @@ static void __pci_disable_msi(int vector)
  * of irqs available. Driver should use the returned value to re-send
  * its request.
  **/
-static int __pci_enable_msix(struct msi_info *msi)
+static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
 {
     int status, pos, nr_entries;
     struct pci_dev *pdev;
@@ -647,49 +685,36 @@ static int __pci_enable_msix(struct msi_info *msi)
     u8 slot = PCI_SLOT(msi->devfn);
     u8 func = PCI_FUNC(msi->devfn);
 
-    pdev = pci_lock_pdev(msi->bus, msi->devfn);
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev(msi->bus, msi->devfn);
     if ( !pdev )
-       return -ENODEV;
+        return -ENODEV;
 
     pos = pci_find_cap_offset(msi->bus, slot, func, PCI_CAP_ID_MSIX);
     control = pci_conf_read16(msi->bus, slot, func, msi_control_reg(pos));
     nr_entries = multi_msix_capable(control);
-    if (msi->entry_nr > nr_entries)
-    {
-       spin_unlock(&pdev->lock);
+    if (msi->entry_nr >= nr_entries)
         return -EINVAL;
-    }
 
     if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSIX) )
     {
-       spin_unlock(&pdev->lock);
         dprintk(XENLOG_WARNING, "vector %d has already mapped to MSIX on "
                 "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
                 PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
         return 0;
     }
 
-    status = msix_capability_init(pdev, msi);
-    spin_unlock(&pdev->lock);
+    status = msix_capability_init(pdev, msi, desc);
     return status;
 }
 
-static void __pci_disable_msix(int vector)
+static void __pci_disable_msix(struct msi_desc *entry)
 {
-    struct msi_desc *entry;
     struct pci_dev *dev;
     int pos;
     u16 control;
     u8 bus, slot, func;
 
-    entry = irq_desc[vector].msi_desc;
-    if ( !entry )
-       return;
-    /*
-     * Lock here is safe.  msi_desc can not be removed without holding
-     * both irq_desc[].lock (which we do) and pdev->lock.
-     */
-    spin_lock(&entry->dev->lock);
     dev = entry->dev;
     bus = dev->bus;
     slot = PCI_SLOT(dev->devfn);
@@ -697,54 +722,53 @@ static void __pci_disable_msix(int vector)
 
     pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX);
     control = pci_conf_read16(bus, slot, func, msix_control_reg(pos));
-    msi_set_enable(dev, 0);
+    msix_set_enable(dev, 0);
 
     BUG_ON(list_empty(&dev->msi_list));
 
-    msi_free_vector(vector);
+    writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
 
     pci_conf_write16(bus, slot, func, msix_control_reg(pos), control);
-    spin_unlock(&dev->lock);
 }
 
-int pci_enable_msi(struct msi_info *msi)
+/*
+ * Notice: only construct the msi_desc
+ * no change to irq_desc here, and the interrupt is masked
+ */
+int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc)
 {
-    ASSERT(spin_is_locked(&irq_desc[msi->vector].lock));
+    ASSERT(spin_is_locked(&pcidevs_lock));
 
-    return  msi->table_base ? __pci_enable_msix(msi) :
-                              __pci_enable_msi(msi);
+    return  msi->table_base ? __pci_enable_msix(msi, desc) :
+        __pci_enable_msi(msi, desc);
 }
 
-void pci_disable_msi(int vector)
+/*
+ * Device only, no irq_desc
+ */
+void pci_disable_msi(struct msi_desc *msi_desc)
 {
-    irq_desc_t *desc = &irq_desc[vector];
-    ASSERT(spin_is_locked(&desc->lock));
-    if ( !desc->msi_desc )
-       return;
-
-    if ( desc->msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
-        __pci_disable_msi(vector);
-    else if ( desc->msi_desc->msi_attrib.type == PCI_CAP_ID_MSIX )
-        __pci_disable_msix(vector);
+    if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
+        __pci_disable_msi(msi_desc);
+    else if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSIX )
+        __pci_disable_msix(msi_desc);
 }
 
 static void msi_free_vectors(struct pci_dev* dev)
 {
     struct msi_desc *entry, *tmp;
     irq_desc_t *desc;
-    unsigned long flags;
+    unsigned long flags, vector;
 
-retry:
     list_for_each_entry_safe( entry, tmp, &dev->msi_list, list )
     {
-        desc = &irq_desc[entry->vector];
+        vector = entry->vector;
+        desc = &irq_desc[vector];
+        pci_disable_msi(entry);
 
-        local_irq_save(flags);
-        if ( !spin_trylock(&desc->lock) )
-        {
-             local_irq_restore(flags);
-            goto retry;
-        }
+        spin_lock_irqsave(&desc->lock, flags);
+
+        teardown_msi_vector(vector);
 
         if ( desc->handler == &pci_msi_type )
         {
@@ -753,8 +777,8 @@ retry:
             desc->handler = &no_irq_type;
         }
 
-        msi_free_vector(entry->vector);
         spin_unlock_irqrestore(&desc->lock, flags);
+        msi_free_vector(entry);
     }
 }
 
@@ -766,3 +790,72 @@ void pci_cleanup_msi(struct pci_dev *pdev)
     msi_free_vectors(pdev);
 }
 
+int pci_restore_msi_state(struct pci_dev *pdev)
+{
+    unsigned long flags;
+    int vector;
+    struct msi_desc *entry, *tmp;
+    irq_desc_t *desc;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    if (!pdev)
+        return -EINVAL;
+
+    list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list )
+    {
+        vector = entry->vector;
+        desc = &irq_desc[vector];
+
+        spin_lock_irqsave(&desc->lock, flags);
+
+        ASSERT(desc->msi_desc == entry);
+
+        if (desc->msi_desc != entry)
+        {
+            dprintk(XENLOG_ERR, "Restore MSI for dev %x:%x not set before?\n",
+                                pdev->bus, pdev->devfn);
+            spin_unlock_irqrestore(&desc->lock, flags);
+            return -EINVAL;
+        }
+
+        if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
+            msi_set_enable(pdev, 0);
+        else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
+            msix_set_enable(pdev, 0);
+
+        write_msi_msg(entry, &entry->msg);
+
+        msi_set_mask_bit(vector, entry->msi_attrib.masked);
+
+        if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
+            msi_set_enable(pdev, 1);
+        else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
+            msix_set_enable(pdev, 1);
+
+        spin_unlock_irqrestore(&desc->lock, flags);
+    }
+
+    return 0;
+}
+
+unsigned int pci_msix_get_table_len(struct pci_dev *pdev)
+{
+    int pos;
+    u16 control;
+    u8 bus, slot, func;
+    unsigned int len;
+
+    bus = pdev->bus;
+    slot = PCI_SLOT(pdev->devfn);
+    func = PCI_FUNC(pdev->devfn);
+
+    pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX);
+    if ( !pos )
+        return 0;
+
+    control = pci_conf_read16(bus, slot, func, msix_control_reg(pos));
+    len = msix_table_size(control) * PCI_MSIX_ENTRY_SIZE;
+
+    return len;
+}
index 8f207358056b926d0897562a1d0c2d36c2b09456..8a1f056baf08e8b91710b2d40964cae58278c6da 100644 (file)
@@ -72,8 +72,8 @@ int nmi_active;
 #define P6_EVNTSEL_INT         (1 << 20)
 #define P6_EVNTSEL_OS          (1 << 17)
 #define P6_EVNTSEL_USR         (1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
-#define P6_NMI_EVENT           P6_EVENT_CPU_CLOCKS_NOT_HALTED
+#define P6_EVENT_CPU_CLOCKS_NOT_HALTED  0x79
+#define CORE_EVENT_CPU_CLOCKS_NOT_HALTED 0x3c
 
 #define P4_ESCR_EVENT_SELECT(N)        ((N)<<25)
 #define P4_CCCR_OVF_PMI0       (1<<26)
@@ -122,10 +122,17 @@ int __init check_nmi_watchdog (void)
 
     printk("\n");
 
-    /* now that we know it works we can reduce NMI frequency to
-       something more reasonable; makes a difference in some configs */
+    /*
+     * Now that we know it works we can reduce NMI frequency to
+     * something more reasonable; makes a difference in some configs.
+     * There's a limit to how slow we can go because writing the perfctr
+     * MSRs only sets the low 32 bits, with the top 8 bits sign-extended
+     * from those, so it's not possible to set up a delay larger than
+     * 2^31 cycles and smaller than (2^40 - 2^31) cycles. 
+     * (Intel SDM, section 18.22.2)
+     */
     if ( nmi_watchdog == NMI_LOCAL_APIC )
-        nmi_hz = 1;
+        nmi_hz = max(1ul, cpu_khz >> 20);
 
     return 0;
 }
@@ -248,7 +255,7 @@ static void __pminit setup_k7_watchdog(void)
     wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
 }
 
-static void __pminit setup_p6_watchdog(void)
+static void __pminit setup_p6_watchdog(unsigned counter)
 {
     unsigned int evntsel;
 
@@ -260,7 +267,7 @@ static void __pminit setup_p6_watchdog(void)
     evntsel = P6_EVNTSEL_INT
         | P6_EVNTSEL_OS
         | P6_EVNTSEL_USR
-        | P6_NMI_EVENT;
+        | counter;
 
     wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
     write_watchdog_counter("P6_PERFCTR0");
@@ -279,7 +286,7 @@ static int __pminit setup_p4_watchdog(void)
 
     nmi_perfctr_msr = MSR_P4_IQ_PERFCTR0;
     nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
-    if ( smp_num_siblings == 2 )
+    if ( boot_cpu_data.x86_num_siblings == 2 )
         nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
 
     if (!(misc_enable & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL))
@@ -326,7 +333,9 @@ void __pminit setup_apic_nmi_watchdog(void)
     case X86_VENDOR_INTEL:
         switch (boot_cpu_data.x86) {
         case 6:
-            setup_p6_watchdog();
+            setup_p6_watchdog((boot_cpu_data.x86_model < 14) 
+                              ? P6_EVENT_CPU_CLOCKS_NOT_HALTED
+                              : CORE_EVENT_CPU_CLOCKS_NOT_HALTED);
             break;
         case 15:
             if (!setup_p4_watchdog())
index 0debd987bccde62a90420e88b319c4102d7a3858..8ed3a1149497491c62e3bd0cfb54513520ebb4ec 100644 (file)
@@ -312,7 +312,7 @@ static void dump_numa(unsigned char key)
                for_each_online_node(i)
                        page_num_node[i] = 0;
 
-               list_for_each_entry(page, &d->page_list, list)
+               page_list_for_each(page, &d->page_list)
                {
                        i = phys_to_nid(page_to_mfn(page) << PAGE_SHIFT);
                        page_num_node[i]++;
index 44103420556452db38f2e8aa28fce38a749572f3..c01adab4ab116d9ef841e0fb53ea460d8b21a491 100644 (file)
@@ -36,6 +36,57 @@ static unsigned long saved_lvtpc[NR_CPUS];
 static char *cpu_type;
 
 extern int is_active(struct domain *d);
+extern int is_passive(struct domain *d);
+
+static int passive_domain_msr_op_checks(struct cpu_user_regs *regs ,int *typep, int *indexp)
+{
+       struct vpmu_struct *vpmu = vcpu_vpmu(current);
+       if ( model == NULL )
+               return 0;
+       if ( model->is_arch_pmu_msr == NULL )
+               return 0;
+       if ( !model->is_arch_pmu_msr((u64)regs->ecx, typep, indexp) )
+               return 0;
+
+       if ( !(vpmu->flags & PASSIVE_DOMAIN_ALLOCATED) )
+               if ( ! model->allocated_msr(current) )
+                       return 0;
+       return 1;
+}
+
+int passive_domain_do_rdmsr(struct cpu_user_regs *regs)
+{
+       u64 msr_content;
+       int type, index;
+
+       if ( !passive_domain_msr_op_checks(regs, &type, &index))
+               return 0;
+
+       model->load_msr(current, type, index, &msr_content);
+       regs->eax = msr_content & 0xFFFFFFFF;
+       regs->edx = msr_content >> 32;
+       return 1;
+}
+
+int passive_domain_do_wrmsr(struct cpu_user_regs *regs)
+{
+       u64 msr_content;
+       int type, index;
+
+       if ( !passive_domain_msr_op_checks(regs, &type, &index))
+               return 0;
+
+       msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
+       model->save_msr(current, type, index, msr_content);
+       return 1;
+}
+
+void passive_domain_destroy(struct vcpu *v)
+{
+       struct vpmu_struct *vpmu = vcpu_vpmu(v);
+       if ( vpmu->flags & PASSIVE_DOMAIN_ALLOCATED )
+               model->free_msr(v);
+}
 
 static int nmi_callback(struct cpu_user_regs *regs, int cpu)
 {
@@ -46,6 +97,8 @@ static int nmi_callback(struct cpu_user_regs *regs, int cpu)
        if ( ovf && is_active(current->domain) && !xen_mode )
                send_guest_vcpu_virq(current, VIRQ_XENOPROF);
 
+       if ( ovf == 2 ) 
+                test_and_set_bool(current->nmi_pending);
        return 1;
 }
  
@@ -273,7 +326,7 @@ static int __init p4_init(char ** cpu_type)
        model = &op_p4_spec;
        return 1;
 #else
-       switch (smp_num_siblings) {
+       switch (current_cpu_data.x86_num_siblings) {
                case 1:
                        *cpu_type = "i386/p4";
                        model = &op_p4_spec;
@@ -315,11 +368,10 @@ static int __init ppro_init(char ** cpu_type)
        case 14:
                *cpu_type = "i386/core";
                break;
-       case 15: case 23:
-               *cpu_type = "i386/core_2";
-               ppro_has_global_ctrl = 1;
-               break;
+       case 15:
+       case 23:
        case 26:
+       case 29:
                *cpu_type = "i386/core_2";
                ppro_has_global_ctrl = 1;
                break;
index 8fcb7ce0bc6edf661bffc134fccba080508da898..589fdab4bf940886ed15163a763e2ae1088bd6a6 100644 (file)
@@ -41,7 +41,7 @@ static unsigned int num_counters = NUM_COUNTERS_NON_HT;
 static inline void setup_num_counters(void)
 {
 #ifdef CONFIG_SMP
-       if (smp_num_siblings == 2)
+       if (boot_cpu_data.x86_num_siblings == 2)        /* XXX */
                num_counters = NUM_COUNTERS_HT2;
 #endif
 }
@@ -49,7 +49,7 @@ static inline void setup_num_counters(void)
 static int inline addr_increment(void)
 {
 #ifdef CONFIG_SMP
-       return smp_num_siblings == 2 ? 2 : 1;
+       return boot_cpu_data.x86_num_siblings == 2 ? 2 : 1;
 #else
        return 1;
 #endif
index 519d93104ab975aae543dadad6a7de20c4377ce6..0dc962dfe6b004c7db2a860c95aa45de5d71de0e 100644 (file)
@@ -18,6 +18,8 @@
 #include <xen/sched.h>
 #include <asm/regs.h>
 #include <asm/current.h>
+#include <asm/hvm/vmx/vpmu.h>
+#include <asm/hvm/vmx/vpmu_core2.h>
  
 #include "op_x86_model.h"
 #include "op_counter.h"
 #define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17))
 #define CTRL_SET_UM(val, m) (val |= (m << 8))
 #define CTRL_SET_EVENT(val, e) (val |= e)
-
+#define IS_ACTIVE(val) (val & (1 << 22) )  
+#define IS_ENABLE(val) (val & (1 << 20) )
 static unsigned long reset_value[NUM_COUNTERS];
 int ppro_has_global_ctrl = 0;
+extern int is_passive(struct domain *d);
  
 static void ppro_fill_in_addresses(struct op_msrs * const msrs)
 {
@@ -103,6 +107,7 @@ static int ppro_check_ctrs(unsigned int const cpu,
        int ovf = 0;
        unsigned long eip = regs->eip;
        int mode = xenoprofile_get_mode(current, regs);
+       struct arch_msr_pair *msrs_content = vcpu_vpmu(current)->context;
 
        for (i = 0 ; i < NUM_COUNTERS; ++i) {
                if (!reset_value[i])
@@ -111,7 +116,18 @@ static int ppro_check_ctrs(unsigned int const cpu,
                if (CTR_OVERFLOWED(low)) {
                        xenoprof_log_event(current, regs, eip, mode, i);
                        CTR_WRITE(reset_value[i], msrs, i);
-                       ovf = 1;
+                       if ( is_passive(current->domain) && (mode != 2) && 
+                               (vcpu_vpmu(current)->flags & PASSIVE_DOMAIN_ALLOCATED) ) 
+                       {
+                               if ( IS_ACTIVE(msrs_content[i].control) )
+                               {
+                                       msrs_content[i].counter = (low | (u64)high << 32);
+                                       if ( IS_ENABLE(msrs_content[i].control) )
+                                               ovf = 2;
+                               }
+                       }
+                       if ( !ovf )
+                               ovf = 1;
                }
        }
 
@@ -159,6 +175,84 @@ static void ppro_stop(struct op_msrs const * const msrs)
         wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 }
 
+static int ppro_is_arch_pmu_msr(u64 msr_index, int *type, int *index)
+{
+       if ( (msr_index >= MSR_IA32_PERFCTR0) &&
+            (msr_index < (MSR_IA32_PERFCTR0 + NUM_COUNTERS)) )
+       {
+               *type = MSR_TYPE_ARCH_COUNTER;
+               *index = msr_index - MSR_IA32_PERFCTR0;
+               return 1;
+        }
+        if ( (msr_index >= MSR_P6_EVNTSEL0) &&
+            (msr_index < (MSR_P6_EVNTSEL0 + NUM_CONTROLS)) )
+        {
+               *type = MSR_TYPE_ARCH_CTRL;
+               *index = msr_index - MSR_P6_EVNTSEL0;
+               return 1;
+        }
+
+        return 0;
+}
+
+static int ppro_allocate_msr(struct vcpu *v)
+{
+       struct vpmu_struct *vpmu = vcpu_vpmu(v);
+       struct arch_msr_pair *msr_content;
+       
+       msr_content = xmalloc_bytes( sizeof(struct arch_msr_pair) * NUM_COUNTERS );
+       if ( !msr_content )
+               goto out;
+       memset(msr_content, 0, sizeof(struct arch_msr_pair) * NUM_COUNTERS);
+       vpmu->context = (void *)msr_content;
+       vpmu->flags = 0;
+       vpmu->flags |= PASSIVE_DOMAIN_ALLOCATED;
+       return 1;
+out:
+        gdprintk(XENLOG_WARNING, "Insufficient memory for oprofile, oprofile is "
+                 "unavailable on domain %d vcpu %d.\n",
+                 v->vcpu_id, v->domain->domain_id);
+        return 0;      
+}
+
+static void ppro_free_msr(struct vcpu *v)
+{
+       struct vpmu_struct *vpmu = vcpu_vpmu(v);
+
+       if ( !(vpmu->flags & PASSIVE_DOMAIN_ALLOCATED) )
+               return;
+       xfree(vpmu->context);
+       vpmu->flags &= ~PASSIVE_DOMAIN_ALLOCATED;
+}
+
+static void ppro_load_msr(struct vcpu *v, int type, int index, u64 *msr_content)
+{
+       struct arch_msr_pair *msrs = vcpu_vpmu(v)->context;
+       switch ( type )
+       {
+       case MSR_TYPE_ARCH_COUNTER:
+               *msr_content = msrs[index].counter;
+               break;
+       case MSR_TYPE_ARCH_CTRL:
+               *msr_content = msrs[index].control;
+               break;
+       }       
+}
+
+static void ppro_save_msr(struct vcpu *v, int type, int index, u64 msr_content)
+{
+       struct arch_msr_pair *msrs = vcpu_vpmu(v)->context;
+       
+       switch ( type )
+       {
+       case MSR_TYPE_ARCH_COUNTER:
+               msrs[index].counter = msr_content;
+               break;
+       case MSR_TYPE_ARCH_CTRL:
+               msrs[index].control = msr_content;
+               break;
+       }       
+}
 
 struct op_x86_model_spec const op_ppro_spec = {
        .num_counters = NUM_COUNTERS,
@@ -167,5 +261,10 @@ struct op_x86_model_spec const op_ppro_spec = {
        .setup_ctrs = &ppro_setup_ctrs,
        .check_ctrs = &ppro_check_ctrs,
        .start = &ppro_start,
-       .stop = &ppro_stop
+       .stop = &ppro_stop,
+       .is_arch_pmu_msr = &ppro_is_arch_pmu_msr,
+       .allocated_msr = &ppro_allocate_msr,
+       .free_msr = &ppro_free_msr,
+       .load_msr = &ppro_load_msr,
+       .save_msr = &ppro_save_msr
 };
index 2858e8bf908dd8d264453b826d5e71b2e8ef074b..8568bd7f25aade814a3e3c5e74225bfba234e33e 100644 (file)
@@ -41,6 +41,11 @@ struct op_x86_model_spec {
                          struct cpu_user_regs * const regs);
        void (*start)(struct op_msrs const * const msrs);
        void (*stop)(struct op_msrs const * const msrs);
+       int (*is_arch_pmu_msr)(u64 msr_index, int *type, int *index);
+       int (*allocated_msr)(struct vcpu *v);
+       void (*free_msr)(struct vcpu *v);
+       void (*load_msr)(struct vcpu * const v, int type, int index, u64 *msr_content);
+        void (*save_msr)(struct vcpu * const v, int type, int index, u64 msr_content);
 };
 
 extern struct op_x86_model_spec const op_ppro_spec;
index 55e54ae669ce178843ce4f9ca284dd805c591733..52633a6ed69552b53af08c9cf5f795819dcdde16 100644 (file)
@@ -14,6 +14,7 @@
 #include <public/xen.h>
 #include <public/physdev.h>
 #include <xsm/xsm.h>
+#include <asm/p2m.h>
 
 #ifndef COMPAT
 typedef long ret_t;
@@ -61,7 +62,7 @@ static int physdev_map_pirq(struct physdev_map_pirq *map)
                 ret = -EINVAL;
                 goto free_domain;
             }
-            vector = IO_APIC_VECTOR(map->index);
+            vector = domain_irq_to_vector(current->domain, map->index);
             if ( !vector )
             {
                 dprintk(XENLOG_G_ERR, "dom%d: map irq with no vector %d\n",
@@ -74,7 +75,7 @@ static int physdev_map_pirq(struct physdev_map_pirq *map)
         case MAP_PIRQ_TYPE_MSI:
             vector = map->index;
             if ( vector == -1 )
-                vector = assign_irq_vector(AUTO_ASSIGN);
+                vector = assign_irq_vector(AUTO_ASSIGN_IRQ);
 
             if ( vector < 0 || vector >= NR_VECTORS )
             {
@@ -99,16 +100,17 @@ static int physdev_map_pirq(struct physdev_map_pirq *map)
             goto free_domain;
     }
 
+    spin_lock(&pcidevs_lock);
     /* Verify or get pirq. */
     spin_lock(&d->event_lock);
+    pirq = domain_vector_to_irq(d, vector);
     if ( map->pirq < 0 )
     {
-        if ( d->arch.vector_pirq[vector] )
+        if ( pirq )
         {
             dprintk(XENLOG_G_ERR, "dom%d: %d:%d already mapped to %d\n",
                     d->domain_id, map->index, map->pirq,
-                    d->arch.vector_pirq[vector]);
-            pirq = d->arch.vector_pirq[vector];
+                    pirq);
             if ( pirq < 0 )
             {
                 ret = -EBUSY;
@@ -128,8 +130,7 @@ static int physdev_map_pirq(struct physdev_map_pirq *map)
     }
     else
     {
-        if ( d->arch.vector_pirq[vector] &&
-             d->arch.vector_pirq[vector] != map->pirq )
+        if ( pirq && pirq != map->pirq )
         {
             dprintk(XENLOG_G_ERR, "dom%d: vector %d conflicts with irq %d\n",
                     d->domain_id, map->index, map->pirq);
@@ -146,6 +147,7 @@ static int physdev_map_pirq(struct physdev_map_pirq *map)
 
 done:
     spin_unlock(&d->event_lock);
+    spin_unlock(&pcidevs_lock);
     if ( (ret != 0) && (map->type == MAP_PIRQ_TYPE_MSI) && (map->index == -1) )
         free_irq_vector(vector);
 free_domain:
@@ -169,9 +171,11 @@ static int physdev_unmap_pirq(struct physdev_unmap_pirq *unmap)
     if ( d == NULL )
         return -ESRCH;
 
+    spin_lock(&pcidevs_lock);
     spin_lock(&d->event_lock);
     ret = unmap_domain_pirq(d, unmap->pirq);
     spin_unlock(&d->event_lock);
+    spin_unlock(&pcidevs_lock);
 
     rcu_unlock_domain(d);
 
@@ -191,10 +195,52 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
         ret = -EFAULT;
         if ( copy_from_guest(&eoi, arg, 1) != 0 )
             break;
+        ret = -EINVAL;
+        if ( eoi.irq < 0 || eoi.irq >= NR_IRQS )
+            break;
+        if ( v->domain->arch.pirq_eoi_map )
+            evtchn_unmask(v->domain->pirq_to_evtchn[eoi.irq]);
         ret = pirq_guest_eoi(v->domain, eoi.irq);
         break;
     }
 
+    case PHYSDEVOP_pirq_eoi_gmfn: {
+        struct physdev_pirq_eoi_gmfn info;
+        unsigned long mfn;
+
+        BUILD_BUG_ON(NR_IRQS > (PAGE_SIZE * 8));
+
+        ret = -EFAULT;
+        if ( copy_from_guest(&info, arg, 1) != 0 )
+            break;
+
+        ret = -EINVAL;
+        mfn = gmfn_to_mfn(current->domain, info.gmfn);
+        if ( !mfn_valid(mfn) ||
+             !get_page_and_type(mfn_to_page(mfn), v->domain,
+                                PGT_writable_page) )
+            break;
+
+        if ( cmpxchg(&v->domain->arch.pirq_eoi_map_mfn, 0, mfn) != 0 )
+        {
+            put_page_and_type(mfn_to_page(mfn));
+            ret = -EBUSY;
+            break;
+        }
+
+        v->domain->arch.pirq_eoi_map = map_domain_page_global(mfn);
+        if ( v->domain->arch.pirq_eoi_map == NULL )
+        {
+            v->domain->arch.pirq_eoi_map_mfn = 0;
+            put_page_and_type(mfn_to_page(mfn));
+            ret = -ENOSPC;
+            break;
+        }
+
+        ret = 0;
+        break;
+    }
+
     /* Legacy since 0x00030202. */
     case PHYSDEVOP_IRQ_UNMASK_NOTIFY: {
         ret = pirq_guest_unmask(v->domain);
@@ -211,8 +257,15 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
         if ( (irq < 0) || (irq >= NR_IRQS) )
             break;
         irq_status_query.flags = 0;
-        if ( pirq_acktype(v->domain, irq) != 0 )
-            irq_status_query.flags |= XENIRQSTAT_needs_eoi;
+        /*
+         * Even edge-triggered or message-based IRQs can need masking from
+         * time to time. If the guest is not dynamically checking for this
+         * via the new pirq_eoi_map mechanism, it must conservatively always
+         * execute the EOI hypercall. In practice, this only really makes a
+         * difference for maskable MSI sources, and if those are supported
+         * then dom0 is probably modern anyway.
+         */
+        irq_status_query.flags |= XENIRQSTAT_needs_eoi;
         if ( pirq_shared(v->domain, irq) )
             irq_status_query.flags |= XENIRQSTAT_shared;
         ret = copy_to_guest(arg, &irq_status_query, 1) ? -EFAULT : 0;
@@ -298,10 +351,12 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
 
         irq_op.vector = assign_irq_vector(irq);
 
+        spin_lock(&pcidevs_lock);
         spin_lock(&dom0->event_lock);
         ret = map_domain_pirq(dom0, irq_op.irq, irq_op.vector,
                               MAP_PIRQ_TYPE_GSI, NULL);
         spin_unlock(&dom0->event_lock);
+        spin_unlock(&pcidevs_lock);
 
         if ( copy_to_guest(arg, &irq_op, 1) != 0 )
             ret = -EFAULT;
@@ -366,6 +421,50 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
         break;
     }
 
+    case PHYSDEVOP_manage_pci_add_ext: {
+        struct physdev_manage_pci_ext manage_pci_ext;
+        struct pci_dev_info pdev_info;
+
+        ret = -EPERM;
+        if ( !IS_PRIV(current->domain) )
+            break;
+
+        ret = -EFAULT;
+        if ( copy_from_guest(&manage_pci_ext, arg, 1) != 0 )
+            break;
+
+        ret = -EINVAL;
+        if ( (manage_pci_ext.is_extfn > 1) || (manage_pci_ext.is_virtfn > 1) )
+            break;
+
+        pdev_info.is_extfn = manage_pci_ext.is_extfn;
+        pdev_info.is_virtfn = manage_pci_ext.is_virtfn;
+        pdev_info.physfn.bus = manage_pci_ext.physfn.bus;
+        pdev_info.physfn.devfn = manage_pci_ext.physfn.devfn;
+        ret = pci_add_device_ext(manage_pci_ext.bus,
+                                 manage_pci_ext.devfn,
+                                 &pdev_info);
+        break;
+    }
+
+    case PHYSDEVOP_restore_msi: {
+        struct physdev_restore_msi restore_msi;
+        struct pci_dev *pdev;
+
+        ret = -EPERM;
+        if ( !IS_PRIV(v->domain) )
+            break;
+
+        ret = -EFAULT;
+        if ( copy_from_guest(&restore_msi, arg, 1) != 0 )
+            break;
+
+        spin_lock(&pcidevs_lock);
+        pdev = pci_get_pdev(restore_msi.bus, restore_msi.devfn);
+        ret = pdev ? pci_restore_msi_state(pdev) : -ENODEV;
+        spin_unlock(&pcidevs_lock);
+        break;
+    }
     default:
         ret = -ENOSYS;
         break;
index 99060a0d56db9ac85d7b99558556bed7d5a8cc38..4bf677792325d6af92f055d4b18c96b780554dbf 100644 (file)
@@ -53,15 +53,6 @@ static long cpu_frequency_change_helper(void *data)
     return cpu_frequency_change(this_cpu(freq));
 }
 
-int xenpf_copy_px_states(struct processor_performance *pxpt,
-        struct xen_processor_performance *dom0_px_info)
-{
-    if (!pxpt || !dom0_px_info)
-        return -EINVAL;
-    return  copy_from_compat(pxpt->states, dom0_px_info->states, 
-                    dom0_px_info->state_count);
-}
-
 ret_t do_platform_op(XEN_GUEST_HANDLE(xen_platform_op_t) u_xenpf_op)
 {
     ret_t ret = 0;
@@ -346,16 +337,8 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xen_platform_op_t) u_xenpf_op)
         for_each_cpu_mask ( cpu, cpumap )
         {
             if ( (v = idle_vcpu[cpu]) != NULL )
-            {
-                idletime = v->runstate.time[RUNSTATE_running];
-                if ( v->is_running )
-                    idletime += now - v->runstate.state_entry_time;
-            }
-            else
-            {
-                idletime = 0;
                 cpu_clear(cpu, cpumap);
-            }
+            idletime = get_cpu_idle_time(cpu);
 
             ret = -EFAULT;
             if ( copy_to_guest_offset(idletimes, cpu, &idletime, 1) )
@@ -372,12 +355,13 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xen_platform_op_t) u_xenpf_op)
         switch ( op->u.set_pminfo.type )
         {
         case XEN_PM_PX:
-        {
-
-            ret = set_px_pminfo(op->u.set_pminfo.id,
-                                &op->u.set_pminfo.perf);
+            if ( !(xen_processor_pmbits & XEN_PROCESSOR_PM_PX) )
+            {
+                ret = -ENOSYS;
+                break;
+            }
+            ret = set_px_pminfo(op->u.set_pminfo.id, &op->u.set_pminfo.perf);
             break;
-        }
  
         case XEN_PM_CX:
             if ( !(xen_processor_pmbits & XEN_PROCESSOR_PM_CX) )
index 967014e30185e9165f1f7de7a7d5970124c6bb75..8cf4190144c1051eeed970975d4b7a4c432a7f13 100644 (file)
@@ -39,6 +39,8 @@
 #include <xsm/xsm.h>
 #include <asm/tboot.h>
 
+int __init bzimage_headroom(char *image_start, unsigned long image_length);
+
 #if defined(CONFIG_X86_64)
 #define BOOTSTRAP_DIRECTMAP_END (1UL << 32) /* 4GB */
 #define maddr_to_bootstrap_virt(m) maddr_to_virt(m)
@@ -54,15 +56,6 @@ extern u16 boot_edid_caps;
 extern u8 boot_edid_info[128];
 extern struct boot_video_info boot_vid_info;
 
-/*
- * opt_xenheap_megabytes: Size of Xen heap in megabytes, excluding the
- * page_info table and allocation bitmap.
- */
-static unsigned int opt_xenheap_megabytes = XENHEAP_DEFAULT_MB;
-#if defined(CONFIG_X86_64)
-integer_param("xenheap_megabytes", opt_xenheap_megabytes);
-#endif
-
 /* opt_nosmp: If true, secondary processors are ignored. */
 static int opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
@@ -96,7 +89,7 @@ boolean_param("noapic", skip_ioapic_setup);
 
 /* **** Linux config option: propagated to domain0. */
 /* xen_cpuidle: xen control cstate. */
-/*static*/ int xen_cpuidle;
+/*static*/ int xen_cpuidle = 1;
 boolean_param("cpuidle", xen_cpuidle);
 
 int early_boot = 1;
@@ -104,9 +97,12 @@ int early_boot = 1;
 cpumask_t cpu_present_map;
 
 unsigned long xen_phys_start;
+unsigned long allocator_bitmap_end;
 
+#ifdef CONFIG_X86_32
 /* Limits of Xen heap, used to initialise the allocator. */
-unsigned long xenheap_phys_start, xenheap_phys_end;
+unsigned long xenheap_initial_phys_start, xenheap_phys_end;
+#endif
 
 extern void arch_init_memory(void);
 extern void init_IRQ(void);
@@ -178,19 +174,21 @@ static void __init do_initcalls(void)
     for ( ; ; ) halt();                         \
 } while (0)
 
-static unsigned long __initdata initial_images_start, initial_images_end;
+static unsigned long __initdata initial_images_base;
+static unsigned long __initdata initial_images_start;
+static unsigned long __initdata initial_images_end;
 
 unsigned long __init initial_images_nrpages(void)
 {
-    ASSERT(!(initial_images_start & ~PAGE_MASK));
+    ASSERT(!(initial_images_base & ~PAGE_MASK));
     ASSERT(!(initial_images_end   & ~PAGE_MASK));
     return ((initial_images_end >> PAGE_SHIFT) -
-            (initial_images_start >> PAGE_SHIFT));
+            (initial_images_base >> PAGE_SHIFT));
 }
 
 void __init discard_initial_images(void)
 {
-    init_domheap_pages(initial_images_start, initial_images_end);
+    init_domheap_pages(initial_images_base, initial_images_end);
 }
 
 extern char __per_cpu_start[], __per_cpu_data_end[], __per_cpu_end[];
@@ -230,7 +228,6 @@ static void __init percpu_init_areas(void)
 static void __init init_idle_domain(void)
 {
     struct domain *idle_domain;
-    unsigned int i;
 
     /* Domain creation requires that scheduler structures are initialised. */
     scheduler_init();
@@ -243,12 +240,6 @@ static void __init init_idle_domain(void)
     idle_vcpu[0] = this_cpu(curr_vcpu) = current;
 
     setup_idle_pagetable();
-
-    for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
-        idle_domain->arch.mm_perdomain_pt[FIRST_RESERVED_GDT_PAGE + i] =
-            l1e_from_page(virt_to_page(boot_cpu_gdt_table) + i,
-                          __PAGE_HYPERVISOR);
-
 }
 
 static void __init srat_detect_node(int cpu)
@@ -393,6 +384,7 @@ void init_done(void)
     extern char __init_begin[], __init_end[];
 
     /* Free (or page-protect) the init areas. */
+    memset(__init_begin, 0xcc, __init_end - __init_begin); /* int3 poison */
 #ifndef MEMORY_GUARD
     init_xenheap_pages(__pa(__init_begin), __pa(__init_end));
 #endif
@@ -426,7 +418,7 @@ void __init __start_xen(unsigned long mbi_p)
     unsigned int initrdidx = 1;
     multiboot_info_t *mbi = __va(mbi_p);
     module_t *mod = (module_t *)__va(mbi->mods_addr);
-    unsigned long nr_pages, modules_length;
+    unsigned long nr_pages, modules_length, modules_headroom;
     int i, e820_warn = 0, bytes = 0;
     struct ns16550_defaults ns16550 = {
         .data_bits = 8,
@@ -456,6 +448,7 @@ void __init __start_xen(unsigned long mbi_p)
     parse_video_info();
 
     set_current((struct vcpu *)0xfffff000); /* debug sanity */
+    idle_vcpu[0] = current;
     set_processor_id(0); /* needed early, for smp_processor_id() */
     if ( cpu_has_efer )
         rdmsrl(MSR_EFER, this_cpu(efer));
@@ -604,23 +597,6 @@ void __init __start_xen(unsigned long mbi_p)
     /* Sanitise the raw E820 map to produce a final clean version. */
     max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr);
 
-#ifdef CONFIG_X86_64
-    /*
-     * On x86/64 we are able to account for the allocation bitmap
-     * (allocated in common/page_alloc.c:init_boot_allocator()) stealing
-     * from the Xen heap. Here we make the Xen heap appropriately larger.
-     */
-    opt_xenheap_megabytes += (max_page / 8) >> 20;
-#endif
-
-    /*
-     * Since there are some stubs getting built on the stacks which use
-     * direct calls/jumps, the heap must be confined to the lower 2G so
-     * that those branches can reach their targets.
-     */
-    if ( opt_xenheap_megabytes > 2048 )
-        opt_xenheap_megabytes = 2048;
-
     /* Create a temporary copy of the E820 map. */
     memcpy(&boot_e820, &e820, sizeof(e820));
 
@@ -641,6 +617,13 @@ void __init __start_xen(unsigned long mbi_p)
      * x86/64, we relocate Xen to higher memory.
      */
     modules_length = mod[mbi->mods_count-1].mod_end - mod[0].mod_start;
+
+    /* ensure mod[0] is mapped before parsing */
+    bootstrap_map(mod[0].mod_start, mod[0].mod_end);
+    modules_headroom = bzimage_headroom(
+                      (char *)(unsigned long)mod[0].mod_start,
+                      (unsigned long)(mod[0].mod_end - mod[0].mod_start));
+
     for ( i = boot_e820.nr_map-1; i >= 0; i-- )
     {
         uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
@@ -659,8 +642,10 @@ void __init __start_xen(unsigned long mbi_p)
             s >> PAGE_SHIFT, (e-s) >> PAGE_SHIFT, PAGE_HYPERVISOR);
 
 #if defined(CONFIG_X86_64)
+/* Relocate Xen image, allocation bitmap, and one page of padding. */
+#define reloc_size ((__pa(&_end) + max_page/8 + PAGE_SIZE + mask) & ~mask)
         /* Is the region suitable for relocating Xen? */
-        if ( !xen_phys_start && (((e-s) >> 20) >= opt_xenheap_megabytes) )
+        if ( !xen_phys_start && ((e-s) >= reloc_size) )
         {
             extern l2_pgentry_t l2_xenmap[];
             l4_pgentry_t *pl4e;
@@ -669,7 +654,7 @@ void __init __start_xen(unsigned long mbi_p)
             int i, j, k;
 
             /* Select relocation address. */
-            e = (e - (opt_xenheap_megabytes << 20)) & ~mask;
+            e -= reloc_size;
             xen_phys_start = e;
             bootsym(trampoline_xen_phys_start) = e;
 
@@ -744,12 +729,15 @@ void __init __start_xen(unsigned long mbi_p)
 #endif
 
         /* Is the region suitable for relocating the multiboot modules? */
-        if ( !initial_images_start && (s < e) && ((e-s) >= modules_length) )
+        if ( !initial_images_start && (s < e) &&
+             ((e-s) >= (modules_length+modules_headroom)) )
         {
             initial_images_end = e;
             e = (e - modules_length) & PAGE_MASK;
             initial_images_start = e;
-            move_memory(initial_images_start, 
+            e -= modules_headroom;
+            initial_images_base = e;
+            move_memory(initial_images_start,
                         mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
         }
 
@@ -763,17 +751,17 @@ void __init __start_xen(unsigned long mbi_p)
 
     if ( !initial_images_start )
         EARLY_FAIL("Not enough memory to relocate the dom0 kernel image.\n");
-    reserve_e820_ram(&boot_e820, initial_images_start, initial_images_end);
+    reserve_e820_ram(&boot_e820, initial_images_base, initial_images_end);
 
-    /* Initialise Xen heap and boot heap. */
-    xenheap_phys_start = init_boot_allocator(__pa(&_end));
-    xenheap_phys_end   = opt_xenheap_megabytes << 20;
-#if defined(CONFIG_X86_64)
+    /* Initialise boot heap. */
+    allocator_bitmap_end = init_boot_allocator(__pa(&_end));
+#if defined(CONFIG_X86_32)
+    xenheap_initial_phys_start = allocator_bitmap_end;
+    xenheap_phys_end = DIRECTMAP_MBYTES << 20;
+#else
     if ( !xen_phys_start )
         EARLY_FAIL("Not enough memory to relocate Xen.\n");
-    xenheap_phys_end += xen_phys_start;
-    reserve_e820_ram(&boot_e820, xen_phys_start,
-                     xen_phys_start + (opt_xenheap_megabytes<<20));
+    reserve_e820_ram(&boot_e820, __pa(&_start), allocator_bitmap_end);
 #endif
 
     /* Late kexec reservation (dynamic start address). */
@@ -866,23 +854,22 @@ void __init __start_xen(unsigned long mbi_p)
 
     numa_initmem_init(0, max_page);
 
-    /* Initialise the Xen heap, skipping RAM holes. */
-    init_xenheap_pages(xenheap_phys_start, xenheap_phys_end);
-    nr_pages = (xenheap_phys_end - xenheap_phys_start) >> PAGE_SHIFT;
-#ifdef __x86_64__
-    init_xenheap_pages(xen_phys_start, __pa(&_start));
-    nr_pages += (__pa(&_start) - xen_phys_start) >> PAGE_SHIFT;
-    vesa_init();
-#endif
-    xenheap_phys_start = xen_phys_start;
+#if defined(CONFIG_X86_32)
+    /* Initialise the Xen heap. */
+    init_xenheap_pages(xenheap_initial_phys_start, xenheap_phys_end);
+    nr_pages = (xenheap_phys_end - xenheap_initial_phys_start) >> PAGE_SHIFT;
     printk("Xen heap: %luMB (%lukB)\n", 
            nr_pages >> (20 - PAGE_SHIFT),
            nr_pages << (PAGE_SHIFT - 10));
+#endif
 
     end_boot_allocator();
-
     early_boot = 0;
 
+#if defined(CONFIG_X86_64)
+    vesa_init();
+#endif
+
     softirq_init();
 
     early_cpu_init();
@@ -947,6 +934,9 @@ void __init __start_xen(unsigned long mbi_p)
         set_in_cr4(X86_CR4_OSFXSR);
     if ( cpu_has_xmm )
         set_in_cr4(X86_CR4_OSXMMEXCPT);
+
+    local_irq_enable();
+
 #ifdef CONFIG_X86_64
     vesa_mtrr_init();
 #endif
@@ -956,6 +946,8 @@ void __init __start_xen(unsigned long mbi_p)
 
     smp_prepare_cpus(max_cpus);
 
+    spin_debug_enable();
+
     /*
      * Initialise higher-level timer functions. We do this fairly late
      * (post-SMP) because the time bases and scale factors need to be updated 
@@ -968,8 +960,6 @@ void __init __start_xen(unsigned long mbi_p)
 
     serial_init_postirq();
 
-    BUG_ON(!local_irq_is_enabled());
-
     for_each_present_cpu ( i )
     {
         if ( num_online_cpus() >= max_cpus )
@@ -995,9 +985,12 @@ void __init __start_xen(unsigned long mbi_p)
 
     if ( opt_watchdog ) 
         watchdog_enable();
+    
+    if ( !tboot_protect_mem_regions() )
+        panic("Could not protect TXT memory regions\n");
 
     /* Create initial domain 0. */
-    dom0 = domain_create(0, 0, DOM0_SSIDREF);
+    dom0 = domain_create(0, DOMCRF_s3_integrity, DOM0_SSIDREF);
     if ( (dom0 == NULL) || (alloc_vcpu(dom0, 0, 0) == NULL) )
         panic("Error creating domain 0\n");
 
@@ -1052,7 +1045,8 @@ void __init __start_xen(unsigned long mbi_p)
      * above our heap. The second module, if present, is an initrd ramdisk.
      */
     if ( construct_dom0(dom0,
-                        initial_images_start, 
+                        initial_images_base,
+                        initial_images_start,
                         mod[0].mod_end-mod[0].mod_start,
                         _initrd_start,
                         _initrd_len,
@@ -1117,10 +1111,43 @@ void arch_get_xen_caps(xen_capabilities_info_t *info)
 
 int xen_in_range(paddr_t start, paddr_t end)
 {
-    start = max_t(paddr_t, start, xenheap_phys_start);
-    end = min_t(paddr_t, end, xenheap_phys_end);
-    return start < end; 
+    int i;
+    static struct {
+        paddr_t s, e;
+    } xen_regions[5];
+
+    /* initialize first time */
+    if ( !xen_regions[0].s )
+    {
+        extern char __init_begin[], __per_cpu_start[], __per_cpu_end[],
+                    __bss_start[];
+        extern unsigned long allocator_bitmap_end;
+
+        /* S3 resume code (and other real mode trampoline code) */
+        xen_regions[0].s = bootsym_phys(trampoline_start);
+        xen_regions[0].e = bootsym_phys(trampoline_end);
+        /* hypervisor code + data */
+        xen_regions[1].s =__pa(&_stext);
+        xen_regions[1].e = __pa(&__init_begin);
+        /* per-cpu data */
+        xen_regions[2].s = __pa(&__per_cpu_start);
+        xen_regions[2].e = __pa(&__per_cpu_end);
+        /* bss + boot allocator bitmap */
+        xen_regions[3].s = __pa(&__bss_start);
+        xen_regions[3].e = allocator_bitmap_end;
+        /* frametable */
+        xen_regions[4].s = (unsigned long)frame_table;
+        xen_regions[4].e = (unsigned long)frame_table +
+                           PFN_UP(max_page * sizeof(*frame_table));
+    }
+
+    for ( i = 0; i < ARRAY_SIZE(xen_regions); i++ )
+    {
+        if ( (start < xen_regions[i].e) && (end > xen_regions[i].s) )
+            return 1;
+    }
+
+    return 0;
 }
 
 /*
index 2e8c622963aad12a374d14f089a1e1f72e56fe39..18340e5b25a321e38af323d302db770f836a865d 100644 (file)
@@ -302,6 +302,7 @@ void machine_restart(unsigned int delay_millisecs)
 
     watchdog_disable();
     console_start_sync();
+    spin_debug_disable();
 
     local_irq_enable();
 
index 69218fd322ad5e90514cb7a336efaff4021f6b83..64b55ebc86cb0c345224e1acc5e89551d9a4f0f9 100644 (file)
 /* Set if we find a B stepping CPU */
 static int __devinitdata smp_b_stepping;
 
-/* Number of siblings per CPU package */
-int smp_num_siblings = 1;
-#ifdef CONFIG_X86_HT
-EXPORT_SYMBOL(smp_num_siblings);
-#endif
-
 /* Package ID of each logical CPU */
 int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
 
@@ -101,7 +95,7 @@ static cpumask_t smp_commenced_mask;
 static int __devinitdata tsc_sync_disabled;
 
 /* Per CPU bogomips and other parameters */
-struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
+struct cpuinfo_x86 cpu_data[NR_CPUS];
 EXPORT_SYMBOL(cpu_data);
 
 u32 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
@@ -112,7 +106,7 @@ static void map_cpu_to_logical_apicid(void);
 /* State of each CPU. */
 DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
-static void *stack_base[NR_CPUS] __cacheline_aligned;
+static void *stack_base[NR_CPUS];
 static DEFINE_SPINLOCK(cpu_add_remove_lock);
 
 /*
@@ -391,9 +385,7 @@ void __devinit smp_callin(void)
        /*
         * Save our processor parameters
         */
-       smp_store_cpu_info(cpuid);
-
-       disable_APIC_timer();
+       smp_store_cpu_info(cpuid);
 
        /*
         * Allow the master to continue.
@@ -423,7 +415,7 @@ set_cpu_sibling_map(int cpu)
 
        cpu_set(cpu, cpu_sibling_setup_map);
 
-       if (smp_num_siblings > 1) {
+       if (c[cpu].x86_num_siblings > 1) {
                for_each_cpu_mask(i, cpu_sibling_setup_map) {
                        if (phys_proc_id[cpu] == phys_proc_id[i] &&
                            cpu_core_id[cpu] == cpu_core_id[i]) {
@@ -437,7 +429,7 @@ set_cpu_sibling_map(int cpu)
                cpu_set(cpu, cpu_sibling_map[cpu]);
        }
 
-       if (current_cpu_data.x86_max_cores == 1) {
+       if (c[cpu].x86_max_cores == 1) {
                cpu_core_map[cpu] = cpu_sibling_map[cpu];
                c[cpu].booted_cores = 1;
                return;
@@ -473,13 +465,6 @@ static void construct_percpu_idt(unsigned int cpu)
 {
        unsigned char idt_load[10];
 
-       /* If IDT table exists since last hotplug, reuse it */
-       if (!idt_tables[cpu]) {
-               idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
-               memcpy(idt_tables[cpu], idt_table,
-                               IDT_ENTRIES*sizeof(idt_entry_t));
-       }
-
        *(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1;
        *(unsigned long  *)(&idt_load[2]) = (unsigned long)idt_tables[cpu];
        __asm__ __volatile__ ( "lidt %0" : "=m" (idt_load) );
@@ -538,6 +523,8 @@ void __devinit start_secondary(void *unused)
        /* We can take interrupts now: we're officially "up". */
        local_irq_enable();
 
+       microcode_resume_cpu(cpu);
+
        wmb();
        startup_cpu_idle_loop();
 }
@@ -812,18 +799,10 @@ static inline int alloc_cpu_id(void)
        return cpu;
 }
 
-static struct vcpu *prepare_idle_vcpu(unsigned int cpu)
-{
-       if (idle_vcpu[cpu])
-               return idle_vcpu[cpu];
-
-       return alloc_idle_vcpu(cpu);
-}
-
 static void *prepare_idle_stack(unsigned int cpu)
 {
        if (!stack_base[cpu])
-               stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER);
+               stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, 0);
 
        return stack_base[cpu];
 }
@@ -836,7 +815,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
  */
 {
        unsigned long boot_error;
-       unsigned int i;
+       unsigned int order;
        int timeout;
        unsigned long start_eip;
        unsigned short nmi_high = 0, nmi_low = 0;
@@ -856,7 +835,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
 
        booting_cpu = cpu;
 
-       v = prepare_idle_vcpu(cpu);
+       v = alloc_idle_vcpu(cpu);
        BUG_ON(v == NULL);
 
        /* start_eip had better be page-aligned! */
@@ -872,21 +851,21 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
 
        gdt = per_cpu(gdt_table, cpu);
        if (gdt == boot_cpu_gdt_table) {
-               i = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+               order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
 #ifdef __x86_64__
 #ifdef CONFIG_COMPAT
-               page = alloc_domheap_pages(NULL, i,
+               page = alloc_domheap_pages(NULL, order,
                                           MEMF_node(cpu_to_node(cpu)));
                per_cpu(compat_gdt_table, cpu) = gdt = page_to_virt(page);
                memcpy(gdt, boot_cpu_compat_gdt_table,
                       NR_RESERVED_GDT_PAGES * PAGE_SIZE);
                gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
 #endif
-               page = alloc_domheap_pages(NULL, i,
+               page = alloc_domheap_pages(NULL, order,
                                           MEMF_node(cpu_to_node(cpu)));
                per_cpu(gdt_table, cpu) = gdt = page_to_virt(page);
 #else
-               per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(i);
+               per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, 0);
 #endif
                memcpy(gdt, boot_cpu_gdt_table,
                       NR_RESERVED_GDT_PAGES * PAGE_SIZE);
@@ -894,13 +873,6 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
                gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
        }
 
-       for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
-               v->domain->arch.mm_perdomain_pt
-                       [(v->vcpu_id << GDT_LDT_VCPU_SHIFT) +
-                        FIRST_RESERVED_GDT_PAGE + i]
-                       = l1e_from_page(virt_to_page(gdt) + i,
-                                       __PAGE_HYPERVISOR);
-
 #ifdef __i386__
        if (!per_cpu(doublefault_tss, cpu)) {
                per_cpu(doublefault_tss, cpu) = alloc_xenheap_page();
@@ -908,6 +880,12 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
        }
 #endif
 
+       if (!idt_tables[cpu]) {
+               idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
+               memcpy(idt_tables[cpu], idt_table,
+                      IDT_ENTRIES*sizeof(idt_entry_t));
+       }
+
        /*
         * This grunge runs the startup process for
         * the targeted processor.
@@ -1276,10 +1254,10 @@ int __cpu_disable(void)
        mdelay(1);
        local_irq_disable();
 
-       cpufreq_del_cpu(cpu);
-
        time_suspend();
 
+       cpu_mcheck_disable();
+
        remove_siblinginfo(cpu);
 
        cpu_clear(cpu, map);
@@ -1295,19 +1273,20 @@ int __cpu_disable(void)
 void __cpu_die(unsigned int cpu)
 {
        /* We don't do anything here: idle task is faking death itself. */
-       unsigned int i;
+       unsigned int i = 0;
 
-       for (i = 0; i < 10; i++) {
+       for (;;) {
                /* They ack this in play_dead by setting CPU_DEAD */
                if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
-                       printk ("CPU %d is now offline\n", cpu);
+                       printk ("CPU %u is now offline\n", cpu);
                        return;
                }
                mdelay(100);
                mb();
                process_pending_timers();
+               if ((++i % 10) == 0)
+                       printk(KERN_ERR "CPU %u still not dead...\n", cpu);
        }
-       printk(KERN_ERR "CPU %u didn't die...\n", cpu);
 }
 
 static int take_cpu_down(void *unused)
@@ -1338,16 +1317,18 @@ int cpu_down(unsigned int cpu)
 
        printk("Prepare to bring CPU%d down...\n", cpu);
 
+       cpufreq_del_cpu(cpu);
+
        err = stop_machine_run(take_cpu_down, NULL, cpu);
-       if ( err < 0 )
+       if (err < 0)
                goto out;
 
        __cpu_die(cpu);
 
-       if (cpu_online(cpu)) {
-               printk("Bad state (DEAD, but in online map) on CPU%d\n", cpu);
-               err = -EBUSY;
-       }
+       BUG_ON(cpu_online(cpu));
+
+       cpu_mcheck_distribute_cmci();
+
 out:
        spin_unlock(&cpu_add_remove_lock);
        return err;
@@ -1445,8 +1426,10 @@ int __devinit __cpu_up(unsigned int cpu)
         * cpu_callin_map is set during AP kickstart process. Its reset
         * when a cpu is taken offline from cpu_exit_clear().
         */
-       if (!cpu_isset(cpu, cpu_callin_map))
+       if (!cpu_isset(cpu, cpu_callin_map)) {
                ret = __smp_prepare_cpu(cpu);
+               smpboot_restore_warm_reset_vector();
+       }
 
        if (ret)
                return -EIO;
index dbb54ede0985f2a8781ebd1d3aeb073b5c8e736f..faf3f5157c71bdd4d9a8ae86e510246862f9f175 100644 (file)
@@ -38,7 +38,7 @@ static long cpu_down_helper(void *data)
 long arch_do_sysctl(
     struct xen_sysctl *sysctl, XEN_GUEST_HANDLE(xen_sysctl_t) u_sysctl)
 {
-    long ret = 0;
+    long ret = 0, status;
 
     switch ( sysctl->cmd )
     {
@@ -102,19 +102,41 @@ long arch_do_sysctl(
     {
         unsigned int cpu = sysctl->u.cpu_hotplug.cpu;
 
+        if (cpu_present(cpu)) {
+            status = cpu_online(cpu) ? XEN_CPU_HOTPLUG_STATUS_ONLINE :
+                XEN_CPU_HOTPLUG_STATUS_OFFLINE;
+        } else {
+            status = -EINVAL;
+        }
+
         switch ( sysctl->u.cpu_hotplug.op )
         {
         case XEN_SYSCTL_CPU_HOTPLUG_ONLINE:
             ret = cpu_up(cpu);
+            /*
+             * In the case of a true hotplug, this CPU wasn't present
+             * before, so return the 'new' status for it.
+             */
+            if (ret == 0 && status == -EINVAL)
+                status = XEN_CPU_HOTPLUG_STATUS_NEW;
             break;
         case XEN_SYSCTL_CPU_HOTPLUG_OFFLINE:
             ret = continue_hypercall_on_cpu(
                 0, cpu_down_helper, (void *)(unsigned long)cpu);
             break;
+        case XEN_SYSCTL_CPU_HOTPLUG_STATUS:
+            ret = 0;
+            break;
         default:
             ret = -EINVAL;
             break;
         }
+
+        /*
+         * If the operation was successful, return the old status.
+         */
+        if (ret >= 0)
+            ret = status;
     }
     break;
 
index ec4aa9436d5f4125e93aaa493a7fd17586d8fcb1..e259cd5da1ecea538480fc5fde0fe094762e8c80 100644 (file)
@@ -3,10 +3,14 @@
 #include <xen/types.h>
 #include <xen/lib.h>
 #include <xen/sched.h>
+#include <xen/domain_page.h>
+#include <xen/iommu.h>
 #include <asm/fixmap.h>
 #include <asm/page.h>
 #include <asm/processor.h>
+#include <asm/e820.h>
 #include <asm/tboot.h>
+#include <crypto/vmac.h>
 
 /* tboot=<physical address of shared page> */
 static char opt_tboot[20] = "";
@@ -15,12 +19,63 @@ string_param("tboot", opt_tboot);
 /* Global pointer to shared data; NULL means no measured launch. */
 tboot_shared_t *g_tboot_shared;
 
+static vmac_t domain_mac;     /* MAC for all domains during S3 */
+static vmac_t xenheap_mac;    /* MAC for xen heap during S3 */
+static vmac_t frametable_mac; /* MAC for frame table during S3 */
+
 static const uuid_t tboot_shared_uuid = TBOOT_SHARED_UUID;
 
+/* used by tboot_protect_mem_regions() and/or tboot_parse_dmar_table() */
+static uint64_t txt_heap_base, txt_heap_size;
+static uint64_t sinit_base, sinit_size;
+
+/*
+ * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE)
+ */
+
+#define TXT_PUB_CONFIG_REGS_BASE       0xfed30000
+#define TXT_PRIV_CONFIG_REGS_BASE      0xfed20000
+
+/* # pages for each config regs space - used by fixmap */
+#define NR_TXT_CONFIG_PAGES     ((TXT_PUB_CONFIG_REGS_BASE -                \
+                                  TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT)
+
+/* offsets from pub/priv config space */
+#define TXTCR_SINIT_BASE            0x0270
+#define TXTCR_SINIT_SIZE            0x0278
+#define TXTCR_HEAP_BASE             0x0300
+#define TXTCR_HEAP_SIZE             0x0308
+
+extern char __init_begin[], __per_cpu_start[], __per_cpu_end[], __bss_start[];
+extern unsigned long allocator_bitmap_end;
+
+#define SHA1_SIZE      20
+typedef uint8_t   sha1_hash_t[SHA1_SIZE];
+
+typedef struct __packed {
+    uint32_t     version;             /* currently 6 */
+    sha1_hash_t  bios_acm_id;
+    uint32_t     edx_senter_flags;
+    uint64_t     mseg_valid;
+    sha1_hash_t  sinit_hash;
+    sha1_hash_t  mle_hash;
+    sha1_hash_t  stm_hash;
+    sha1_hash_t  lcp_policy_hash;
+    uint32_t     lcp_policy_control;
+    uint32_t     rlp_wakeup_addr;
+    uint32_t     reserved;
+    uint32_t     num_mdrs;
+    uint32_t     mdrs_off;
+    uint32_t     num_vtd_dmars;
+    uint32_t     vtd_dmars_off;
+} sinit_mle_data_t;
+
 void __init tboot_probe(void)
 {
     tboot_shared_t *tboot_shared;
     unsigned long p_tboot_shared;
+    uint32_t map_base, map_size;
+    unsigned long map_addr;
 
     /* Look for valid page-aligned address for shared page. */
     p_tboot_shared = simple_strtoul(opt_tboot, NULL, 0);
@@ -30,24 +85,186 @@ void __init tboot_probe(void)
     /* Map and check for tboot UUID. */
     set_fixmap(FIX_TBOOT_SHARED_BASE, p_tboot_shared);
     tboot_shared = (tboot_shared_t *)fix_to_virt(FIX_TBOOT_SHARED_BASE);
+    if ( tboot_shared == NULL )
+        return;
     if ( memcmp(&tboot_shared_uuid, (uuid_t *)tboot_shared, sizeof(uuid_t)) )
         return;
 
+    /* new tboot_shared (w/ GAS support, integrity, etc.) is not backwards
+       compatible */
+    if ( tboot_shared->version < 4 ) {
+        printk("unsupported version of tboot (%u)\n", tboot_shared->version);
+        return;
+    }
+
     g_tboot_shared = tboot_shared;
     printk("TBOOT: found shared page at phys addr %lx:\n", p_tboot_shared);
     printk("  version: %d\n", tboot_shared->version);
     printk("  log_addr: 0x%08x\n", tboot_shared->log_addr);
-    printk("  shutdown_entry32: 0x%08x\n", tboot_shared->shutdown_entry32);
-    printk("  shutdown_entry64: 0x%08x\n", tboot_shared->shutdown_entry64);
-    printk("  shutdown_type: %d\n", tboot_shared->shutdown_type);
-    printk("  s3_tb_wakeup_entry: 0x%08x\n", tboot_shared->s3_tb_wakeup_entry);
-    printk("  s3_k_wakeup_entry: 0x%08x\n", tboot_shared->s3_k_wakeup_entry);
-    printk("  &acpi_sinfo: 0x%p\n", &tboot_shared->acpi_sinfo);
-    if ( tboot_shared->version >= 0x02 )
+    printk("  shutdown_entry: 0x%08x\n", tboot_shared->shutdown_entry);
+    printk("  tboot_base: 0x%08x\n", tboot_shared->tboot_base);
+    printk("  tboot_size: 0x%x\n", tboot_shared->tboot_size);
+
+    /* these will be needed by tboot_protect_mem_regions() and/or
+       tboot_parse_dmar_table(), so get them now */
+
+    map_base = PFN_DOWN(TXT_PUB_CONFIG_REGS_BASE);
+    map_size = PFN_UP(NR_TXT_CONFIG_PAGES * PAGE_SIZE);
+    map_addr = (unsigned long)__va(map_base << PAGE_SHIFT);
+    if ( map_pages_to_xen(map_addr, map_base, map_size, __PAGE_HYPERVISOR) )
+        return;
+
+    /* TXT Heap */
+    txt_heap_base =
+        *(uint64_t *)__va(TXT_PUB_CONFIG_REGS_BASE + TXTCR_HEAP_BASE);
+    txt_heap_size =
+        *(uint64_t *)__va(TXT_PUB_CONFIG_REGS_BASE + TXTCR_HEAP_SIZE);
+
+    /* SINIT */
+    sinit_base =
+        *(uint64_t *)__va(TXT_PUB_CONFIG_REGS_BASE + TXTCR_SINIT_BASE);
+    sinit_size =
+        *(uint64_t *)__va(TXT_PUB_CONFIG_REGS_BASE + TXTCR_SINIT_SIZE);
+
+    destroy_xen_mappings((unsigned long)__va(map_base << PAGE_SHIFT),
+                         (unsigned long)__va((map_base + map_size) << PAGE_SHIFT));
+}
+
+/* definitions from xen/drivers/passthrough/vtd/iommu.h
+ * used to walk through vtd page tables */
+#define LEVEL_STRIDE (9)
+#define PTE_NUM (1<<LEVEL_STRIDE)
+#define dma_pte_present(p) (((p).val & 3) != 0)
+#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K)
+#define agaw_to_level(val) ((val)+2)
+struct dma_pte {
+    u64 val;
+};
+
+static void update_iommu_mac(vmac_ctx_t *ctx, uint64_t pt_maddr, int level)
+{
+    int i;
+    struct dma_pte *pt_vaddr, *pte;
+    int next_level = level - 1;
+
+    if ( pt_maddr == 0 )
+        return;
+
+    pt_vaddr = (struct dma_pte *)map_domain_page(pt_maddr >> PAGE_SHIFT_4K);
+    vmac_update((void *)pt_vaddr, PAGE_SIZE, ctx);
+
+    for ( i = 0; i < PTE_NUM; i++ )
+    {
+        pte = &pt_vaddr[i];
+        if ( !dma_pte_present(*pte) )
+            continue;
+
+        if ( next_level >= 1 )
+            update_iommu_mac(ctx, dma_pte_addr(*pte), next_level);
+    }
+
+    unmap_domain_page(pt_vaddr);
+}
+
+#define is_page_in_use(page) \
+    ((page->count_info & PGC_count_mask) != 0 || page->count_info == 0)
+
+static void update_pagetable_mac(vmac_ctx_t *ctx)
+{
+    unsigned long mfn;
+
+    for ( mfn = 0; mfn < max_page; mfn++ )
+    {
+        struct page_info *page = mfn_to_page(mfn);
+        if ( is_page_in_use(page) && !is_xen_heap_page(page) ) {
+            if ( page->count_info & PGC_page_table ) {
+                void *pg = map_domain_page(mfn);
+                vmac_update(pg, PAGE_SIZE, ctx);
+                unmap_domain_page(pg);
+            }
+        }
+    }
+}
+static void tboot_gen_domain_integrity(const uint8_t key[TB_KEY_SIZE],
+                                       vmac_t *mac)
+{
+    struct domain *d;
+    struct page_info *page;
+    uint8_t nonce[16] = {};
+    vmac_ctx_t ctx;
+
+    vmac_set_key((uint8_t *)key, &ctx);
+    for_each_domain( d )
+    {
+        if ( !d->arch.s3_integrity )
+            continue;
+        printk("MACing Domain %u\n", d->domain_id);
+
+        page_list_for_each(page, &d->page_list)
+        {
+            void *pg;
+            pg = map_domain_page(page_to_mfn(page));
+            vmac_update(pg, PAGE_SIZE, &ctx);
+            unmap_domain_page(pg);
+        }
+
+        if ( !is_idle_domain(d) )
+        {
+            struct hvm_iommu *hd = domain_hvm_iommu(d);
+            update_iommu_mac(&ctx, hd->pgd_maddr, agaw_to_level(hd->agaw));
+        }
+    }
+
+    /* MAC all shadow page tables */
+    update_pagetable_mac(&ctx);
+
+    *mac = vmac(NULL, 0, nonce, NULL, &ctx);
+
+    printk("MAC for domains is: 0x%08"PRIx64"\n", *mac);
+
+    /* wipe ctx to ensure key is not left in memory */
+    memset(&ctx, 0, sizeof(ctx));
+}
+
+static void tboot_gen_xenheap_integrity(const uint8_t key[TB_KEY_SIZE],
+                                        vmac_t *mac)
+{
+    unsigned long mfn;
+    uint8_t nonce[16] = {};
+    vmac_ctx_t ctx;
+
+    vmac_set_key((uint8_t *)key, &ctx);
+    for ( mfn = 0; mfn < max_page; mfn++ )
     {
-        printk("  tboot_base: 0x%08x\n", tboot_shared->tboot_base);
-        printk("  tboot_size: 0x%x\n", tboot_shared->tboot_size);
+        struct page_info *page = __mfn_to_page(mfn);
+        if ( is_page_in_use(page) && is_xen_heap_page(page) ) {
+            void *pg = mfn_to_virt(mfn);
+            vmac_update((uint8_t *)pg, PAGE_SIZE, &ctx);
+        }
     }
+    *mac = vmac(NULL, 0, nonce, NULL, &ctx);
+
+    printk("MAC for xenheap is: 0x%08"PRIx64"\n", *mac);
+
+    /* wipe ctx to ensure key is not left in memory */
+    memset(&ctx, 0, sizeof(ctx));
+}
+
+static void tboot_gen_frametable_integrity(const uint8_t key[TB_KEY_SIZE],
+                                           vmac_t *mac)
+{
+    uint8_t nonce[16] = {};
+    vmac_ctx_t ctx;
+
+    vmac_set_key((uint8_t *)key, &ctx);
+    *mac = vmac((uint8_t *)frame_table,
+                PFN_UP(max_page * sizeof(*frame_table)), nonce, NULL, &ctx);
+
+    printk("MAC for frametable is: 0x%08"PRIx64"\n", *mac);
+
+    /* wipe ctx to ensure key is not left in memory */
+    memset(&ctx, 0, sizeof(ctx));
 }
 
 void tboot_shutdown(uint32_t shutdown_type)
@@ -59,34 +276,63 @@ void tboot_shutdown(uint32_t shutdown_type)
 
     local_irq_disable();
 
+    /* we may be called from an interrupt context, so to prevent */
+    /* 'ASSERT(!in_irq());' in alloc_domheap_pages(), decrease count */
+    while ( in_irq() )
+        irq_exit();
+
     /* Create identity map for tboot shutdown code. */
-    if ( g_tboot_shared->version >= 0x02 )
-    {
-        map_base = PFN_DOWN(g_tboot_shared->tboot_base);
-        map_size = PFN_UP(g_tboot_shared->tboot_size);
-    }
-    else
-    {
-        map_base = 0;
-        map_size = PFN_UP(0xa0000);
-    }
+    /* do before S3 integrity because mapping tboot may change xenheap */
+    map_base = PFN_DOWN(g_tboot_shared->tboot_base);
+    map_size = PFN_UP(g_tboot_shared->tboot_size);
 
     err = map_pages_to_xen(map_base << PAGE_SHIFT, map_base, map_size,
                            __PAGE_HYPERVISOR);
-    if ( err != 0 )
-    {
+    if ( err != 0 ) {
         printk("error (0x%x) mapping tboot pages (mfns) @ 0x%x, 0x%x\n", err,
                map_base, map_size);
         return;
     }
 
+    /* if this is S3 then set regions to MAC */
+    if ( shutdown_type == TB_SHUTDOWN_S3 ) {
+        /*
+         * Xen regions for tboot to MAC
+         */
+        g_tboot_shared->num_mac_regions = 5;
+        /* S3 resume code (and other real mode trampoline code) */
+        g_tboot_shared->mac_regions[0].start = bootsym_phys(trampoline_start);
+        g_tboot_shared->mac_regions[0].size = bootsym_phys(trampoline_end) -
+                                              bootsym_phys(trampoline_start);
+        /* hypervisor code + data */
+        g_tboot_shared->mac_regions[1].start = (uint64_t)__pa(&_stext);
+        g_tboot_shared->mac_regions[1].size = __pa(&__init_begin) -
+                                              __pa(&_stext);
+        /* per-cpu data */
+        g_tboot_shared->mac_regions[2].start = (uint64_t)__pa(&__per_cpu_start);
+        g_tboot_shared->mac_regions[2].size = __pa(&__per_cpu_end) -
+                                              __pa(&__per_cpu_start);
+        /* bss */
+        g_tboot_shared->mac_regions[3].start = (uint64_t)__pa(&__bss_start);
+        g_tboot_shared->mac_regions[3].size = __pa(&_end) - __pa(&__bss_start);
+        /* boot allocator bitmap */
+        g_tboot_shared->mac_regions[4].start = (uint64_t)__pa(&_end);
+        g_tboot_shared->mac_regions[4].size = allocator_bitmap_end -
+                                              __pa(&_end);
+
+        /*
+         * MAC domains and other Xen memory
+         */
+        /* Xen has no better entropy source for MAC key than tboot's */
+        /* MAC domains first in case it perturbs xenheap */
+        tboot_gen_domain_integrity(g_tboot_shared->s3_key, &domain_mac);
+        tboot_gen_frametable_integrity(g_tboot_shared->s3_key, &frametable_mac);
+        tboot_gen_xenheap_integrity(g_tboot_shared->s3_key, &xenheap_mac);
+    }
+
     write_ptbase(idle_vcpu[0]);
 
-#ifdef __x86_64__
-    asm volatile ( "call *%%rdi" :: "D" (g_tboot_shared->shutdown_entry64) );
-#else
-    asm volatile ( "call *%0" :: "r" (g_tboot_shared->shutdown_entry32) );
-#endif
+    ((void(*)(void))(unsigned long)g_tboot_shared->shutdown_entry)();
 
     BUG(); /* should not reach here */
 }
@@ -96,16 +342,117 @@ int tboot_in_measured_env(void)
     return (g_tboot_shared != NULL);
 }
 
-int tboot_in_range(paddr_t start, paddr_t end)
+int __init tboot_protect_mem_regions(void)
 {
-    if ( g_tboot_shared == NULL || g_tboot_shared->version < 0x02 )
+    int rc;
+
+    if ( !tboot_in_measured_env() )
+        return 1;
+
+    /* TXT Heap */
+    if ( txt_heap_base == 0 )
+        return 0;
+    rc = e820_change_range_type(
+        &e820, txt_heap_base, txt_heap_base + txt_heap_size,
+        E820_RESERVED, E820_UNUSABLE);
+    if ( !rc )
         return 0;
 
-    start = max_t(paddr_t, start, g_tboot_shared->tboot_base);
-    end = min_t(paddr_t, end, 
-                g_tboot_shared->tboot_base + g_tboot_shared->tboot_size);
-    return start < end; 
+    /* SINIT */
+    if ( sinit_base == 0 )
+        return 0;
+    rc = e820_change_range_type(
+        &e820, sinit_base, sinit_base + sinit_size,
+        E820_RESERVED, E820_UNUSABLE);
+    if ( !rc )
+        return 0;
+
+    /* TXT Private Space */
+    rc = e820_change_range_type(
+        &e820, TXT_PRIV_CONFIG_REGS_BASE,
+        TXT_PRIV_CONFIG_REGS_BASE + NR_TXT_CONFIG_PAGES * PAGE_SIZE,
+        E820_RESERVED, E820_UNUSABLE);
+    if ( !rc )
+        return 0;
+
+    return 1;
+}
+
+int __init tboot_parse_dmar_table(acpi_table_handler dmar_handler)
+{
+    uint32_t map_base, map_size;
+    unsigned long map_vaddr;
+    void *heap_ptr;
+    struct acpi_table_header *dmar_table;
+    int rc;
+
+    if ( !tboot_in_measured_env() )
+        return acpi_table_parse(ACPI_SIG_DMAR, dmar_handler);
+
+    /* ACPI tables may not be DMA protected by tboot, so use DMAR copy */
+    /* SINIT saved in SinitMleData in TXT heap (which is DMA protected) */
+
+    if ( txt_heap_base == 0 )
+        return 1;
+
+    /* map TXT heap into Xen addr space */
+    map_base = PFN_DOWN(txt_heap_base);
+    map_size = PFN_UP(txt_heap_size);
+    map_vaddr = (unsigned long)__va(map_base << PAGE_SHIFT);
+    if ( map_pages_to_xen(map_vaddr, map_base, map_size, __PAGE_HYPERVISOR) )
+        return 1;
+
+    /* walk heap to SinitMleData */
+    heap_ptr = __va(txt_heap_base);
+    /* skip BiosData */
+    heap_ptr += *(uint64_t *)heap_ptr;
+    /* skip OsMleData */
+    heap_ptr += *(uint64_t *)heap_ptr;
+    /* skip OsSinitData */
+    heap_ptr += *(uint64_t *)heap_ptr;
+    /* now points to SinitMleDataSize; set to SinitMleData */
+    heap_ptr += sizeof(uint64_t);
+    /* get addr of DMAR table */
+    dmar_table = (struct acpi_table_header *)(heap_ptr +
+            ((sinit_mle_data_t *)heap_ptr)->vtd_dmars_off - sizeof(uint64_t));
+
+    rc = dmar_handler(dmar_table);
+
+    destroy_xen_mappings(
+        (unsigned long)__va(map_base << PAGE_SHIFT),
+        (unsigned long)__va((map_base + map_size) << PAGE_SHIFT));
+  
+    /* acpi_parse_dmar() zaps ACPI DMAR signature in TXT heap table */
+    /* but dom0 will read real table, so must zap it there too */
+    dmar_table = NULL;
+    acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_table);
+    if ( dmar_table != NULL )
+        ((struct acpi_table_dmar *)dmar_table)->header.signature[0] = '\0';
+
+    return rc;
+}
+
+int tboot_s3_resume(void)
+{
+    vmac_t mac;
+
+    if ( !tboot_in_measured_env() )
+        return 0;
+
+    /* need to do these in reverse order of shutdown */
+    tboot_gen_xenheap_integrity(g_tboot_shared->s3_key, &mac);
+    if ( mac != xenheap_mac )
+        return -1;
+
+    tboot_gen_frametable_integrity(g_tboot_shared->s3_key, &mac);
+    if ( mac != frametable_mac )
+        return -2;
+
+    tboot_gen_domain_integrity(g_tboot_shared->s3_key, &mac);
+    if ( mac != domain_mac )
+        return -3;
+
+    return 0;
 }
 
 /*
index 2459e9738cbe27d7bee9cee1197513a2cc0d8cd2..223b44bbf961574509fcb0420ac1960ceccc8136 100644 (file)
@@ -48,17 +48,18 @@ struct time_scale {
 
 struct cpu_time {
     u64 local_tsc_stamp;
-    u64 cstate_tsc_stamp;
     s_time_t stime_local_stamp;
     s_time_t stime_master_stamp;
     struct time_scale tsc_scale;
-    u64 cstate_plt_count_stamp;
 };
 
 struct platform_timesource {
+    char *id;
     char *name;
     u64 frequency;
     u64 (*read_counter)(void);
+    int (*init)(struct platform_timesource *);
+    void (*resume)(struct platform_timesource *);
     int counter_bits;
 };
 
@@ -68,9 +69,6 @@ static DEFINE_PER_CPU(struct cpu_time, cpu_time);
 #define EPOCH MILLISECS(1000)
 static struct timer calibration_timer;
 
-/* TSC is invariant on C state entry? */
-static bool_t tsc_invariant;
-
 /*
  * We simulate a 32-bit platform timer from the 16-bit PIT ch2 counter.
  * Otherwise overflow happens too quickly (~50ms) for us to guarantee that
@@ -148,6 +146,28 @@ static inline u64 scale_delta(u64 delta, struct time_scale *scale)
     return product;
 }
 
+/* Compute the reciprocal of the given time_scale. */
+static inline struct time_scale scale_reciprocal(struct time_scale scale)
+{
+    struct time_scale reciprocal;
+    u32 dividend;
+
+    dividend = 0x80000000u;
+    reciprocal.shift = 1 - scale.shift;
+    while ( unlikely(dividend >= scale.mul_frac) )
+    {
+        dividend >>= 1;
+        reciprocal.shift++;
+    }
+
+    asm (
+        "divl %4"
+        : "=a" (reciprocal.mul_frac), "=d" (dividend)
+        : "0" (0), "1" (dividend), "r" (scale.mul_frac) );
+
+    return reciprocal;
+}
+
 /*
  * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
  * IPIs in place of local APIC timers
@@ -360,15 +380,22 @@ static u64 read_pit_count(void)
     return count32;
 }
 
-static void init_pit(struct platform_timesource *pts)
+static int init_pit(struct platform_timesource *pts)
 {
-    pts->name = "PIT";
-    pts->frequency = CLOCK_TICK_RATE;
-    pts->read_counter = read_pit_count;
-    pts->counter_bits = 32;
     using_pit = 1;
+    return 1;
 }
 
+static struct platform_timesource plt_pit =
+{
+    .id = "pit",
+    .name = "PIT",
+    .frequency = CLOCK_TICK_RATE,
+    .read_counter = read_pit_count,
+    .counter_bits = 32,
+    .init = init_pit
+};
+
 /************************************************************
  * PLATFORM TIMER 2: HIGH PRECISION EVENT TIMER (HPET)
  */
@@ -385,14 +412,28 @@ static int init_hpet(struct platform_timesource *pts)
     if ( hpet_rate == 0 )
         return 0;
 
-    pts->name = "HPET";
     pts->frequency = hpet_rate;
-    pts->read_counter = read_hpet_count;
-    pts->counter_bits = 32;
-
     return 1;
 }
 
+static void resume_hpet(struct platform_timesource *pts)
+{
+    u64 hpet_rate = hpet_setup();
+
+    BUG_ON(hpet_rate == 0);
+    pts->frequency = hpet_rate;
+}
+
+static struct platform_timesource plt_hpet =
+{
+    .id = "hpet",
+    .name = "HPET",
+    .read_counter = read_hpet_count,
+    .counter_bits = 32,
+    .init = init_hpet,
+    .resume = resume_hpet
+};
+
 /************************************************************
  * PLATFORM TIMER 3: IBM 'CYCLONE' TIMER
  */
@@ -440,20 +481,24 @@ static int init_cyclone(struct platform_timesource *pts)
         printk(KERN_ERR "Cyclone: Could not find valid CBAR value.\n");
         return 0;
     }
+
     /* Enable timer and map the counter register. */
     *(map_cyclone_reg(base + CYCLONE_PMCC_OFFSET)) = 1;
     *(map_cyclone_reg(base + CYCLONE_MPCS_OFFSET)) = 1;
     cyclone_timer = map_cyclone_reg(base + CYCLONE_MPMC_OFFSET);
-
-    pts->name = "IBM Cyclone";
-    pts->frequency = CYCLONE_TIMER_FREQ;
-    pts->read_counter = read_cyclone_count;
-    pts->counter_bits = 32;
-
     return 1;
 }
 
+static struct platform_timesource plt_cyclone =
+{
+    .id = "cyclone",
+    .name = "IBM Cyclone",
+    .frequency = CYCLONE_TIMER_FREQ,
+    .read_counter = read_cyclone_count,
+    .counter_bits = 32,
+    .init = init_cyclone
+};
+
 /************************************************************
  * PLATFORM TIMER 4: ACPI PM TIMER
  */
@@ -473,14 +518,39 @@ static int init_pmtimer(struct platform_timesource *pts)
     if ( pmtmr_ioport == 0 )
         return 0;
 
-    pts->name = "ACPI PM Timer";
-    pts->frequency = ACPI_PM_FREQUENCY;
-    pts->read_counter = read_pmtimer_count;
-    pts->counter_bits = 24;
-
     return 1;
 }
 
+static struct platform_timesource plt_pmtimer =
+{
+    .id = "acpi",
+    .name = "ACPI PM Timer",
+    .frequency = ACPI_PM_FREQUENCY,
+    .read_counter = read_pmtimer_count,
+    .counter_bits = 24,
+    .init = init_pmtimer
+};
+
+static struct time_scale pmt_scale;
+static struct time_scale pmt_scale_r;
+static __init int init_pmtmr_scale(void)
+{
+    set_time_scale(&pmt_scale, ACPI_PM_FREQUENCY);
+    pmt_scale_r = scale_reciprocal(pmt_scale);
+    return 0;
+}
+__initcall(init_pmtmr_scale);
+
+uint64_t acpi_pm_tick_to_ns(uint64_t ticks)
+{
+    return scale_delta(ticks, &pmt_scale);
+}
+
+uint64_t ns_to_acpi_pm_tick(uint64_t ns)
+{
+    return scale_delta(ns, &pmt_scale_r);
+}
+
 /************************************************************
  * GENERIC PLATFORM TIMER INFRASTRUCTURE
  */
@@ -537,37 +607,46 @@ static void platform_time_calibration(void)
 {
     u64 count;
     s_time_t stamp;
+    unsigned long flags;
 
-    spin_lock_irq(&platform_timer_lock);
+    spin_lock_irqsave(&platform_timer_lock, flags);
     count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
     stamp = __read_platform_stime(count);
     stime_platform_stamp = stamp;
     platform_timer_stamp = count;
-    spin_unlock_irq(&platform_timer_lock);
+    spin_unlock_irqrestore(&platform_timer_lock, flags);
 }
 
 static void resume_platform_timer(void)
 {
-    /* No change in platform_stime across suspend/resume. */
-    platform_timer_stamp = plt_stamp64;
+    /* Timer source can be reset when coming back from S3 to S0 */
+    if ( plt_src.resume )
+        plt_src.resume(&plt_src);
+
+    plt_stamp64 = platform_timer_stamp;
     plt_stamp = plt_src.read_counter();
 }
 
 static void init_platform_timer(void)
 {
-    struct platform_timesource *pts = &plt_src;
-    int rc = -1;
+    static struct platform_timesource * const plt_timers[] = {
+        &plt_cyclone, &plt_hpet, &plt_pmtimer, &plt_pit
+    };
+
+    struct platform_timesource *pts = NULL;
+    int i, rc = -1;
 
     if ( opt_clocksource[0] != '\0' )
     {
-        if ( !strcmp(opt_clocksource, "pit") )
-            rc = (init_pit(pts), 1);
-        else if ( !strcmp(opt_clocksource, "hpet") )
-            rc = init_hpet(pts);
-        else if ( !strcmp(opt_clocksource, "cyclone") )
-            rc = init_cyclone(pts);
-        else if ( !strcmp(opt_clocksource, "acpi") )
-            rc = init_pmtimer(pts);
+        for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
+        {
+            pts = plt_timers[i];
+            if ( !strcmp(opt_clocksource, pts->id) )
+            {
+                rc = pts->init(pts);
+                break;
+            }
+        }
 
         if ( rc <= 0 )
             printk("WARNING: %s clocksource '%s'.\n",
@@ -575,11 +654,17 @@ static void init_platform_timer(void)
                    opt_clocksource);
     }
 
-    if ( (rc <= 0) &&
-         !init_cyclone(pts) &&
-         !init_hpet(pts) &&
-         !init_pmtimer(pts) )
-        init_pit(pts);
+    if ( rc <= 0 )
+    {
+        for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
+        {
+            pts = plt_timers[i];
+            if ( (rc = pts->init(pts)) > 0 )
+                break;
+        }
+    }
+
+    BUG_ON(rc <= 0);
 
     plt_mask = (u64)~0ull >> (64 - pts->counter_bits);
 
@@ -588,37 +673,33 @@ static void init_platform_timer(void)
     plt_overflow_period = scale_delta(
         1ull << (pts->counter_bits-1), &plt_scale);
     init_timer(&plt_overflow_timer, plt_overflow, NULL, 0);
+    plt_src = *pts;
     plt_overflow(NULL);
 
     platform_timer_stamp = plt_stamp64;
+    stime_platform_stamp = NOW();
 
     printk("Platform timer is %s %s\n",
            freq_string(pts->frequency), pts->name);
 }
 
-void cstate_save_tsc(void)
+void cstate_restore_tsc(void)
 {
     struct cpu_time *t = &this_cpu(cpu_time);
+    struct time_scale sys_to_tsc = scale_reciprocal(t->tsc_scale);
+    s_time_t stime_delta;
+    u64 tsc_delta;
 
-    if ( tsc_invariant )
+    if ( boot_cpu_has(X86_FEATURE_NOSTOP_TSC) )
         return;
 
-    t->cstate_plt_count_stamp = plt_src.read_counter();
-    rdtscll(t->cstate_tsc_stamp);
-}
-
-void cstate_restore_tsc(void)
-{
-    struct cpu_time *t = &this_cpu(cpu_time);
-    u64 plt_count_delta, tsc_delta;
+    stime_delta = read_platform_stime() - t->stime_master_stamp;
+    if ( stime_delta < 0 )
+        stime_delta = 0;
 
-    if ( tsc_invariant )
-        return;
+    tsc_delta = scale_delta(stime_delta, &sys_to_tsc);
 
-    plt_count_delta = (plt_src.read_counter() -
-                       t->cstate_plt_count_stamp) & plt_mask;
-    tsc_delta = scale_delta(plt_count_delta, &plt_scale) * cpu_khz/1000000UL;
-    wrmsrl(MSR_IA32_TSC, t->cstate_tsc_stamp + tsc_delta);
+    wrmsrl(MSR_IA32_TSC, t->local_tsc_stamp + tsc_delta);
 }
 
 /***************************************************************************
@@ -878,6 +959,18 @@ static void local_time_calibration(void)
     /* The overall calibration scale multiplier. */
     u32 calibration_mul_frac;
 
+    if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+    {
+        /* Atomically read cpu_calibration struct and write cpu_time struct. */
+        local_irq_disable();
+        t->local_tsc_stamp    = c->local_tsc_stamp;
+        t->stime_local_stamp  = c->stime_master_stamp;
+        t->stime_master_stamp = c->stime_master_stamp;
+        local_irq_enable();
+        update_vcpu_system_time(current);
+        goto out;
+    }
+
     prev_tsc          = t->local_tsc_stamp;
     prev_local_stime  = t->stime_local_stamp;
     prev_master_stime = t->stime_master_stamp;
@@ -994,30 +1087,66 @@ static void local_time_calibration(void)
  */
 struct calibration_rendezvous {
     cpumask_t cpu_calibration_map;
-    atomic_t nr_cpus;
+    atomic_t count_start;
+    atomic_t count_end;
     s_time_t master_stime;
+    u64 master_tsc_stamp;
 };
 
+#define NR_LOOPS 5
+
 static void time_calibration_rendezvous(void *_r)
 {
+    int i;
     struct cpu_calibration *c = &this_cpu(cpu_calibration);
     struct calibration_rendezvous *r = _r;
     unsigned int total_cpus = cpus_weight(r->cpu_calibration_map);
 
-    if ( smp_processor_id() == 0 )
-    {
-        while ( atomic_read(&r->nr_cpus) != (total_cpus - 1) )
-            cpu_relax();
-        r->master_stime = read_platform_stime();
-        mb(); /* write r->master_stime /then/ signal */
-        atomic_inc(&r->nr_cpus);
-    }
-    else
+    /* 
+     * Loop is used here to get rid of the cache's side effect to enlarge
+     * the TSC difference among CPUs.
+     */
+    for ( i = 0; i < NR_LOOPS; i++ )
     {
-        atomic_inc(&r->nr_cpus);
-        while ( atomic_read(&r->nr_cpus) != total_cpus )
-            cpu_relax();
-        mb(); /* receive signal /then/ read r->master_stime */
+        if ( smp_processor_id() == 0 )
+        {
+            while ( atomic_read(&r->count_start) != (total_cpus - 1) )
+                mb();
+   
+            if ( r->master_stime == 0 )
+            {
+                r->master_stime = read_platform_stime();
+                if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+                    rdtscll(r->master_tsc_stamp);
+            }
+            atomic_set(&r->count_end, 0);
+            wmb();
+            atomic_inc(&r->count_start);
+    
+            if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && 
+                 i == NR_LOOPS - 1 )
+                write_tsc((u32)r->master_tsc_stamp, (u32)(r->master_tsc_stamp >> 32));
+    
+            while (atomic_read(&r->count_end) != total_cpus - 1)
+                mb();
+            atomic_set(&r->count_start, 0);
+            wmb();
+            atomic_inc(&r->count_end);
+        }
+        else
+        {
+            atomic_inc(&r->count_start);
+            while ( atomic_read(&r->count_start) != total_cpus )
+                mb();
+    
+            if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && 
+                 i == NR_LOOPS - 1 )
+                write_tsc((u32)r->master_tsc_stamp, (u32)(r->master_tsc_stamp >> 32));
+    
+            atomic_inc(&r->count_end);
+            while (atomic_read(&r->count_end) != total_cpus)
+                mb();
+        }
     }
 
     rdtscll(c->local_tsc_stamp);
@@ -1031,7 +1160,9 @@ static void time_calibration(void *unused)
 {
     struct calibration_rendezvous r = {
         .cpu_calibration_map = cpu_online_map,
-        .nr_cpus = ATOMIC_INIT(0)
+        .count_start = ATOMIC_INIT(0),
+        .count_end = ATOMIC_INIT(0),
+        .master_stime = 0
     };
 
     /* @wait=1 because we must wait for all cpus before freeing @r. */
@@ -1047,7 +1178,7 @@ void init_percpu_time(void)
 
     local_irq_save(flags);
     rdtscll(t->local_tsc_stamp);
-    now = !plt_src.read_counter ? 0 : read_platform_stime();
+    now = read_platform_stime();
     local_irq_restore(flags);
 
     t->stime_master_stamp = now;
@@ -1063,23 +1194,25 @@ void init_percpu_time(void)
 /* Late init function (after all CPUs are booted). */
 int __init init_xen_time(void)
 {
-    local_irq_disable();
-
-    /* check if TSC is invariant during deep C state
-       this is a new feature introduced by Nehalem*/
-    if ( cpuid_edx(0x80000007) & (1u<<8) )
-        tsc_invariant = 1;
+    /* If we have constant TSCs then scale factor can be shared. */
+    if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+    {
+        int cpu;
+        for_each_cpu ( cpu )
+            per_cpu(cpu_time, cpu).tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
+    }
 
     open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);
 
-    init_percpu_time();
-
-    stime_platform_stamp = 0;
-    init_platform_timer();
+    /* System time (get_s_time()) starts ticking from now. */
+    rdtscll(this_cpu(cpu_time).local_tsc_stamp);
 
+    /* NB. get_cmos_time() can take over one second to execute. */
     do_settime(get_cmos_time(), 0, NOW());
 
-    local_irq_enable();
+    init_platform_timer();
+
+    init_percpu_time();
 
     return 0;
 }
@@ -1100,24 +1233,19 @@ void __init early_time_init(void)
     setup_irq(0, &irq0);
 }
 
-/* force_hpet_broadcast: if true, force using hpet_broadcast to fix lapic stop
-   issue for deep C state with pit disabled */
-static int force_hpet_broadcast;
-boolean_param("hpetbroadcast", force_hpet_broadcast);
-
 /* keep pit enabled for pit_broadcast working while cpuidle enabled */
 static int disable_pit_irq(void)
 {
-    if ( using_pit || !cpu_has_apic || (xen_cpuidle && !force_hpet_broadcast) )
+    if ( using_pit || !cpu_has_apic )
         return 0;
 
     /*
      * If we do not rely on PIT CH0 then we can use HPET for one-shot timer 
      * emulation when entering deep C states.
      * XXX dom0 may rely on RTC interrupt delivery, so only enable
-     * hpet_broadcast if force_hpet_broadcast.
+     * hpet_broadcast if FSB mode available or if force_hpet_broadcast.
      */
-    if ( xen_cpuidle && force_hpet_broadcast )
+    if ( xen_cpuidle )
     {
         hpet_broadcast_init();
         if ( !hpet_broadcast_is_available() )
@@ -1176,6 +1304,9 @@ int time_suspend(void)
         cmos_utc_offset = -get_cmos_time();
         cmos_utc_offset += (wc_sec + (wc_nsec + NOW()) / 1000000000ULL);
         kill_timer(&calibration_timer);
+
+        /* Sync platform timer stamps. */
+        platform_time_calibration();
     }
 
     /* Better to cancel calibration timer for accuracy. */
@@ -1188,19 +1319,18 @@ int time_resume(void)
 {
     /*u64 tmp = */init_pit_and_calibrate_tsc();
 
-    disable_pit_irq();
-
     /* Disable this while calibrate_tsc_ap() also is skipped. */
     /*set_time_scale(&this_cpu(cpu_time).tsc_scale, tmp);*/
 
     resume_platform_timer();
 
+    disable_pit_irq();
+
     init_percpu_time();
 
     do_settime(get_cmos_time() + cmos_utc_offset, 0, NOW());
 
-    if ( !is_idle_vcpu(current) )
-        update_vcpu_system_time(current);
+    update_vcpu_system_time(current);
 
     return 0;
 }
index ce96ad6f68eac02c958ce31274851727bc9aca5a..dfdc7bf044bbcfa48787accb3565f19c1197762f 100644 (file)
@@ -723,10 +723,11 @@ static void pv_cpuid(struct cpu_user_regs *regs)
     {
         /* Modify Feature Information. */
         __clear_bit(X86_FEATURE_VME, &d);
-        __clear_bit(X86_FEATURE_PSE, &d);
+        if ( !cpu_has_apic )
+            __clear_bit(X86_FEATURE_APIC, &d);
+        if ( !opt_allow_hugepage )
+            __clear_bit(X86_FEATURE_PSE, &d);
         __clear_bit(X86_FEATURE_PGE, &d);
-        __clear_bit(X86_FEATURE_MCE, &d);
-        __clear_bit(X86_FEATURE_MCA, &d);
         __clear_bit(X86_FEATURE_PSE36, &d);
     }
     switch ( (uint32_t)regs->eax )
@@ -754,6 +755,10 @@ static void pv_cpuid(struct cpu_user_regs *regs)
         __clear_bit(X86_FEATURE_XTPR % 32, &c);
         __clear_bit(X86_FEATURE_PDCM % 32, &c);
         __clear_bit(X86_FEATURE_DCA % 32, &c);
+        __clear_bit(X86_FEATURE_XSAVE % 32, &c);
+        if ( !cpu_has_apic )
+           __clear_bit(X86_FEATURE_X2APIC % 32, &c);
+        __set_bit(X86_FEATURE_HYPERVISOR % 32, &c);
         break;
     case 0x80000001:
         /* Modify Feature Information. */
@@ -771,6 +776,8 @@ static void pv_cpuid(struct cpu_user_regs *regs)
         __clear_bit(X86_FEATURE_RDTSCP % 32, &d);
 
         __clear_bit(X86_FEATURE_SVME % 32, &c);
+        if ( !cpu_has_apic )
+           __clear_bit(X86_FEATURE_EXTAPICSPACE % 32, &c);
         __clear_bit(X86_FEATURE_OSVW % 32, &c);
         __clear_bit(X86_FEATURE_IBS % 32, &c);
         __clear_bit(X86_FEATURE_SKINIT % 32, &c);
@@ -1030,7 +1037,7 @@ static int handle_gdt_ldt_mapping_fault(
 #endif
 
 static int __spurious_page_fault(
-    unsigned long addr, struct cpu_user_regs *regs)
+    unsigned long addr, unsigned int error_code)
 {
     unsigned long mfn, cr3 = read_cr3();
 #if CONFIG_PAGING_LEVELS >= 4
@@ -1052,17 +1059,17 @@ static int __spurious_page_fault(
         return 0;
 
     /* Reserved bit violations are never spurious faults. */
-    if ( regs->error_code & PFEC_reserved_bit )
+    if ( error_code & PFEC_reserved_bit )
         return 0;
 
     required_flags  = _PAGE_PRESENT;
-    if ( regs->error_code & PFEC_write_access )
+    if ( error_code & PFEC_write_access )
         required_flags |= _PAGE_RW;
-    if ( regs->error_code & PFEC_user_mode )
+    if ( error_code & PFEC_user_mode )
         required_flags |= _PAGE_USER;
 
     disallowed_flags = 0;
-    if ( regs->error_code & PFEC_insn_fetch )
+    if ( error_code & PFEC_insn_fetch )
         disallowed_flags |= _PAGE_NX;
 
     mfn = cr3 >> PAGE_SHIFT;
@@ -1120,7 +1127,7 @@ static int __spurious_page_fault(
     dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
             "at addr %lx, e/c %04x\n",
             current->domain->domain_id, current->vcpu_id,
-            addr, regs->error_code);
+            addr, error_code);
 #if CONFIG_PAGING_LEVELS >= 4
     dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
 #endif
@@ -1129,14 +1136,11 @@ static int __spurious_page_fault(
 #endif
     dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
     dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
-#ifndef NDEBUG
-    show_registers(regs);
-#endif
     return 1;
 }
 
 static int spurious_page_fault(
-    unsigned long addr, struct cpu_user_regs *regs)
+    unsigned long addr, unsigned int error_code)
 {
     unsigned long flags;
     int           is_spurious;
@@ -1146,7 +1150,7 @@ static int spurious_page_fault(
      * page tables from becoming invalid under our feet during the walk.
      */
     local_irq_save(flags);
-    is_spurious = __spurious_page_fault(addr, regs);
+    is_spurious = __spurious_page_fault(addr, error_code);
     local_irq_restore(flags);
 
     return is_spurious;
@@ -1161,15 +1165,17 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
     if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
         return 0;
 
+    /* Faults from external-mode guests are handled by shadow/hap */
+    if ( paging_mode_external(d) && guest_mode(regs) )
+    {
+        int ret = paging_fault(addr, regs);
+        if ( ret == EXCRET_fault_fixed )
+            trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
+        return ret;
+    }
+
     if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
     {
-        if ( paging_mode_external(d) && guest_mode(regs) )
-        {
-            int ret = paging_fault(addr, regs);
-            if ( ret == EXCRET_fault_fixed )
-                trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr);
-            return ret;
-        }
         if ( !(regs->error_code & PFEC_reserved_bit) &&
              (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
             return handle_gdt_ldt_mapping_fault(
@@ -1186,7 +1192,9 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
          ptwr_do_page_fault(v, addr, regs) )
         return EXCRET_fault_fixed;
 
-    if ( paging_mode_enabled(d) )
+    /* For non-external shadowed guests, we fix up both their own 
+     * pagefaults and Xen's, since they share the pagetables. */
+    if ( paging_mode_enabled(d) && !paging_mode_external(d) )
     {
         int ret = paging_fault(addr, regs);
         if ( ret == EXCRET_fault_fixed )
@@ -1208,9 +1216,13 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
 {
     unsigned long addr, fixup;
+    unsigned int error_code;
 
     addr = read_cr2();
 
+    /* fixup_page_fault() might change regs->error_code, so cache it here. */
+    error_code = regs->error_code;
+
     DEBUGGER_trap_entry(TRAP_page_fault, regs);
 
     perfc_incr(page_faults);
@@ -1220,7 +1232,7 @@ asmlinkage void do_page_fault(struct cpu_user_regs *regs)
 
     if ( unlikely(!guest_mode(regs)) )
     {
-        if ( spurious_page_fault(addr, regs) )
+        if ( spurious_page_fault(addr, error_code) )
             return;
 
         if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
@@ -1239,11 +1251,11 @@ asmlinkage void do_page_fault(struct cpu_user_regs *regs)
         panic("FATAL PAGE FAULT\n"
               "[error_code=%04x]\n"
               "Faulting linear address: %p\n",
-              regs->error_code, _p(addr));
+              error_code, _p(addr));
     }
 
     if ( unlikely(current->domain->arch.suppress_spurious_page_faults
-                  && spurious_page_fault(addr, regs)) )
+                  && spurious_page_fault(addr, error_code)) )
         return;
 
     propagate_page_fault(addr, regs->error_code);
@@ -1619,6 +1631,16 @@ void (*pv_post_outb_hook)(unsigned int port, u8 value);
 # define read_sreg(regs, sr) read_segment_register(sr)
 #endif
 
+static int is_cpufreq_controller(struct domain *d)
+{
+    return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
+            (d->domain_id == 0));
+}
+
+/* Intel vMCE MSRs virtualization */
+extern int intel_mce_wrmsr(u32 msr, u32 lo,  u32 hi);
+extern int intel_mce_rdmsr(u32 msr, u32 *lo,  u32 *hi);
+
 static int emulate_privileged_op(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
@@ -2002,9 +2024,12 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
         case 4: /* Read CR4 */
             /*
              * Guests can read CR4 to see what features Xen has enabled. We
-             * therefore lie about PGE & PSE as they are unavailable to guests.
+             * therefore lie about PGE as it is unavailable to guests.
+             * Also disallow PSE if hugepages are not enabled.
              */
-            *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
+            *reg = read_cr4() & ~X86_CR4_PGE;
+            if ( !opt_allow_hugepage )
+                *reg &= ~X86_CR4_PSE;
             break;
 
         default:
@@ -2127,7 +2152,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
         case MSR_K8_PSTATE7:
             if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
                 goto fail;
-            if ( cpufreq_controller != FREQCTL_dom0_kernel )
+            if ( !is_cpufreq_controller(v->domain) )
                 break;
             if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
                 goto fail;
@@ -2162,10 +2187,12 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
             if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, eax, edx) != 0 )
                 goto fail;
             break;
+        case MSR_IA32_MPERF:
+        case MSR_IA32_APERF:
         case MSR_IA32_PERF_CTL:
             if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
                 goto fail;
-            if ( cpufreq_controller != FREQCTL_dom0_kernel )
+            if ( !is_cpufreq_controller(v->domain) )
                 break;
             if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
                 goto fail;
@@ -2173,12 +2200,23 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
         case MSR_IA32_THERM_CONTROL:
             if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
                 goto fail;
+            if ( (v->domain->domain_id != 0) || !v->domain->is_pinned )
+                break;
             if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
                 goto fail;
             break;
         default:
             if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
                 break;
+            if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+            {
+                int rc = intel_mce_wrmsr(regs->ecx, eax, edx);
+                if ( rc == -1 )
+                    goto fail;
+                if ( rc == 0 )
+                    break;
+            }
+
             if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
                  (eax != l) || (edx != h) )
         invalid:
@@ -2231,7 +2269,7 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
         case MSR_K8_PSTATE7:
             if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
                 goto fail;
-            if ( cpufreq_controller != FREQCTL_dom0_kernel )
+            if ( !is_cpufreq_controller(v->domain) )
             {
                 regs->eax = regs->edx = 0;
                 break;
@@ -2249,7 +2287,6 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
                          MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
             break;
         case MSR_EFER:
-        case MSR_IA32_THERM_CONTROL:
         case MSR_AMD_PATCHLEVEL:
         default:
             if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
@@ -2263,6 +2300,16 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
                         _p(regs->ecx));*/
             if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
                 goto fail;
+
+            if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+            {
+                int rc = intel_mce_rdmsr(regs->ecx, &eax, &edx);
+                if ( rc == -1 )
+                    goto fail;
+                if ( rc == 0 )
+                    break;
+            }
+
             break;
         }
         break;
@@ -2978,20 +3025,31 @@ void set_intr_gate(unsigned int n, void *addr)
     __set_intr_gate(n, 0, addr);
 }
 
-void set_tss_desc(unsigned int n, void *addr)
+void load_TR(void)
 {
+    struct tss_struct *tss = &init_tss[smp_processor_id()];
+    struct desc_ptr old_gdt, tss_gdt = {
+        .base = (long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY),
+        .limit = LAST_RESERVED_GDT_BYTE
+    };
+
     _set_tssldt_desc(
-        per_cpu(gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
-        (unsigned long)addr,
+        this_cpu(gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
+        (unsigned long)tss,
         offsetof(struct tss_struct, __cacheline_filler) - 1,
         9);
 #ifdef CONFIG_COMPAT
     _set_tssldt_desc(
-        per_cpu(compat_gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
-        (unsigned long)addr,
+        this_cpu(compat_gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
+        (unsigned long)tss,
         offsetof(struct tss_struct, __cacheline_filler) - 1,
         11);
 #endif
+
+    /* Switch to non-compat GDT (which has B bit clear) to execute LTR. */
+    asm volatile (
+        "sgdt %1; lgdt %2; ltr %%ax; lgdt %1"
+        : : "a" (TSS_ENTRY << 3), "m" (old_gdt), "m" (tss_gdt) : "memory" );
 }
 
 void __devinit percpu_traps_init(void)
@@ -3077,7 +3135,8 @@ long register_guest_nmi_callback(unsigned long address)
 
     t->vector  = TRAP_nmi;
     t->flags   = 0;
-    t->cs      = !IS_COMPAT(d) ? FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS;
+    t->cs      = (is_pv_32on64_domain(d) ?
+                  FLAT_COMPAT_KERNEL_CS : FLAT_KERNEL_CS);
     t->address = address;
     TI_SET_IF(t, 1);
 
index 0ed2a0a26f21d673da5db23078e70ddad8c4cf59..98e831ee00f5d5c3ffebab987caa272d000da2ba 100644 (file)
@@ -88,7 +88,9 @@ void __dummy__(void)
     BLANK();
 
     OFFSET(VCPU_vmx_launched, struct vcpu, arch.hvm_vmx.launched);
-    OFFSET(VCPU_vmx_emul, struct vcpu, arch.hvm_vmx.vmxemul);
+    OFFSET(VCPU_vmx_realmode, struct vcpu, arch.hvm_vmx.vmx_realmode);
+    OFFSET(VCPU_vmx_emulate, struct vcpu, arch.hvm_vmx.vmx_emulate);
+    OFFSET(VCPU_vm86_seg_mask, struct vcpu, arch.hvm_vmx.vm86_segment_mask);
     OFFSET(VCPU_hvm_guest_cr2, struct vcpu, arch.hvm_vcpu.guest_cr[2]);
     BLANK();
 
index 10f807b3e3abc18bcab5e7e2343ab563401aa659..516a630af517c338982ca961a822e33aecb516b5 100644 (file)
@@ -43,7 +43,7 @@ static inline struct vcpu *mapcache_current_vcpu(void)
 void *map_domain_page(unsigned long mfn)
 {
     unsigned long va;
-    unsigned int idx, i;
+    unsigned int idx, i, flags;
     struct vcpu *v;
     struct mapcache_domain *dcache;
     struct mapcache_vcpu *vcache;
@@ -69,7 +69,7 @@ void *map_domain_page(unsigned long mfn)
         goto out;
     }
 
-    spin_lock(&dcache->lock);
+    spin_lock_irqsave(&dcache->lock, flags);
 
     /* Has some other CPU caused a wrap? We must flush if so. */
     if ( unlikely(dcache->epoch != vcache->shadow_epoch) )
@@ -105,7 +105,7 @@ void *map_domain_page(unsigned long mfn)
     set_bit(idx, dcache->inuse);
     dcache->cursor = idx + 1;
 
-    spin_unlock(&dcache->lock);
+    spin_unlock_irqrestore(&dcache->lock, flags);
 
     l1e_write(&dcache->l1tab[idx], l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
 
@@ -114,7 +114,7 @@ void *map_domain_page(unsigned long mfn)
     return (void *)va;
 }
 
-void unmap_domain_page(void *va)
+void unmap_domain_page(const void *va)
 {
     unsigned int idx;
     struct vcpu *v;
@@ -241,7 +241,7 @@ void *map_domain_page_global(unsigned long mfn)
     return (void *)va;
 }
 
-void unmap_domain_page_global(void *va)
+void unmap_domain_page_global(const void *va)
 {
     unsigned long __va = (unsigned long)va;
     l2_pgentry_t *pl2e;
index 8a08617981ef650f3328f91297e879d9fea5a773..764d3290e14aa896ccbf4508ea4f7deb41c6c4ef 100644 (file)
@@ -703,6 +703,7 @@ ENTRY(hypercall_table)
         .long do_sysctl             /* 35 */
         .long do_domctl
         .long do_kexec_op
+        .long do_tmem_op
         .rept __HYPERVISOR_arch_0-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -750,6 +751,7 @@ ENTRY(hypercall_args_table)
         .byte 1 /* do_sysctl            */  /* 35 */
         .byte 1 /* do_domctl            */
         .byte 2 /* do_kexec_op          */
+        .byte 1 /* do_tmem_op           */
         .rept __HYPERVISOR_arch_0-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
index b22ddbf5dc0a0d4d8fa1b8c5b98a2c27ba099353..11b8cae944b16c388ddebedf26130841b0e3b95f 100644 (file)
@@ -6,8 +6,6 @@
  * - Magnus Damm <magnus@valinux.co.jp>
  */
 
-#ifndef CONFIG_COMPAT
-
 #include <xen/types.h>
 #include <xen/kernel.h>
 #include <asm/page.h>
@@ -20,7 +18,6 @@ int machine_kexec_get_xen(xen_kexec_range_t *range)
                       (unsigned long)range->start;
         return 0;
 }
-#endif
 
 /*
  * Local variables:
index ea3c18ca886388949091cd5ba8be29ca06242a87..668f729cf6a55168588cbfdbc7eb68d5704277c6 100644 (file)
@@ -43,7 +43,7 @@ static unsigned long mpt_size;
 void *alloc_xen_pagetable(void)
 {
     extern int early_boot;
-    extern unsigned long xenheap_phys_start;
+    extern unsigned long xenheap_initial_phys_start;
     unsigned long mfn;
 
     if ( !early_boot )
@@ -53,8 +53,8 @@ void *alloc_xen_pagetable(void)
         return v;
     }
 
-    mfn = xenheap_phys_start >> PAGE_SHIFT;
-    xenheap_phys_start += PAGE_SIZE;
+    mfn = xenheap_initial_phys_start >> PAGE_SHIFT;
+    xenheap_initial_phys_start += PAGE_SIZE;
     return mfn_to_virt(mfn);
 }
 
@@ -132,30 +132,6 @@ void __init setup_idle_pagetable(void)
                                 __PAGE_HYPERVISOR));
 }
 
-unsigned long clone_idle_pagetable(struct vcpu *v)
-{
-    unsigned int i;
-    struct domain *d = v->domain;
-    l3_pgentry_t *l3_table = v->arch.pae_l3_cache.table[0];
-    l2_pgentry_t *l2_table = alloc_xenheap_page();
-
-    if ( !l2_table )
-        return 0;
-
-    memcpy(l3_table, idle_pg_table, L3_PAGETABLE_ENTRIES * sizeof(*l3_table));
-    l3_table[l3_table_offset(PERDOMAIN_VIRT_START)] =
-        l3e_from_page(virt_to_page(l2_table), _PAGE_PRESENT);
-
-    copy_page(l2_table, idle_pg_table_l2 +
-              l3_table_offset(PERDOMAIN_VIRT_START) * L2_PAGETABLE_ENTRIES);
-    for ( i = 0; i < PDPT_L2_ENTRIES; ++i )
-        l2_table[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
-            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
-                          __PAGE_HYPERVISOR);
-
-    return __pa(l3_table);
-}
-
 void __init zap_low_mappings(l2_pgentry_t *dom0_l2)
 {
     int i;
@@ -183,15 +159,6 @@ void __init subarch_init_memory(void)
     unsigned long m2p_start_mfn;
     unsigned int i, j;
 
-    /*
-     * We are rather picky about the layout of 'struct page_info'. The
-     * count_info and domain fields must be adjacent, as we perform atomic
-     * 64-bit operations on them. Also, just for sanity, we assert the size
-     * of the structure here.
-     */
-    BUILD_BUG_ON(offsetof(struct page_info, u.inuse._domain) != 
-                 (offsetof(struct page_info, count_info) + sizeof(u32)));
-    BUILD_BUG_ON((offsetof(struct page_info, count_info) & 7) != 0);
     BUILD_BUG_ON(sizeof(struct page_info) != 24);
 
     /* M2P table is mappable read-only by privileged domains. */
index 2c3d21fcede130cc659e7c83143bcfea2b25db18..1bbb65405a4db04d73d870ae970c5c2018d21578 100644 (file)
@@ -26,7 +26,6 @@ SECTIONS
        *(.fixup)
        *(.gnu.warning)
        } :text =0x9090
-  .text.lock : { *(.text.lock) } :text /* out-of-line lock text */
 
   _etext = .;                  /* End of text section */
 
@@ -92,6 +91,7 @@ SECTIONS
        *(.exit.text)
        *(.exit.data)
        *(.exitcall.exit)
+       *(.eh_frame)
        }
 
   /* Stabs debugging sections.  */
index 268494745ce45b2e188117e84d4b3d260751f8cb..a1428273ebda0274d603c10d5bb15c3bde1fbb30 100644 (file)
@@ -13,15 +13,4 @@ obj-$(CONFIG_COMPAT) += domain.o
 obj-$(CONFIG_COMPAT) += physdev.o
 obj-$(CONFIG_COMPAT) += platform_hypercall.o
 obj-$(CONFIG_COMPAT) += cpu_idle.o
-
-ifeq ($(CONFIG_COMPAT),y)
-# extra dependencies
-compat.o:      ../compat.c
-domctl.o:      ../domctl.c
-mm.o:          compat/mm.c
-physdev.o:     ../physdev.c
-platform_hypercall.o: ../platform_hypercall.c
-sysctl.o:      ../sysctl.c
-traps.o:       compat/traps.c
-cpu_idle.o:    ../acpi/cpu_idle.c
-endif
+obj-$(CONFIG_COMPAT) += cpufreq.o
index ca00490756c528d0fa365e2cd52a50e25ef0d9a8..d4c4262f1f68b8af8ef108917b1d4d342cba46d9 100644 (file)
@@ -60,6 +60,8 @@ void __dummy__(void)
     DEFINE(UREGS_user_sizeof, sizeof(struct cpu_user_regs));
     BLANK();
 
+    OFFSET(irq_caps_offset, struct domain, irq_caps);
+    OFFSET(next_in_list_offset, struct domain, next_in_list);
     OFFSET(VCPU_processor, struct vcpu, processor);
     OFFSET(VCPU_domain, struct vcpu, domain);
     OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info);
@@ -107,7 +109,9 @@ void __dummy__(void)
     BLANK();
 
     OFFSET(VCPU_vmx_launched, struct vcpu, arch.hvm_vmx.launched);
-    OFFSET(VCPU_vmx_emul, struct vcpu, arch.hvm_vmx.vmxemul);
+    OFFSET(VCPU_vmx_realmode, struct vcpu, arch.hvm_vmx.vmx_realmode);
+    OFFSET(VCPU_vmx_emulate, struct vcpu, arch.hvm_vmx.vmx_emulate);
+    OFFSET(VCPU_vm86_seg_mask, struct vcpu, arch.hvm_vmx.vm86_segment_mask);
     OFFSET(VCPU_hvm_guest_cr2, struct vcpu, arch.hvm_vcpu.guest_cr[2]);
     BLANK();
 
index 085babedc49e854485bf6bd78ade5ed015ad12ee..7efedc903f062e6c85191f622b0155e955104faa 100644 (file)
@@ -234,6 +234,7 @@ ENTRY(compat_syscall)
         call  compat_create_bounce_frame
         jmp   compat_test_all_events
 2:      movl  $TRAP_gp_fault,UREGS_entry_vector(%rsp)
+        subl  $2,UREGS_rip(%rsp)
         movq  VCPU_gp_fault_addr(%rbx),%rax
         movzwl VCPU_gp_fault_sel(%rbx),%esi
         movb  $(TBF_EXCEPTION|TBF_EXCEPTION_ERRCODE|TBF_INTERRUPT),%cl
@@ -407,6 +408,7 @@ ENTRY(compat_hypercall_table)
         .quad do_sysctl                 /* 35 */
         .quad do_domctl
         .quad compat_kexec_op
+        .quad do_tmem_op
         .rept __HYPERVISOR_arch_0-((.-compat_hypercall_table)/8)
         .quad compat_ni_hypercall
         .endr
@@ -454,6 +456,7 @@ ENTRY(compat_hypercall_args_table)
         .byte 1 /* do_sysctl                */  /* 35 */
         .byte 1 /* do_domctl                */
         .byte 2 /* compat_kexec_op          */
+        .byte 1 /* do_tmem_op               */
         .rept __HYPERVISOR_arch_0-(.-compat_hypercall_args_table)
         .byte 0 /* compat_ni_hypercall      */
         .endr
index 913760c14f8fc01ebcf283bb7cc722627721d4cc..0ac685f3669995247e7de8f9477f99f102e9900c 100644 (file)
@@ -69,20 +69,6 @@ int compat_arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
         break;
     }
 
-    case XENMEM_remove_from_physmap:
-    {
-        struct compat_remove_from_physmap cmp;
-        struct xen_remove_from_physmap *nat = (void *)COMPAT_ARG_XLAT_VIRT_BASE;
-
-        if ( copy_from_guest(&cmp, arg, 1) )
-            return -EFAULT;
-
-        XLAT_remove_from_physmap(nat, &cmp);
-        rc = arch_memory_op(op, guest_handle_from_ptr(nat, void));
-
-        break;
-    }
-
     case XENMEM_set_memory_map:
     {
         struct compat_foreign_memory_map cmp;
@@ -128,6 +114,29 @@ int compat_arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
         break;
     }
 
+    case XENMEM_set_pod_target:
+    case XENMEM_get_pod_target:
+    {
+        struct compat_pod_target cmp;
+        struct xen_pod_target *nat = (void *)COMPAT_ARG_XLAT_VIRT_BASE;
+
+        if ( copy_from_guest(&cmp, arg, 1) )
+            return -EFAULT;
+
+        XLAT_pod_target(nat, &cmp);
+
+        rc = arch_memory_op(op, guest_handle_from_ptr(nat, void));
+        if ( rc < 0 )
+            break;
+
+        XLAT_pod_target(&cmp, nat);
+
+        if ( copy_to_guest(arg, &cmp, 1) )
+            rc = -EFAULT;
+
+        break;
+    }
+
     case XENMEM_machphys_mapping:
     {
         struct domain *d = current->domain;
@@ -231,6 +240,8 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mmuext_op_compat_t) cmp_uops,
             case MMUEXT_PIN_L4_TABLE:
             case MMUEXT_UNPIN_TABLE:
             case MMUEXT_NEW_BASEPTR:
+            case MMUEXT_CLEAR_PAGE:
+            case MMUEXT_COPY_PAGE:
                 arg1 = XLAT_mmuext_op_arg1_mfn;
                 break;
             default:
@@ -258,6 +269,9 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mmuext_op_compat_t) cmp_uops,
             case MMUEXT_INVLPG_MULTI:
                 arg2 = XLAT_mmuext_op_arg2_vcpumask;
                 break;
+            case MMUEXT_COPY_PAGE:
+                arg2 = XLAT_mmuext_op_arg2_src_mfn;
+                break;
             default:
                 arg2 = -1;
                 break;
index 6718f30a54ddd82fe1cdb6fd67c933ddb1eb8453..c26248e97b1efc8d254ac79a2b9f94b930bbc362 100644 (file)
@@ -44,7 +44,7 @@ DEFINE_XEN_GUEST_HANDLE(compat_processor_cx_t);
     xlat_page_current = xlat_page_start; \
 } while (0)
 
-static void *xlat_malloc(unsigned long *xlat_page_current, size_t size)
+void *xlat_malloc(unsigned long *xlat_page_current, size_t size)
 {
     void *ret;
 
diff --git a/xen/arch/x86/x86_64/cpufreq.c b/xen/arch/x86/x86_64/cpufreq.c
new file mode 100644 (file)
index 0000000..d005dfd
--- /dev/null
@@ -0,0 +1,70 @@
+/******************************************************************************
+ * cpufreq.c -- adapt 32b compat guest to 64b hypervisor.
+ *
+ *  Copyright (C) 2008, Liu Jinsong <jinsong.liu@intel.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/xmalloc.h>
+#include <xen/guest_access.h>
+#include <compat/platform.h>
+
+DEFINE_XEN_GUEST_HANDLE(compat_processor_px_t);
+
+#define xlat_page_start ((unsigned long)COMPAT_ARG_XLAT_VIRT_BASE)
+
+#define xlat_malloc_init(xlat_page_current)    do { \
+    xlat_page_current = xlat_page_start; \
+} while (0)
+
+extern void *xlat_malloc(unsigned long *xlat_page_current, size_t size);
+
+#define xlat_malloc_array(_p, _t, _c) ((_t *) xlat_malloc(&_p, sizeof(_t) * _c))
+
+extern int 
+set_px_pminfo(uint32_t cpu, struct xen_processor_performance *perf);
+
+int 
+compat_set_px_pminfo(uint32_t cpu, struct compat_processor_performance *perf)
+{
+    struct xen_processor_performance *xen_perf;
+    unsigned long xlat_page_current;
+
+    xlat_malloc_init(xlat_page_current);
+
+    xen_perf = xlat_malloc_array(xlat_page_current,
+                                  struct xen_processor_performance, 1);
+    if ( unlikely(xen_perf == NULL) )
+       return -EFAULT;
+
+#define XLAT_processor_performance_HNDL_states(_d_, _s_) do { \
+    XEN_GUEST_HANDLE(compat_processor_px_t) states; \
+    if ( unlikely(!compat_handle_okay((_s_)->states, (_s_)->state_count)) ) \
+        return -EFAULT; \
+    guest_from_compat_handle(states, (_s_)->states); \
+    (_d_)->states = guest_handle_cast(states, xen_processor_px_t); \
+} while (0)
+
+    XLAT_processor_performance(xen_perf, perf);
+#undef XLAT_processor_performance_HNDL_states
+
+    return set_px_pminfo(cpu, xen_perf);
+}
index d6491ce2ed32f6b8b2c6eab887be736b6ea9e05d..dce286ef3f44e173c240b693d9da24b5cd61eacc 100644 (file)
@@ -692,6 +692,7 @@ ENTRY(hypercall_table)
         .quad do_sysctl             /* 35 */
         .quad do_domctl
         .quad do_kexec_op
+        .quad do_tmem_op
         .rept __HYPERVISOR_arch_0-((.-hypercall_table)/8)
         .quad do_ni_hypercall
         .endr
@@ -739,7 +740,7 @@ ENTRY(hypercall_args_table)
         .byte 1 /* do_sysctl            */  /* 35 */
         .byte 1 /* do_domctl            */
         .byte 2 /* do_kexec             */
-        .byte 1 /* do_xsm_op            */
+        .byte 1 /* do_tmem_op           */
         .rept __HYPERVISOR_arch_0-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
index 96413cb8287a4b74417ca3033875b17e839547d0..0d4b1a10cf9f5fa42d6818b596d92804efd89d43 100644 (file)
@@ -6,20 +6,17 @@
  * - Magnus Damm <magnus@valinux.co.jp>
  */
 
-#ifndef CONFIG_COMPAT
-
 #include <xen/types.h>
+#include <xen/kernel.h>
 #include <asm/page.h>
 #include <public/kexec.h>
 
 int machine_kexec_get_xen(xen_kexec_range_t *range)
 {
-        range->start = xenheap_phys_start;
-        range->size = (unsigned long)xenheap_phys_end -
-                      (unsigned long)range->start;
+        range->start = virt_to_maddr(_start);
+        range->size = virt_to_maddr(_end) - (unsigned long)range->start;
         return 0;
 }
-#endif
 
 /*
  * Local variables:
index 6903a227a35ed716c73bda7bb0e45a9d2bb38810..81440d100af49e4a5ca0d9b7bb48ec3501e4a8c3 100644 (file)
@@ -21,7 +21,6 @@
 #include <xen/lib.h>
 #include <xen/init.h>
 #include <xen/mm.h>
-#include <xen/numa.h>
 #include <xen/sched.h>
 #include <xen/guest_access.h>
 #include <asm/current.h>
@@ -31,6 +30,7 @@
 #include <asm/fixmap.h>
 #include <asm/hypercall.h>
 #include <asm/msr.h>
+#include <asm/numa.h>
 #include <public/memory.h>
 
 #ifdef CONFIG_COMPAT
@@ -106,6 +106,7 @@ l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
 void __init paging_init(void)
 {
     unsigned long i, mpt_size, va;
+    unsigned int memflags;
     l3_pgentry_t *l3_ro_mpt;
     l2_pgentry_t *l2_ro_mpt = NULL;
     struct page_info *l1_pg, *l2_pg, *l3_pg;
@@ -126,7 +127,36 @@ void __init paging_init(void)
     mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
     for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
     {
-        if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
+        BUILD_BUG_ON(RO_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
+        va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
+        memflags = MEMF_node(phys_to_nid(i <<
+            (L2_PAGETABLE_SHIFT - 3 + PAGE_SHIFT)));
+
+        if ( cpu_has_page1gb &&
+             !((unsigned long)l2_ro_mpt & ~PAGE_MASK) &&
+             (mpt_size >> L3_PAGETABLE_SHIFT) > (i >> PAGETABLE_ORDER) &&
+             (l1_pg = alloc_domheap_pages(NULL, 2 * PAGETABLE_ORDER,
+                                          memflags)) != NULL )
+        {
+            map_pages_to_xen(
+                RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
+                page_to_mfn(l1_pg),
+                1UL << (2 * PAGETABLE_ORDER),
+                PAGE_HYPERVISOR);
+            memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
+                   0x77, 1UL << L3_PAGETABLE_SHIFT);
+
+            ASSERT(!l2_table_offset(va));
+            /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
+            l3e_write(&l3_ro_mpt[l3_table_offset(va)],
+                l3e_from_page(l1_pg,
+                    /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
+            i += (1UL << PAGETABLE_ORDER) - 1;
+            continue;
+        }
+
+        if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
+                                          memflags)) == NULL )
             goto nomem;
         map_pages_to_xen(
             RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
@@ -137,14 +167,13 @@ void __init paging_init(void)
                1UL << L2_PAGETABLE_SHIFT);
         if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
         {
-            if ( (l2_pg = alloc_domheap_page(NULL, 0)) == NULL )
+            if ( (l2_pg = alloc_domheap_page(NULL, memflags)) == NULL )
                 goto nomem;
-            va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
             l2_ro_mpt = page_to_virt(l2_pg);
             clear_page(l2_ro_mpt);
             l3e_write(&l3_ro_mpt[l3_table_offset(va)],
                       l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER));
-            l2_ro_mpt += l2_table_offset(va);
+            ASSERT(!l2_table_offset(va));
         }
         /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */
         l2e_write(l2_ro_mpt, l2e_from_page(
@@ -173,7 +202,10 @@ void __init paging_init(void)
         m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
     for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
     {
-        if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL )
+        memflags = MEMF_node(phys_to_nid(i <<
+            (L2_PAGETABLE_SHIFT - 2 + PAGE_SHIFT)));
+        if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
+                                          memflags)) == NULL )
             goto nomem;
         map_pages_to_xen(
             RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
@@ -207,24 +239,6 @@ void __init setup_idle_pagetable(void)
                   __PAGE_HYPERVISOR));
 }
 
-unsigned long clone_idle_pagetable(struct vcpu *v)
-{
-    struct domain *d = v->domain;
-    struct page_info *page = alloc_domheap_page(NULL,
-                                                MEMF_node(vcpu_to_node(v)));
-    l4_pgentry_t *l4_table = page_to_virt(page);
-
-    if ( !page )
-        return 0;
-
-    copy_page(l4_table, idle_pg_table);
-    l4_table[l4_table_offset(PERDOMAIN_VIRT_START)] =
-        l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
-                      __PAGE_HYPERVISOR);
-
-    return __pa(l4_table);
-}
-
 void __init zap_low_mappings(void)
 {
     BUG_ON(num_online_cpus() != 1);
@@ -240,36 +254,36 @@ void __init zap_low_mappings(void)
 
 void __init subarch_init_memory(void)
 {
-    unsigned long i, v, m2p_start_mfn;
+    unsigned long i, n, v, m2p_start_mfn;
     l3_pgentry_t l3e;
     l2_pgentry_t l2e;
 
-    /*
-     * We are rather picky about the layout of 'struct page_info'. The
-     * count_info and domain fields must be adjacent, as we perform atomic
-     * 64-bit operations on them.
-     */
-    BUILD_BUG_ON(offsetof(struct page_info, u.inuse._domain) != 
-                 (offsetof(struct page_info, count_info) + sizeof(u32)));
-    BUILD_BUG_ON((offsetof(struct page_info, count_info) & 7) != 0);
-    BUILD_BUG_ON(sizeof(struct page_info) !=
-                 (32 + BITS_TO_LONGS(NR_CPUS)*sizeof(long)));
-
+    BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
+    BUILD_BUG_ON(RDWR_MPT_VIRT_END   & ((1UL << L3_PAGETABLE_SHIFT) - 1));
     /* M2P table is mappable read-only by privileged domains. */
     for ( v  = RDWR_MPT_VIRT_START;
           v != RDWR_MPT_VIRT_END;
-          v += 1 << L2_PAGETABLE_SHIFT )
+          v += n << PAGE_SHIFT )
     {
+        n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
         l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
             l3_table_offset(v)];
         if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
             continue;
-        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
-        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-            continue;
-        m2p_start_mfn = l2e_get_pfn(l2e);
+        if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
+        {
+            n = L1_PAGETABLE_ENTRIES;
+            l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
+            if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+                continue;
+            m2p_start_mfn = l2e_get_pfn(l2e);
+        }
+        else
+        {
+            m2p_start_mfn = l3e_get_pfn(l3e);
+        }
 
-        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        for ( i = 0; i < n; i++ )
         {
             struct page_info *page = mfn_to_page(m2p_start_mfn + i);
             share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
@@ -313,18 +327,29 @@ long subarch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
         if ( copy_from_guest(&xmml, arg, 1) )
             return -EFAULT;
 
+        BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
+        BUILD_BUG_ON(RDWR_MPT_VIRT_END   & ((1UL << L3_PAGETABLE_SHIFT) - 1));
         for ( i = 0, v = RDWR_MPT_VIRT_START;
               (i != xmml.max_extents) && (v != RDWR_MPT_VIRT_END);
-              i++, v += 1 << 21 )
+              i++, v += 1UL << L2_PAGETABLE_SHIFT )
         {
             l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
                 l3_table_offset(v)];
             if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
                 break;
-            l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
-            if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-                break;
-            mfn = l2e_get_pfn(l2e) + l1_table_offset(v);
+            if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
+            {
+                l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
+                if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+                    break;
+                mfn = l2e_get_pfn(l2e);
+            }
+            else
+            {
+                mfn = l3e_get_pfn(l3e)
+                    + (l2_table_offset(v) << PAGETABLE_ORDER);
+            }
+            ASSERT(!l1_table_offset(v));
             if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                 return -EFAULT;
         }
index 32abe933d39229cafc29f9a1797d095f1ef5b72b..d20eafcaff9053ee5dbf66129e1e9cf2263b416c 100644 (file)
@@ -18,6 +18,9 @@
 #define physdev_eoi                compat_physdev_eoi
 #define physdev_eoi_t              physdev_eoi_compat_t
 
+#define physdev_pirq_eoi_gmfn      compat_physdev_pirq_eoi_gmfn
+#define physdev_pirq_eoi_gmfn_t    physdev_pirq_eoi_gmfn_compat_t
+
 #define physdev_set_iobitmap       compat_physdev_set_iobitmap
 #define physdev_set_iobitmap_t     physdev_set_iobitmap_compat_t
 
index 02364c4535a32424c9dbe5c8c0e44791ef9b4b93..2002900d61ec9a48b3ed510d034493a32d287f03 100644 (file)
@@ -11,14 +11,14 @@ DEFINE_XEN_GUEST_HANDLE(compat_platform_op_t);
 #define xen_platform_op_t   compat_platform_op_t
 #define do_platform_op(x)   compat_platform_op(_##x)
 
-#define xenpf_copy_px_states compat_xenpf_copy_px_states
-
 #define xen_processor_px    compat_processor_px
 #define xen_processor_px_t  compat_processor_px_t
 #define xen_processor_performance    compat_processor_performance
 #define xen_processor_performance_t  compat_processor_performance_t
 #define xenpf_set_processor_pminfo   compat_pf_set_processor_pminfo
 
+#define set_px_pminfo          compat_set_px_pminfo
+
 #define xen_processor_power     compat_processor_power
 #define xen_processor_power_t   compat_processor_power_t
 #define set_cx_pminfo           compat_set_cx_pminfo
index 34f4ecf59cb23a1cd492e4193814412914b1eca4..70fef66f84a95aed912da5ab94e3a08d68ebdad4 100644 (file)
@@ -14,6 +14,8 @@
 #include <xen/nmi.h>
 #include <asm/current.h>
 #include <asm/flushtlb.h>
+#include <asm/traps.h>
+#include <asm/event.h>
 #include <asm/msr.h>
 #include <asm/page.h>
 #include <asm/shared.h>
@@ -265,6 +267,9 @@ unsigned long do_iret(void)
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     struct iret_context iret_saved;
     struct vcpu *v = current;
+    struct domain *d = v->domain;
+    struct bank_entry *entry;
+    int cpu = smp_processor_id();
 
     if ( unlikely(copy_from_user(&iret_saved, (void *)regs->rsp,
                                  sizeof(iret_saved))) )
@@ -304,6 +309,48 @@ unsigned long do_iret(void)
        && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
         vcpu_set_affinity(v, &v->cpu_affinity_tmp);
 
+   /* Currently, only inject vMCE to DOM0. */
+    if (v->trap_priority >= VCPU_TRAP_NMI) {
+        printk(KERN_DEBUG "MCE: Return from vMCE# trap!");
+        if (d->domain_id == 0 && v->vcpu_id == 0) {
+            if ( !d->arch.vmca_msrs.nr_injection ) {
+                printk(KERN_WARNING "MCE: Ret from vMCE#, nr_injection is 0\n");
+                goto end;
+            }
+
+            d->arch.vmca_msrs.nr_injection--;
+            if (!list_empty(&d->arch.vmca_msrs.impact_header)) {
+                entry = list_entry(d->arch.vmca_msrs.impact_header.next,
+                    struct bank_entry, list);
+                printk(KERN_DEBUG "MCE: Delete last injection Node\n");
+                list_del(&entry->list);
+            }
+            else
+                printk(KERN_DEBUG "MCE: Not found last injection "
+                    "Node, something Wrong!\n");
+
+            /* further injection */
+            if ( d->arch.vmca_msrs.nr_injection > 0) {
+                if ( d->arch.vmca_msrs.nr_injection > 0 &&
+                        guest_has_trap_callback(d, v->vcpu_id,
+                            TRAP_machine_check) &&
+                        !test_and_set_bool(dom0->vcpu[0]->mce_pending)) {
+                    cpumask_t affinity;
+
+                    dom0->vcpu[0]->cpu_affinity_tmp =
+                            dom0->vcpu[0]->cpu_affinity;
+                    cpus_clear(affinity);
+                    cpu_set(cpu, affinity);
+                    printk(KERN_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu,
+                        dom0->vcpu[0]->processor);
+                    vcpu_set_affinity(dom0->vcpu[0], &affinity);
+                    vcpu_kick(dom0->vcpu[0]);
+                }
+            }
+        }
+    } /* end of outer-if */
+
+end:
     /* Restore previous trap priority */
     v->trap_priority = v->old_trap_priority;
 
index 55559f4678c35d11e58a566fd432df6eb498126f..f6c37f02ccf9665ea95e0198d02d9648025d0248 100644 (file)
@@ -24,7 +24,6 @@ SECTIONS
        *(.fixup)
        *(.gnu.warning)
        } :text = 0x9090
-  .text.lock : { *(.text.lock) } :text /* out-of-line lock text */
 
   _etext = .;                  /* End of text section */
 
@@ -90,6 +89,7 @@ SECTIONS
        *(.exit.text)
        *(.exit.data)
        *(.exitcall.exit)
+       *(.eh_frame)
        }
 
   /* Stabs debugging sections.  */
index 8a6202b934c35b7f21cdcd9c6ad4966d65dcd1d5..26ab8e4030529921e6d6f2e1d5c249e881981b35 100644 (file)
@@ -28,6 +28,7 @@
 #define DstImplicit (0<<1) /* Destination operand is implicit in the opcode. */
 #define DstBitBase  (1<<1) /* Memory operand, bit string. */
 #define DstReg      (2<<1) /* Register operand. */
+#define DstEax      DstReg /* Register EAX (aka DstReg with no ModRM) */
 #define DstMem      (3<<1) /* Memory operand. */
 #define DstMask     (3<<1)
 /* Source operand type. */
@@ -51,35 +52,35 @@ static uint8_t opcode_table[256] = {
     /* 0x00 - 0x07 */
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
     ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
-    ByteOp|DstReg|SrcImm, DstReg|SrcImm, ImplicitOps, ImplicitOps,
+    ByteOp|DstEax|SrcImm, DstEax|SrcImm, ImplicitOps, ImplicitOps,
     /* 0x08 - 0x0F */
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
     ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
-    ByteOp|DstReg|SrcImm, DstReg|SrcImm, ImplicitOps, 0,
+    ByteOp|DstEax|SrcImm, DstEax|SrcImm, ImplicitOps, 0,
     /* 0x10 - 0x17 */
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
     ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
-    ByteOp|DstReg|SrcImm, DstReg|SrcImm, ImplicitOps, ImplicitOps,
+    ByteOp|DstEax|SrcImm, DstEax|SrcImm, ImplicitOps, ImplicitOps,
     /* 0x18 - 0x1F */
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
     ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
-    ByteOp|DstReg|SrcImm, DstReg|SrcImm, ImplicitOps, ImplicitOps,
+    ByteOp|DstEax|SrcImm, DstEax|SrcImm, ImplicitOps, ImplicitOps,
     /* 0x20 - 0x27 */
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
     ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
-    ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, ImplicitOps,
+    ByteOp|DstEax|SrcImm, DstEax|SrcImm, 0, ImplicitOps,
     /* 0x28 - 0x2F */
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
     ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
-    ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, ImplicitOps,
+    ByteOp|DstEax|SrcImm, DstEax|SrcImm, 0, ImplicitOps,
     /* 0x30 - 0x37 */
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
     ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
-    ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, ImplicitOps,
+    ByteOp|DstEax|SrcImm, DstEax|SrcImm, 0, ImplicitOps,
     /* 0x38 - 0x3F */
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
     ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
-    ByteOp|DstReg|SrcImm, DstReg|SrcImm, 0, ImplicitOps,
+    ByteOp|DstEax|SrcImm, DstEax|SrcImm, 0, ImplicitOps,
     /* 0x40 - 0x4F */
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
@@ -125,7 +126,7 @@ static uint8_t opcode_table[256] = {
     ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
     ByteOp|ImplicitOps, ImplicitOps,
     /* 0xA8 - 0xAF */
-    ByteOp|DstReg|SrcImm, DstReg|SrcImm,
+    ByteOp|DstEax|SrcImm, DstEax|SrcImm,
     ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
     ByteOp|ImplicitOps|Mov, ImplicitOps|Mov,
     ByteOp|ImplicitOps, ImplicitOps,
@@ -236,7 +237,8 @@ static uint8_t twobyte_table[256] = {
     DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
     ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
     /* 0xC0 - 0xC7 */
-    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0,
+    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
+    0, DstMem|SrcReg|ModRM|Mov,
     0, 0, 0, ImplicitOps|ModRM,
     /* 0xC8 - 0xCF */
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
@@ -686,12 +688,12 @@ static void __put_rep_prefix(
 })
 
 /* Clip maximum repetitions so that the index register only just wraps. */
-#define truncate_ea_and_reps(ea, reps, bytes_per_rep) ({                \
-    unsigned long __todo = (ctxt->regs->eflags & EF_DF) ? (ea) : ~(ea); \
-    __todo = truncate_word(__todo, ad_bytes);                           \
-    __todo = (__todo / (bytes_per_rep)) + 1;                            \
-    (reps) = (__todo < (reps)) ? __todo : (reps);                       \
-    truncate_word((ea), ad_bytes);                                      \
+#define truncate_ea_and_reps(ea, reps, bytes_per_rep) ({                  \
+    unsigned long __todo = (ctxt->regs->eflags & EFLG_DF) ? (ea) : ~(ea); \
+    __todo = truncate_word(__todo, ad_bytes);                             \
+    __todo = (__todo / (bytes_per_rep)) + 1;                              \
+    (reps) = (__todo < (reps)) ? __todo : (reps);                         \
+    truncate_word((ea), ad_bytes);                                        \
 })
 
 /* Compatibility function: read guest memory, zero-extend result to a ulong. */
@@ -1573,59 +1575,35 @@ x86_emulate(
 
     switch ( b )
     {
-    case 0x04 ... 0x05: /* add imm,%%eax */
-        dst.reg = (unsigned long *)&_regs.eax;
-        dst.val = _regs.eax;
-    case 0x00 ... 0x03: add: /* add */
+    case 0x00 ... 0x05: add: /* add */
         emulate_2op_SrcV("add", src, dst, _regs.eflags);
         break;
 
-    case 0x0c ... 0x0d: /* or imm,%%eax */
-        dst.reg = (unsigned long *)&_regs.eax;
-        dst.val = _regs.eax;
-    case 0x08 ... 0x0b: or:  /* or */
+    case 0x08 ... 0x0d: or:  /* or */
         emulate_2op_SrcV("or", src, dst, _regs.eflags);
         break;
 
-    case 0x14 ... 0x15: /* adc imm,%%eax */
-        dst.reg = (unsigned long *)&_regs.eax;
-        dst.val = _regs.eax;
-    case 0x10 ... 0x13: adc: /* adc */
+    case 0x10 ... 0x15: adc: /* adc */
         emulate_2op_SrcV("adc", src, dst, _regs.eflags);
         break;
 
-    case 0x1c ... 0x1d: /* sbb imm,%%eax */
-        dst.reg = (unsigned long *)&_regs.eax;
-        dst.val = _regs.eax;
-    case 0x18 ... 0x1b: sbb: /* sbb */
+    case 0x18 ... 0x1d: sbb: /* sbb */
         emulate_2op_SrcV("sbb", src, dst, _regs.eflags);
         break;
 
-    case 0x24 ... 0x25: /* and imm,%%eax */
-        dst.reg = (unsigned long *)&_regs.eax;
-        dst.val = _regs.eax;
-    case 0x20 ... 0x23: and: /* and */
+    case 0x20 ... 0x25: and: /* and */
         emulate_2op_SrcV("and", src, dst, _regs.eflags);
         break;
 
-    case 0x2c ... 0x2d: /* sub imm,%%eax */
-        dst.reg = (unsigned long *)&_regs.eax;
-        dst.val = _regs.eax;
-    case 0x28 ... 0x2b: sub: /* sub */
+    case 0x28 ... 0x2d: sub: /* sub */
         emulate_2op_SrcV("sub", src, dst, _regs.eflags);
         break;
 
-    case 0x34 ... 0x35: /* xor imm,%%eax */
-        dst.reg = (unsigned long *)&_regs.eax;
-        dst.val = _regs.eax;
-    case 0x30 ... 0x33: xor: /* xor */
+    case 0x30 ... 0x35: xor: /* xor */
         emulate_2op_SrcV("xor", src, dst, _regs.eflags);
         break;
 
-    case 0x3c ... 0x3d: /* cmp imm,%%eax */
-        dst.reg = (unsigned long *)&_regs.eax;
-        dst.val = _regs.eax;
-    case 0x38 ... 0x3b: cmp: /* cmp */
+    case 0x38 ... 0x3d: cmp: /* cmp */
         emulate_2op_SrcV("cmp", src, dst, _regs.eflags);
         dst.type = OP_NONE;
         break;
@@ -1987,8 +1965,6 @@ x86_emulate(
         break;
 
     case 0xa8 ... 0xa9: /* test imm,%%eax */
-        dst.reg = (unsigned long *)&_regs.eax;
-        dst.val = _regs.eax;
     case 0x84 ... 0x85: test: /* test */
         emulate_2op_SrcV("test", src, dst, _regs.eflags);
         dst.type = OP_NONE;
@@ -3910,6 +3886,12 @@ x86_emulate(
         }
         goto add;
 
+    case 0xc3: /* movnti */
+        /* Ignore the non-temporal hint for now. */
+        generate_exception_if(dst.bytes <= 2, EXC_UD, -1);
+        dst.val = src.val;
+        break;
+
     case 0xc7: /* Grp9 (cmpxchg8b/cmpxchg16b) */ {
         unsigned long old[2], exp[2], new[2];
         unsigned int i;
index a373528ea4b98787ffd04195a6c4e28bd50adf80..a43fa3a2945069a8b3f30dd8db0739ed68660a10 100644 (file)
@@ -67,6 +67,7 @@ typedef union segment_attributes {
         uint16_t l:   1;    /* 9;  Bit 53 */
         uint16_t db:  1;    /* 10; Bit 54 */
         uint16_t g:   1;    /* 11; Bit 55 */
+        uint16_t pad: 4;
     } fields;
 } __attribute__ ((packed)) segment_attributes_t;
 
index 5190d789dc97e850985dd1860fbe62bf8e4b4236..3054f2e271723d6a61063474750a7a3215c502c0 100644 (file)
@@ -16,6 +16,7 @@ obj-y += sched_sedf.o
 obj-y += schedule.o
 obj-y += shutdown.o
 obj-y += softirq.o
+obj-y += spinlock.o
 obj-y += stop_machine.o
 obj-y += string.o
 obj-y += symbols.o
@@ -41,13 +42,3 @@ subdir-$(x86_64) += hvm
 subdir-$(ia64) += hvm
 
 subdir-y += libelf
-
-# Object file contains changeset and compiler information.
-version.o: $(BASEDIR)/include/xen/compile.h
-
-ifeq ($(CONFIG_COMPAT),y)
-# extra dependencies
-grant_table.o: compat/grant_table.c
-schedule.o: compat/schedule.c
-xenoprof.o: compat/xenoprof.c
-endif
index dfab7bc77cf62f4eadea950434c5c19bd806ae1d..9a36a3dcd3576d3469db0b8bb2dce4df29e855ee 100644 (file)
@@ -3,7 +3,3 @@ obj-y += kernel.o
 obj-y += memory.o
 obj-y += multicall.o
 obj-y += xlat.o
-
-# extra dependencies
-kernel.o:      ../kernel.c
-multicall.o:   ../multicall.c
index 17519f5688fee843827692d9847be795e638b04c..779cad9f2609c8bdaee4b7b5986d5332d95439cc 100644 (file)
@@ -19,12 +19,10 @@ int compat_memory_op(unsigned int cmd, XEN_GUEST_HANDLE(void) compat)
             XEN_GUEST_HANDLE(void) hnd;
             struct xen_memory_reservation *rsrv;
             struct xen_memory_exchange *xchg;
-            struct xen_translate_gpfn_list *xlat;
         } nat;
         union {
             struct compat_memory_reservation rsrv;
             struct compat_memory_exchange xchg;
-            struct compat_translate_gpfn_list xlat;
         } cmp;
 
         set_xen_guest_handle(nat.hnd, (void *)COMPAT_ARG_XLAT_VIRT_BASE);
@@ -182,52 +180,6 @@ int compat_memory_op(unsigned int cmd, XEN_GUEST_HANDLE(void) compat)
             nat.hnd = compat;
             break;
 
-        case XENMEM_translate_gpfn_list:
-            if ( copy_from_guest(&cmp.xlat, compat, 1) )
-                return -EFAULT;
-
-            /* Is size too large for us to encode a continuation? */
-            if ( cmp.xlat.nr_gpfns > (UINT_MAX >> MEMOP_EXTENT_SHIFT) )
-                return -EINVAL;
-
-            if ( !compat_handle_okay(cmp.xlat.gpfn_list, cmp.xlat.nr_gpfns) ||
-                 !compat_handle_okay(cmp.xlat.mfn_list,  cmp.xlat.nr_gpfns) )
-                return -EFAULT;
-
-            end_extent = start_extent + (COMPAT_ARG_XLAT_SIZE - sizeof(*nat.xlat)) /
-                                        sizeof(*space);
-            if ( end_extent > cmp.xlat.nr_gpfns )
-                end_extent = cmp.xlat.nr_gpfns;
-
-            space = (xen_pfn_t *)(nat.xlat + 1);
-            /* Code below depends upon .gpfn_list preceding .mfn_list. */
-            BUILD_BUG_ON(offsetof(xen_translate_gpfn_list_t, gpfn_list) > offsetof(xen_translate_gpfn_list_t, mfn_list));
-#define XLAT_translate_gpfn_list_HNDL_gpfn_list(_d_, _s_) \
-            do \
-            { \
-                set_xen_guest_handle((_d_)->gpfn_list, space - start_extent); \
-                for ( i = start_extent; i < end_extent; ++i ) \
-                { \
-                    compat_pfn_t pfn; \
-                    if ( __copy_from_compat_offset(&pfn, (_s_)->gpfn_list, i, 1) ) \
-                        return -EFAULT; \
-                    *space++ = pfn; \
-                } \
-            } while (0)
-#define XLAT_translate_gpfn_list_HNDL_mfn_list(_d_, _s_) \
-            (_d_)->mfn_list = (_d_)->gpfn_list
-            XLAT_translate_gpfn_list(nat.xlat, &cmp.xlat);
-#undef XLAT_translate_gpfn_list_HNDL_mfn_list
-#undef XLAT_translate_gpfn_list_HNDL_gpfn_list
-
-            if ( end_extent < cmp.xlat.nr_gpfns )
-            {
-                nat.xlat->nr_gpfns = end_extent;
-                ++split;
-            }
-
-            break;
-
         default:
             return compat_arch_memory_op(cmd, compat);
         }
@@ -335,27 +287,6 @@ int compat_memory_op(unsigned int cmd, XEN_GUEST_HANDLE(void) compat)
         case XENMEM_maximum_gpfn:
             break;
 
-        case XENMEM_translate_gpfn_list:
-            if ( split < 0 )
-                end_extent = cmd >> MEMOP_EXTENT_SHIFT;
-            else
-                BUG_ON(rc);
-
-            for ( ; start_extent < end_extent; ++start_extent )
-            {
-                compat_pfn_t pfn = nat.xlat->mfn_list.p[start_extent];
-
-                BUG_ON(pfn != nat.xlat->mfn_list.p[start_extent]);
-                if ( __copy_to_compat_offset(cmp.xlat.mfn_list, start_extent, &pfn, 1) )
-                {
-                    if ( split < 0 )
-                        /* Cannot cancel the continuation... */
-                        domain_crash(current->domain);
-                    return -EFAULT;
-                }
-            }
-            break;
-
         default:
             domain_crash(current->domain);
             split = 0;
index 7e1f0ebe28a860df86e32cff36c36742e63d2718..187735b18ce2e24da67a873837b003841cd8fe77 100644 (file)
 #include <xen/percpu.h>
 #include <xen/multicall.h>
 #include <xen/rcupdate.h>
+#include <acpi/cpufreq/cpufreq.h>
 #include <asm/debugger.h>
 #include <public/sched.h>
 #include <public/vcpu.h>
 #include <xsm/xsm.h>
+#include <xen/trace.h>
 
 /* Linux config option: propageted to domain0 */
 /* xen_processor_pmbits: xen control Cx, Px, ... */
-unsigned int xen_processor_pmbits = 0;
+unsigned int xen_processor_pmbits = XEN_PROCESSOR_PM_PX;
 
 /* opt_dom0_vcpus_pin: If true, dom0 VCPUs are pinned. */
 static unsigned int opt_dom0_vcpus_pin;
 boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin);
 
-enum cpufreq_controller cpufreq_controller;
+/* set xen as default cpufreq */
+enum cpufreq_controller cpufreq_controller = FREQCTL_xen;
+
 static void __init setup_cpufreq_option(char *str)
 {
+    char *arg;
+
     if ( !strcmp(str, "dom0-kernel") )
     {
         xen_processor_pmbits &= ~XEN_PROCESSOR_PM_PX;
         cpufreq_controller = FREQCTL_dom0_kernel;
         opt_dom0_vcpus_pin = 1;
+        return;
     }
-    else if ( !strcmp(str, "xen") )
+
+    if ( !strcmp(str, "none") )
     {
-        xen_processor_pmbits |= XEN_PROCESSOR_PM_PX;
-        cpufreq_controller = FREQCTL_xen;
+        xen_processor_pmbits &= ~XEN_PROCESSOR_PM_PX;
+        cpufreq_controller = FREQCTL_none;
+        return;
     }
+
+    if ( (arg = strpbrk(str, ",:")) != NULL )
+        *arg++ = '\0';
+
+    if ( !strcmp(str, "xen") )
+        if ( arg && *arg )
+            cpufreq_cmdline_parse(arg);
 }
 custom_param("cpufreq", setup_cpufreq_option);
 
@@ -73,16 +89,6 @@ int current_domain_id(void)
     return current->domain->domain_id;
 }
 
-static struct domain *alloc_domain_struct(void)
-{
-    return xmalloc(struct domain);
-}
-
-static void free_domain_struct(struct domain *d)
-{
-    xfree(d);
-}
-
 static void __domain_finalise_shutdown(struct domain *d)
 {
     struct vcpu *v;
@@ -134,13 +140,16 @@ struct vcpu *alloc_vcpu(
     v->domain = d;
     v->vcpu_id = vcpu_id;
 
-    v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
-    v->runstate.state_entry_time = NOW();
-
     spin_lock_init(&v->virq_lock);
 
-    if ( !is_idle_domain(d) )
+    if ( is_idle_domain(d) )
+    {
+        v->runstate.state = RUNSTATE_running;
+    }
+    else
     {
+        v->runstate.state = RUNSTATE_offline;        
+        v->runstate.state_entry_time = NOW();
         set_bit(_VPF_down, &v->pause_flags);
         v->vcpu_info = (void *)&shared_info(d, vcpu_info[vcpu_id]);
     }
@@ -211,8 +220,8 @@ struct domain *domain_create(
     spin_lock_init(&d->page_alloc_lock);
     spin_lock_init(&d->shutdown_lock);
     spin_lock_init(&d->hypercall_deadlock_mutex);
-    INIT_LIST_HEAD(&d->page_list);
-    INIT_LIST_HEAD(&d->xenpage_list);
+    INIT_PAGE_LIST_HEAD(&d->page_list);
+    INIT_PAGE_LIST_HEAD(&d->xenpage_list);
 
     if ( domcr_flags & DOMCRF_hvm )
         d->is_hvm = 1;
@@ -457,7 +466,9 @@ void domain_shutdown(struct domain *d, u8 reason)
 
     for_each_vcpu ( d, v )
     {
-        if ( v->defer_shutdown )
+        if ( reason == SHUTDOWN_crash )
+            v->defer_shutdown = 0;
+        else if ( v->defer_shutdown )
             continue;
         vcpu_pause_nosync(v);
         v->paused_for_shutdown = 1;
@@ -545,14 +556,17 @@ static void complete_domain_destroy(struct rcu_head *head)
         sched_destroy_vcpu(v);
     }
 
-    rangeset_domain_destroy(d);
-
     grant_table_destroy(d);
 
     arch_domain_destroy(d);
 
+    rangeset_domain_destroy(d);
+
     sched_destroy_domain(d);
 
+    /* Free page used by xen oprofile buffer. */
+    free_xenoprof_pages(d);
+
     for ( i = MAX_VIRT_CPUS-1; i >= 0; i-- )
         if ( (v = d->vcpu[i]) != NULL )
             free_vcpu_struct(v);
@@ -582,6 +596,7 @@ void domain_destroy(struct domain *d)
         return;
 
     /* Delete from task list and task hashtable. */
+    TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id);
     spin_lock(&domlist_update_lock);
     pd = &domain_list;
     while ( *pd != d ) 
index f4787b22e3a5a74b965dbb6ed1572d8ff7969986..23c2f4e529cf0e09a27112a1917b21134e218dec 100644 (file)
@@ -242,13 +242,15 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
         if ( (c.nat = xmalloc(struct vcpu_guest_context)) == NULL )
             goto svc_out;
 
-        if ( !IS_COMPAT(v->domain) )
-            ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1);
 #ifdef CONFIG_COMPAT
+        if ( !is_pv_32on64_vcpu(v) )
+            ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1);
         else
             ret = copy_from_guest(c.cmp,
                                   guest_handle_cast(op->u.vcpucontext.ctxt,
                                                     void), 1);
+#else
+        ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1);
 #endif
         ret = ret ? -EFAULT : 0;
 
@@ -339,7 +341,8 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
         ret = -EINVAL;
         if ( supervisor_mode_kernel ||
              (op->u.createdomain.flags &
-             ~(XEN_DOMCTL_CDF_hvm_guest | XEN_DOMCTL_CDF_hap)) )
+             ~(XEN_DOMCTL_CDF_hvm_guest | XEN_DOMCTL_CDF_hap |
+               XEN_DOMCTL_CDF_s3_integrity)) )
             break;
 
         dom = op->domain;
@@ -371,6 +374,8 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
             domcr_flags |= DOMCRF_hvm;
         if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_hap )
             domcr_flags |= DOMCRF_hap;
+        if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_s3_integrity )
+            domcr_flags |= DOMCRF_s3_integrity;
 
         ret = -ENOMEM;
         d = domain_create(dom, domcr_flags, op->u.createdomain.ssidref);
@@ -428,7 +433,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
 
             cpu = (i == 0) ?
                 default_vcpu0_location() :
-                (d->vcpu[i-1]->processor + 1) % num_online_cpus();
+                cycle_cpu(d->vcpu[i-1]->processor, cpu_online_map);
 
             if ( alloc_vcpu(d, i, cpu) == NULL )
                 goto maxvcpu_out;
@@ -593,12 +598,14 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
         if ( v != current )
             vcpu_unpause(v);
 
-        if ( !IS_COMPAT(v->domain) )
-            ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1);
 #ifdef CONFIG_COMPAT
+        if ( !is_pv_32on64_vcpu(v) )
+            ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1);
         else
             ret = copy_to_guest(guest_handle_cast(op->u.vcpucontext.ctxt,
                                                   void), c.cmp, 1);
+#else
+        ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1);
 #endif
 
         if ( copy_to_guest(u_domctl, op, 1) || ret )
index 8e3e0d1f64d81fddd68f35ddb01cd176d0fd13e1..3f36d09e831754aaef6331fb14b79f572747e1a7 100644 (file)
@@ -386,7 +386,7 @@ static long __evtchn_close(struct domain *d1, int port1)
             if ( v->virq_to_evtchn[chn1->u.virq] != port1 )
                 continue;
             v->virq_to_evtchn[chn1->u.virq] = 0;
-            spin_barrier(&v->virq_lock);
+            spin_barrier_irq(&v->virq_lock);
         }
         break;
 
@@ -548,7 +548,7 @@ static int evtchn_set_pending(struct vcpu *v, int port)
         return 1;
 
     if ( !test_bit        (port, &shared_info(d, evtchn_mask)) &&
-         !test_and_set_bit(port / BITS_PER_GUEST_LONG(d),
+         !test_and_set_bit(port / BITS_PER_EVTCHN_WORD(d),
                            &vcpu_info(v, evtchn_pending_sel)) )
     {
         vcpu_mark_events_pending(v);
@@ -762,10 +762,9 @@ long evtchn_bind_vcpu(unsigned int port, unsigned int vcpu_id)
 }
 
 
-static long evtchn_unmask(evtchn_unmask_t *unmask)
+int evtchn_unmask(unsigned int port)
 {
     struct domain *d = current->domain;
-    int            port = unmask->port;
     struct vcpu   *v;
 
     spin_lock(&d->event_lock);
@@ -784,7 +783,7 @@ static long evtchn_unmask(evtchn_unmask_t *unmask)
      */
     if ( test_and_clear_bit(port, &shared_info(d, evtchn_mask)) &&
          test_bit          (port, &shared_info(d, evtchn_pending)) &&
-         !test_and_set_bit (port / BITS_PER_GUEST_LONG(d),
+         !test_and_set_bit (port / BITS_PER_EVTCHN_WORD(d),
                             &vcpu_info(v, evtchn_pending_sel)) )
     {
         vcpu_mark_events_pending(v);
@@ -916,7 +915,7 @@ long do_event_channel_op(int cmd, XEN_GUEST_HANDLE(void) arg)
         struct evtchn_unmask unmask;
         if ( copy_from_guest(&unmask, arg, 1) != 0 )
             return -EFAULT;
-        rc = evtchn_unmask(&unmask);
+        rc = evtchn_unmask(unmask.port);
         break;
     }
 
index afd03fec09d592fe80d73133de196bb02730f6d7..1c32772a88b50547fd3438029eb473bd10f13399 100644 (file)
@@ -195,7 +195,7 @@ static void
 __gnttab_map_grant_ref(
     struct gnttab_map_grant_ref *op)
 {
-    struct domain *ld, *rd;
+    struct domain *ld, *rd, *owner;
     struct vcpu   *led;
     int            handle;
     unsigned long  frame = 0, nr_gets = 0;
@@ -317,6 +317,7 @@ __gnttab_map_grant_ref(
         if ( !act->pin )
         {
             act->domid = scombo.shorts.domid;
+            act->gfn = sha->frame;
             act->frame = gmfn_to_mfn(rd, sha->frame);
         }
     }
@@ -335,8 +336,13 @@ __gnttab_map_grant_ref(
 
     spin_unlock(&rd->grant_table->lock);
 
-    if ( is_iomem_page(frame) )
+    if ( !mfn_valid(frame) ||
+         (owner = page_get_owner_and_reference(mfn_to_page(frame))) == dom_io )
     {
+        /* Only needed the reference to confirm dom_io ownership. */
+        if ( mfn_valid(frame) )
+            put_page(mfn_to_page(frame));
+
         if ( !iomem_access_permitted(rd, frame, frame) )
         {
             gdprintk(XENLOG_WARNING,
@@ -351,20 +357,11 @@ __gnttab_map_grant_ref(
         if ( rc != GNTST_okay )
             goto undo_out;
     }
-    else
+    else if ( owner == rd )
     {
-        if ( unlikely(!mfn_valid(frame)) ||
-             unlikely(!(gnttab_host_mapping_get_page_type(op, ld, rd) ?
-                        get_page_and_type(mfn_to_page(frame), rd,
-                                          PGT_writable_page) :
-                        get_page(mfn_to_page(frame), rd))) )
-        {
-            if ( !rd->is_dying )
-                gdprintk(XENLOG_WARNING, "Could not pin grant frame %lx\n",
-                         frame);
-            rc = GNTST_general_error;
-            goto undo_out;
-        }
+        if ( gnttab_host_mapping_get_page_type(op, ld, rd) &&
+             !get_page_type(mfn_to_page(frame), PGT_writable_page) )
+            goto could_not_pin;
 
         nr_gets++;
         if ( op->flags & GNTMAP_host_map )
@@ -382,6 +379,17 @@ __gnttab_map_grant_ref(
             }
         }
     }
+    else
+    {
+    could_not_pin:
+        if ( !rd->is_dying )
+            gdprintk(XENLOG_WARNING, "Could not pin grant frame %lx\n",
+                     frame);
+        if ( owner != NULL )
+            put_page(mfn_to_page(frame));
+        rc = GNTST_general_error;
+        goto undo_out;
+    }
 
     if ( need_iommu(ld) &&
          !(old_pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) &&
@@ -1191,7 +1199,7 @@ gnttab_transfer(
         /* Okay, add the page to 'e'. */
         if ( unlikely(e->tot_pages++ == 0) )
             get_knownalive_domain(e);
-        list_add_tail(&page->list, &e->page_list);
+        page_list_add_tail(page, &e->page_list);
         page_set_owner(page, e);
 
         spin_unlock(&e->page_alloc_lock);
@@ -1335,6 +1343,7 @@ __acquire_grant_for_copy(
         if ( !act->pin )
         {
             act->domid = scombo.shorts.domid;
+            act->gfn = sha->frame;
             act->frame = gmfn_to_mfn(rd, sha->frame);
         }
     }
index cb3cf6688567d4e50899fcdcc458db33567282f8..66876e6a1c3bba2634684a089a1a0dd742237d73 100644 (file)
@@ -26,6 +26,7 @@
 #include <xen/version.h>
 #include <public/version.h>
 #include <xen/sched.h>
+#include <xen/guest_access.h>
 
 #include <asm/hvm/support.h>
 
@@ -75,6 +76,53 @@ size_t hvm_save_size(struct domain *d)
     return sz;
 }
 
+/* Extract a single instance of a save record, by marshalling all
+ * records of that type and copying out the one we need. */
+int hvm_save_one(struct domain *d, uint16_t typecode, uint16_t instance, 
+                 XEN_GUEST_HANDLE_64(uint8) handle)
+{
+    int rv = 0;
+    size_t sz = 0;
+    struct vcpu *v;
+    hvm_domain_context_t ctxt = { 0, };
+
+    if ( d->is_dying 
+         || typecode > HVM_SAVE_CODE_MAX 
+         || hvm_sr_handlers[typecode].size < sizeof(struct hvm_save_descriptor)
+         || hvm_sr_handlers[typecode].save == NULL )
+        return -EINVAL;
+
+    if ( hvm_sr_handlers[typecode].kind == HVMSR_PER_VCPU )
+        for_each_vcpu(d, v)
+            sz += hvm_sr_handlers[typecode].size;
+    else 
+        sz = hvm_sr_handlers[typecode].size;
+    
+    if ( (instance + 1) * hvm_sr_handlers[typecode].size > sz )
+        return -EINVAL;
+
+    ctxt.size = sz;
+    ctxt.data = xmalloc_bytes(sz);
+    if ( !ctxt.data )
+        return -ENOMEM;
+
+    if ( hvm_sr_handlers[typecode].save(d, &ctxt) != 0 )
+    {
+        gdprintk(XENLOG_ERR, 
+                 "HVM save: failed to save type %"PRIu16"\n", typecode);
+        rv = -EFAULT;
+    }
+    else if ( copy_to_guest(handle,
+                            ctxt.data 
+                            + (instance * hvm_sr_handlers[typecode].size) 
+                            + sizeof (struct hvm_save_descriptor), 
+                            hvm_sr_handlers[typecode].size
+                            - sizeof (struct hvm_save_descriptor)) )
+        rv = -EFAULT;
+
+    xfree(ctxt.data);
+    return rv;
+}
 
 int hvm_save(struct domain *d, hvm_domain_context_t *h)
 {
diff --git a/xen/common/inflate.c b/xen/common/inflate.c
new file mode 100644 (file)
index 0000000..999a33b
--- /dev/null
@@ -0,0 +1,1303 @@
+#define DEBG(x)
+#define DEBG1(x)
+/* inflate.c -- Not copyrighted 1992 by Mark Adler
+   version c10p1, 10 January 1993 */
+
+/* 
+ * Adapted for booting Linux by Hannu Savolainen 1993
+ * based on gzip-1.0.3 
+ *
+ * Nicolas Pitre <nico@cam.org>, 1999/04/14 :
+ *   Little mods for all variable to reside either into rodata or bss segments
+ *   by marking constant variables with 'const' and initializing all the others
+ *   at run-time only.  This allows for the kernel uncompressor to run
+ *   directly from Flash or ROM memory on embedded systems.
+ */
+
+/*
+   Inflate deflated (PKZIP's method 8 compressed) data.  The compression
+   method searches for as much of the current string of bytes (up to a
+   length of 258) in the previous 32 K bytes.  If it doesn't find any
+   matches (of at least length 3), it codes the next byte.  Otherwise, it
+   codes the length of the matched string and its distance backwards from
+   the current position.  There is a single Huffman code that codes both
+   single bytes (called "literals") and match lengths.  A second Huffman
+   code codes the distance information, which follows a length code.  Each
+   length or distance code actually represents a base value and a number
+   of "extra" (sometimes zero) bits to get to add to the base value.  At
+   the end of each deflated block is a special end-of-block (EOB) literal/
+   length code.  The decoding process is basically: get a literal/length
+   code; if EOB then done; if a literal, emit the decoded byte; if a
+   length then get the distance and emit the referred-to bytes from the
+   sliding window of previously emitted data.
+
+   There are (currently) three kinds of inflate blocks: stored, fixed, and
+   dynamic.  The compressor deals with some chunk of data at a time, and
+   decides which method to use on a chunk-by-chunk basis.  A chunk might
+   typically be 32 K or 64 K.  If the chunk is incompressible, then the
+   "stored" method is used.  In this case, the bytes are simply stored as
+   is, eight bits per byte, with none of the above coding.  The bytes are
+   preceded by a count, since there is no longer an EOB code.
+
+   If the data is compressible, then either the fixed or dynamic methods
+   are used.  In the dynamic method, the compressed data is preceded by
+   an encoding of the literal/length and distance Huffman codes that are
+   to be used to decode this block.  The representation is itself Huffman
+   coded, and so is preceded by a description of that code.  These code
+   descriptions take up a little space, and so for small blocks, there is
+   a predefined set of codes, called the fixed codes.  The fixed method is
+   used if the block codes up smaller that way (usually for quite small
+   chunks), otherwise the dynamic method is used.  In the latter case, the
+   codes are customized to the probabilities in the current block, and so
+   can code it much better than the pre-determined fixed codes.
+   The Huffman codes themselves are decoded using a multi-level table
+   lookup, in order to maximize the speed of decoding plus the speed of
+   building the decoding tables.  See the comments below that precede the
+   lbits and dbits tuning parameters.
+ */
+
+
+/*
+   Notes beyond the 1.93a appnote.txt:
+
+   1. Distance pointers never point before the beginning of the output
+      stream.
+   2. Distance pointers can point back across blocks, up to 32k away.
+   3. There is an implied maximum of 7 bits for the bit length table and
+      15 bits for the actual data.
+   4. If only one code exists, then it is encoded using one bit.  (Zero
+      would be more efficient, but perhaps a little confusing.)  If two
+      codes exist, they are coded using one bit each (0 and 1).
+   5. There is no way of sending zero distance codes--a dummy must be
+      sent if there are none.  (History: a pre 2.0 version of PKZIP would
+      store blocks with no distance codes, but this was discovered to be
+      too harsh a criterion.)  Valid only for 1.93a.  2.04c does allow
+      zero distance codes, which is sent as one code of zero bits in
+      length.
+   6. There are up to 286 literal/length codes.  Code 256 represents the
+      end-of-block.  Note however that the static length tree defines
+      288 codes just to fill out the Huffman codes.  Codes 286 and 287
+      cannot be used though, since there is no length base or extra bits
+      defined for them.  Similarly, there are up to 30 distance codes.
+      However, static trees define 32 codes (all 5 bits) to fill out the
+      Huffman codes, but the last two had better not show up in the data.
+   7. Unzip can check dynamic Huffman blocks for complete code sets.
+      The exception is that a single code would not be complete (see #4).
+   8. The five bits following the block type is really the number of
+      literal codes sent minus 257.
+   9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits
+      (1+6+6).  Therefore, to output three times the length, you output
+      three codes (1+1+1), whereas to output four times the same length,
+      you only need two codes (1+3).  Hmm.
+  10. In the tree reconstruction algorithm, Code = Code + Increment
+      only if BitLength(i) is not zero.  (Pretty obvious.)
+  11. Correction: 4 Bits: # of Bit Length codes - 4     (4 - 19)
+  12. Note: length code 284 can represent 227-258, but length code 285
+      really is 258.  The last length deserves its own, short code
+      since it gets used a lot in very redundant files.  The length
+      258 is special since 258 - 3 (the min match length) is 255.
+  13. The literal/length and distance code bit lengths are read as a
+      single stream of lengths.  It is possible (and advantageous) for
+      a repeat code (16, 17, or 18) to go across the boundary between
+      the two sets of lengths.
+ */
+
+#ifdef RCSID
+static char rcsid[] = "#Id: inflate.c,v 0.14 1993/06/10 13:27:04 jloup Exp #";
+#endif
+
+#ifndef STATIC
+
+#if defined(STDC_HEADERS) || defined(HAVE_STDLIB_H)
+#  include <sys/types.h>
+#  include <stdlib.h>
+#endif
+
+#include "gzip.h"
+#define STATIC
+#endif /* !STATIC */
+
+#ifndef INIT
+#define INIT
+#endif
+#define slide window
+
+/* Huffman code lookup table entry--this entry is four bytes for machines
+   that have 16-bit pointers (e.g. PC's in the small or medium model).
+   Valid extra bits are 0..13.  e == 15 is EOB (end of block), e == 16
+   means that v is a literal, 16 < e < 32 means that v is a pointer to
+   the next table, which codes e - 16 bits, and lastly e == 99 indicates
+   an unused code.  If a code with e == 99 is looked up, this implies an
+   error in the data. */
+struct huft {
+    uch e;                /* number of extra bits or operation */
+    uch b;                /* number of bits in this code or subcode */
+    union {               /* interpretation is selected by the value of e */
+        ush n;              /* literal, length base, or distance base */
+        struct huft *t;     /* pointer to next level of table */
+    } v;
+};
+
+
+/* Function prototypes */
+STATIC int INIT huft_build OF((unsigned *, unsigned, unsigned, 
+                               const ush *, const ush *, struct huft **, int *));
+STATIC int INIT huft_free OF((struct huft *));
+STATIC int INIT inflate_codes OF((struct huft *, struct huft *, int, int));
+STATIC int INIT inflate_stored OF((void));
+STATIC int INIT inflate_fixed OF((void));
+STATIC int INIT inflate_dynamic OF((void));
+STATIC int INIT inflate_block OF((int *));
+STATIC int INIT inflate OF((void));
+
+
+/* The inflate algorithm uses a sliding 32 K byte window on the uncompressed
+   stream to find repeated byte strings.  This is implemented here as a
+   circular buffer.  The index is updated simply by incrementing and then
+   ANDing with 0x7fff (32K-1). */
+/* It is left to other modules to supply the 32 K area.  It is assumed
+   to be usable as if it were declared "uch slide[32768];" or as just
+   "uch *slide;" and then malloc'ed in the latter case.  The definition
+   must be in unzip.h, included above. */
+/* unsigned wp;             current position in slide */
+#define wp outcnt
+#define flush_output(w) (wp=(w),flush_window())
+
+/* Tables for deflate from PKZIP's appnote.txt. */
+static const unsigned border[] = {    /* Order of the bit length code lengths */
+    16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+static const ush cplens[] = {         /* Copy lengths for literal codes 257..285 */
+    3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
+    35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
+/* note: see note #13 above about the 258 in this list. */
+static const ush cplext[] = {         /* Extra bits for literal codes 257..285 */
+    0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
+    3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 99, 99}; /* 99==invalid */
+static const ush cpdist[] = {         /* Copy offsets for distance codes 0..29 */
+    1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
+    257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
+    8193, 12289, 16385, 24577};
+static const ush cpdext[] = {         /* Extra bits for distance codes */
+    0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+    7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
+    12, 12, 13, 13};
+
+
+
+/* Macros for inflate() bit peeking and grabbing.
+   The usage is:
+   
+        NEEDBITS(j)
+        x = b & mask_bits[j];
+        DUMPBITS(j)
+
+   where NEEDBITS makes sure that b has at least j bits in it, and
+   DUMPBITS removes the bits from b.  The macros use the variable k
+   for the number of bits in b.  Normally, b and k are register
+   variables for speed, and are initialized at the beginning of a
+   routine that uses these macros from a global bit buffer and count.
+
+   If we assume that EOB will be the longest code, then we will never
+   ask for bits with NEEDBITS that are beyond the end of the stream.
+   So, NEEDBITS should not read any more bytes than are needed to
+   meet the request.  Then no bytes need to be "returned" to the buffer
+   at the end of the last block.
+
+   However, this assumption is not true for fixed blocks--the EOB code
+   is 7 bits, but the other literal/length codes can be 8 or 9 bits.
+   (The EOB code is shorter than other codes because fixed blocks are
+   generally short.  So, while a block always has an EOB, many other
+   literal/length codes have a significantly lower probability of
+   showing up at all.)  However, by making the first table have a
+   lookup of seven bits, the EOB code will be found in that first
+   lookup, and so will not require that too many bits be pulled from
+   the stream.
+ */
+
+STATIC ulg bb;                         /* bit buffer */
+STATIC unsigned bk;                    /* bits in bit buffer */
+
+STATIC const ush mask_bits[] = {
+    0x0000,
+    0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff,
+    0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff
+};
+
+#define NEXTBYTE()  ({ int v = get_byte(); if (v < 0) goto underrun; (uch)v; })
+#define NEEDBITS(n) {while(k<(n)){b|=((ulg)NEXTBYTE())<<k;k+=8;}}
+#define DUMPBITS(n) {b>>=(n);k-=(n);}
+
+#ifndef NO_INFLATE_MALLOC
+/* A trivial malloc implementation, adapted from
+ *  malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
+ */
+
+static unsigned long malloc_ptr;
+static int malloc_count;
+
+/* Trivial bump allocator: carves 4-byte-aligned chunks upwards from
+ * free_mem_ptr.  Individual blocks are never reclaimed; see free(). */
+static void *malloc(int size)
+{
+    void *p;
+
+    if (size < 0)
+        error("Malloc error");
+    if (!malloc_ptr)
+        malloc_ptr = free_mem_ptr;      /* lazy init on first call */
+
+    malloc_ptr = (malloc_ptr + 3) & ~3;     /* Align */
+
+    p = (void *)malloc_ptr;
+    malloc_ptr += size;
+
+    /* A zero free_mem_end_ptr disables the bounds check entirely. */
+    if (free_mem_end_ptr && malloc_ptr >= free_mem_end_ptr)
+        error("Out of memory");
+
+    malloc_count++;
+    return p;
+}
+
+/* "Free" for the bump allocator: no per-block reclamation.  Only when
+ * the outstanding-allocation count drops to zero is the whole arena
+ * rewound to its start. */
+static void free(void *where)
+{
+    malloc_count--;
+    if (!malloc_count)
+        malloc_ptr = free_mem_ptr;      /* everything freed: rewind arena */
+}
+#else
+#define malloc(a) kmalloc(a, GFP_KERNEL)
+#define free(a) kfree(a)
+#endif
+
+/*
+   Huffman code decoding is performed using a multi-level table lookup.
+   The fastest way to decode is to simply build a lookup table whose
+   size is determined by the longest code.  However, the time it takes
+   to build this table can also be a factor if the data being decoded
+   is not very long.  The most common codes are necessarily the
+   shortest codes, so those codes dominate the decoding time, and hence
+   the speed.  The idea is you can have a shorter table that decodes the
+   shorter, more probable codes, and then point to subsidiary tables for
+   the longer codes.  The time it costs to decode the longer codes is
+   then traded against the time it takes to make longer tables.
+
+   This results of this trade are in the variables lbits and dbits
+   below.  lbits is the number of bits the first level table for literal/
+   length codes can decode in one step, and dbits is the same thing for
+   the distance codes.  Subsequent tables are also less than or equal to
+   those sizes.  These values may be adjusted either when all of the
+   codes are shorter than that, in which case the longest code length in
+   bits is used, or when the shortest code is *longer* than the requested
+   table size, in which case the length of the shortest code in bits is
+   used.
+
+   There are two different values for the two tables, since they code a
+   different number of possibilities each.  The literal/length table
+   codes 286 possible values, or in a flat code, a little over eight
+   bits.  The distance table codes 30 possible values, or a little less
+   than five bits, flat.  The optimum values for speed end up being
+   about one bit more than those, so lbits is 8+1 and dbits is 5+1.
+   The optimum values may differ though from machine to machine, and
+   possibly even between compilers.  Your mileage may vary.
+ */
+
+
+STATIC const int lbits = 9;          /* bits in base literal/length lookup table */
+STATIC const int dbits = 6;          /* bits in base distance lookup table */
+
+
+/* If BMAX needs to be larger than 16, then h and x[] should be ulg. */
+#define BMAX 16         /* maximum bit length of any code (16 for explode) */
+#define N_MAX 288       /* maximum number of codes in any set */
+
+
+STATIC unsigned hufts;         /* track memory usage */
+
+
+STATIC int INIT huft_build(
+    unsigned *b,            /* code lengths in bits (all assumed <= BMAX) */
+    unsigned n,             /* number of codes (assumed <= N_MAX) */
+    unsigned s,             /* number of simple-valued codes (0..s-1) */
+    const ush *d,           /* list of base values for non-simple codes */
+    const ush *e,           /* list of extra bits for non-simple codes */
+    struct huft **t,        /* result: starting table */
+    int *m                  /* maximum lookup bits, returns actual */
+    )
+/* Given a list of code lengths and a maximum table size, make a set of
+   tables to decode that set of codes.  Return zero on success, one if
+   the given code set is incomplete (the tables are still built in this
+   case), two if the input is invalid (all zero length codes or an
+   oversubscribed set of lengths), and three if not enough memory. */
+{
+    unsigned a;                   /* counter for codes of length k */
+    unsigned f;                   /* i repeats in table every f entries */
+    int g;                        /* maximum code length */
+    int h;                        /* table level */
+    register unsigned i;          /* counter, current code */
+    register unsigned j;          /* counter */
+    register int k;               /* number of bits in current code */
+    int l;                        /* bits per table (returned in m) */
+    register unsigned *p;         /* pointer into c[], b[], or v[] */
+    register struct huft *q;      /* points to current table */
+    struct huft r;                /* table entry for structure assignment */
+    register int w;               /* bits before this table == (l * h) */
+    unsigned *xp;                 /* pointer into x */
+    int y;                        /* number of dummy codes added */
+    unsigned z;                   /* number of entries in current table */
+    struct {
+        unsigned c[BMAX+1];           /* bit length count table */
+        struct huft *u[BMAX];         /* table stack */
+        unsigned v[N_MAX];            /* values in order of bit length */
+        unsigned x[BMAX+1];           /* bit offsets, then code stack */
+    } *stk;
+    unsigned *c, *v, *x;
+    struct huft **u;
+    int ret;
+
+    DEBG("huft1 ");
+
+    /* Scratch arrays are heap-allocated (one struct) to keep stack
+       usage small; everything is freed via the single "out" exit. */
+    stk = malloc(sizeof(*stk));
+    if (stk == NULL)
+        return 3;   /* out of memory */
+
+    c = stk->c;
+    v = stk->v;
+    x = stk->x;
+    u = stk->u;
+
+    /* Generate counts for each bit length */
+    memzero(stk->c, sizeof(stk->c));
+    p = b;  i = n;
+    do {
+        Tracecv(*p, (stderr, (n-i >= ' ' && n-i <= '~' ? "%c %d\n" : "0x%x %d\n"), 
+                     n-i, *p));
+        c[*p]++;                    /* assume all entries <= BMAX */
+        p++;                      /* Can't combine with above line (Solaris bug) */
+    } while (--i);
+    if (c[0] == n)                /* null input--all zero length codes */
+    {
+        *t = (struct huft *)NULL;
+        *m = 0;
+        ret = 2;
+        goto out;
+    }
+
+    DEBG("huft2 ");
+
+    /* Find minimum and maximum length, bound *m by those */
+    l = *m;
+    for (j = 1; j <= BMAX; j++)
+        if (c[j])
+            break;
+    k = j;                        /* minimum code length */
+    if ((unsigned)l < j)
+        l = j;
+    for (i = BMAX; i; i--)
+        if (c[i])
+            break;
+    g = i;                        /* maximum code length */
+    if ((unsigned)l > i)
+        l = i;
+    *m = l;
+
+    DEBG("huft3 ");
+
+    /* Adjust last length count to fill out codes, if needed */
+    for (y = 1 << j; j < i; j++, y <<= 1)
+        if ((y -= c[j]) < 0) {
+            ret = 2;                 /* bad input: more codes than bits */
+            goto out;
+        }
+    if ((y -= c[i]) < 0) {
+        ret = 2;
+        goto out;
+    }
+    c[i] += y;
+
+    DEBG("huft4 ");
+
+    /* Generate starting offsets into the value table for each length */
+    x[1] = j = 0;
+    p = c + 1;  xp = x + 2;
+    while (--i) {                 /* note that i == g from above */
+        *xp++ = (j += *p++);
+    }
+
+    DEBG("huft5 ");
+
+    /* Make a table of values in order of bit lengths */
+    p = b;  i = 0;
+    do {
+        if ((j = *p++) != 0)
+            v[x[j]++] = i;
+    } while (++i < n);
+    n = x[g];                   /* set n to length of v */
+
+    DEBG("h6 ");
+
+    /* Generate the Huffman codes and for each, make the table entries */
+    x[0] = i = 0;                 /* first Huffman code is zero */
+    p = v;                        /* grab values in bit order */
+    h = -1;                       /* no tables yet--level -1 */
+    w = -l;                       /* bits decoded == (l * h) */
+    u[0] = (struct huft *)NULL;   /* just to keep compilers happy */
+    q = (struct huft *)NULL;      /* ditto */
+    z = 0;                        /* ditto */
+    DEBG("h6a ");
+
+    /* go through the bit lengths (k already is bits in shortest code) */
+    for (; k <= g; k++)
+    {
+        DEBG("h6b ");
+        a = c[k];
+        while (a--)
+        {
+            DEBG("h6b1 ");
+            /* here i is the Huffman code of length k bits for value *p */
+            /* make tables up to required level */
+            while (k > w + l)
+            {
+                DEBG1("1 ");
+                h++;
+                w += l;                 /* previous table always l bits */
+
+                /* compute minimum size table less than or equal to l bits */
+                z = (z = g - w) > (unsigned)l ? l : z;  /* upper limit on table size */
+                if ((f = 1 << (j = k - w)) > a + 1)     /* try a k-w bit table */
+                {                       /* too few codes for k-w bit table */
+                    DEBG1("2 ");
+                    f -= a + 1;           /* deduct codes from patterns left */
+                    xp = c + k;
+                    if (j < z)
+                        while (++j < z)       /* try smaller tables up to z bits */
+                        {
+                            if ((f <<= 1) <= *++xp)
+                                break;            /* enough codes to use up j bits */
+                            f -= *xp;           /* else deduct codes from patterns */
+                        }
+                }
+                DEBG1("3 ");
+                z = 1 << j;             /* table entries for j-bit table */
+
+                /* allocate and link in new table; entry 0 is a dummy
+                   link used by huft_free() to chain all tables */
+                if ((q = (struct huft *)malloc((z + 1)*sizeof(struct huft))) ==
+                    (struct huft *)NULL)
+                {
+                    if (h)
+                        huft_free(u[0]);
+                    ret = 3;             /* not enough memory */
+                    goto out;
+                }
+                DEBG1("4 ");
+                hufts += z + 1;         /* track memory usage */
+                *t = q + 1;             /* link to list for huft_free() */
+                *(t = &(q->v.t)) = (struct huft *)NULL;
+                u[h] = ++q;             /* table starts after link */
+
+                DEBG1("5 ");
+                /* connect to last table, if there is one */
+                if (h)
+                {
+                    x[h] = i;             /* save pattern for backing up */
+                    r.b = (uch)l;         /* bits to dump before this table */
+                    r.e = (uch)(16 + j);  /* bits in this table */
+                    r.v.t = q;            /* pointer to this table */
+                    j = i >> (w - l);     /* (get around Turbo C bug) */
+                    u[h-1][j] = r;        /* connect to last table */
+                }
+                DEBG1("6 ");
+            }
+            DEBG("h6c ");
+
+            /* set up table entry in r */
+            r.b = (uch)(k - w);
+            if (p >= v + n)
+                r.e = 99;               /* out of values--invalid code */
+            else if (*p < s)
+            {
+                r.e = (uch)(*p < 256 ? 16 : 15);    /* 256 is end-of-block code */
+                r.v.n = (ush)(*p);             /* simple code is just the value */
+                p++;                           /* one compiler does not like *p++ */
+            }
+            else
+            {
+                r.e = (uch)e[*p - s];   /* non-simple--look up in lists */
+                r.v.n = d[*p++ - s];
+            }
+            DEBG("h6d ");
+
+            /* fill code-like entries with r */
+            f = 1 << (k - w);
+            for (j = i >> w; j < z; j += f)
+                q[j] = r;
+
+            /* backwards increment the k-bit code i */
+            for (j = 1 << (k - 1); i & j; j >>= 1)
+                i ^= j;
+            i ^= j;
+
+            /* backup over finished tables */
+            while ((i & ((1 << w) - 1)) != x[h])
+            {
+                h--;                    /* don't need to update q */
+                w -= l;
+            }
+            DEBG("h6e ");
+        }
+        DEBG("h6f ");
+    }
+
+    DEBG("huft7 ");
+
+    /* Return true (1) if we were given an incomplete table */
+    ret = y != 0 && g != 1;
+
+ out:
+    free(stk);
+    return ret;
+}
+
+
+
+STATIC int INIT huft_free(
+    struct huft *t         /* table to free */
+    )
+/* Free the malloc'ed tables built by huft_build(), which makes a linked
+   list of the tables it made, with the links in a dummy first entry of
+   each table. */
+{
+    register struct huft *p, *q;
+
+
+    /* Go through linked list, freeing from the malloced (t[-1]) address. */
+    p = t;
+    while (p != (struct huft *)NULL)
+    {
+        q = (--p)->v.t;       /* next table, linked via the dummy entry */
+        free((char*)p);
+        p = q;
+    } 
+    return 0;                 /* always succeeds */
+}
+
+
+STATIC int INIT inflate_codes(
+    struct huft *tl,    /* literal/length decoder tables */
+    struct huft *td,    /* distance decoder tables */
+    int bl,             /* number of bits decoded by tl[] */
+    int bd              /* number of bits decoded by td[] */
+    )
+/* inflate (decompress) the codes in a deflated (compressed) block.
+   Return an error code or zero if it all goes ok. */
+/* NOTE(review): the irregular indentation below is an artifact of the
+   multi-statement NEEDBITS()/DUMPBITS() macros.  NEXTBYTE(), called
+   inside NEEDBITS(), jumps to the "underrun" label when input runs
+   out.  Returns: 0 ok, 1 bad/invalid code (e == 99), 4 underrun. */
+{
+    register unsigned e;  /* table entry flag/number of extra bits */
+    unsigned n, d;        /* length and index for copy */
+    unsigned w;           /* current window position */
+    struct huft *t;       /* pointer to table entry */
+    unsigned ml, md;      /* masks for bl and bd bits */
+    register ulg b;       /* bit buffer */
+    register unsigned k;  /* number of bits in bit buffer */
+
+
+    /* make local copies of globals */
+    b = bb;                       /* initialize bit buffer */
+    k = bk;
+    w = wp;                       /* initialize window position */
+
+    /* inflate the coded data */
+    ml = mask_bits[bl];           /* precompute masks for speed */
+    md = mask_bits[bd];
+    for (;;)                      /* do until end of block */
+    {
+        /* decode a literal/length code, walking sub-tables (e > 16) */
+        NEEDBITS((unsigned)bl)
+            if ((e = (t = tl + ((unsigned)b & ml))->e) > 16)
+                do {
+                    if (e == 99)
+                        return 1;
+                    DUMPBITS(t->b)
+                        e -= 16;
+                    NEEDBITS(e)
+                        } while ((e = (t = t->v.t + ((unsigned)b & mask_bits[e]))->e) > 16);
+        DUMPBITS(t->b)
+            if (e == 16)                /* then it's a literal */
+            {
+                slide[w++] = (uch)t->v.n;
+                Tracevv((stderr, "%c", slide[w-1]));
+                if (w == WSIZE)
+                {
+                    flush_output(w);
+                    w = 0;
+                }
+            }
+            else                        /* it's an EOB or a length */
+            {
+                /* exit if end of block */
+                if (e == 15)
+                    break;
+
+                /* get length of block to copy */
+                NEEDBITS(e)
+                    n = t->v.n + ((unsigned)b & mask_bits[e]);
+                DUMPBITS(e);
+
+                /* decode distance of block to copy */
+                NEEDBITS((unsigned)bd)
+                    if ((e = (t = td + ((unsigned)b & md))->e) > 16)
+                        do {
+                            if (e == 99)
+                                return 1;
+                            DUMPBITS(t->b)
+                                e -= 16;
+                            NEEDBITS(e)
+                                } while ((e = (t = t->v.t + ((unsigned)b & mask_bits[e]))->e) > 16);
+                DUMPBITS(t->b)
+                    NEEDBITS(e)
+                    d = w - t->v.n - ((unsigned)b & mask_bits[e]);
+                DUMPBITS(e)
+                    Tracevv((stderr,"\\[%d,%d]", w-d, n));
+
+                /* do the copy */
+                do {
+                    n -= (e = (e = WSIZE - ((d &= WSIZE-1) > w ? d : w)) > n ? n : e);
+#if !defined(NOMEMCPY) && !defined(DEBUG)
+                    if (w - d >= e)         /* (this test assumes unsigned comparison) */
+                    {
+                        memcpy(slide + w, slide + d, e);
+                        w += e;
+                        d += e;
+                    }
+                    else                      /* do it slow to avoid memcpy() overlap */
+#endif /* !NOMEMCPY */
+                        do {
+                            slide[w++] = slide[d++];
+                            Tracevv((stderr, "%c", slide[w-1]));
+                        } while (--e);
+                    if (w == WSIZE)
+                    {
+                        flush_output(w);
+                        w = 0;
+                    }
+                } while (n);
+            }
+    }
+
+
+    /* restore the globals from the locals */
+    wp = w;                       /* restore global window pointer */
+    bb = b;                       /* restore global bit buffer */
+    bk = k;
+
+    /* done */
+    return 0;
+
+ underrun:                        /* reached via NEXTBYTE() in NEEDBITS() */
+    return 4;   /* Input underrun */
+}
+
+
+
+STATIC int INIT inflate_stored(void)
+/* "decompress" an inflated type 0 (stored) block. */
+/* Returns: 0 ok, 1 length/complement mismatch, 4 input underrun
+   (reached via NEXTBYTE() inside NEEDBITS()).  The odd indentation
+   is an artifact of the multi-statement NEEDBITS()/DUMPBITS() macros. */
+{
+    unsigned n;           /* number of bytes in block */
+    unsigned w;           /* current window position */
+    register ulg b;       /* bit buffer */
+    register unsigned k;  /* number of bits in bit buffer */
+
+    DEBG("<stor");
+
+    /* make local copies of globals */
+    b = bb;                       /* initialize bit buffer */
+    k = bk;
+    w = wp;                       /* initialize window position */
+
+
+    /* go to byte boundary */
+    n = k & 7;
+    DUMPBITS(n);
+
+
+    /* get the length and its complement */
+    NEEDBITS(16)
+        n = ((unsigned)b & 0xffff);
+    DUMPBITS(16)
+        NEEDBITS(16)
+        if (n != (unsigned)((~b) & 0xffff))
+            return 1;                   /* error in compressed data */
+    DUMPBITS(16)
+
+
+        /* read and output the compressed data */
+        while (n--)
+        {
+            NEEDBITS(8)
+                slide[w++] = (uch)b;
+            if (w == WSIZE)
+            {
+                flush_output(w);
+                w = 0;
+            }
+            DUMPBITS(8)
+                }
+
+
+    /* restore the globals from the locals */
+    wp = w;                       /* restore global window pointer */
+    bb = b;                       /* restore global bit buffer */
+    bk = k;
+
+    DEBG(">");
+    return 0;
+
+ underrun:
+    return 4;   /* Input underrun */
+}
+
+
+/*
+ * We use `noinline' here to prevent gcc-3.5 from using too much stack space
+ */
+STATIC int noinline INIT inflate_fixed(void)
+/* decompress an inflated type 1 (fixed Huffman codes) block.  We should
+   either replace this with a custom decoder, or at least precompute the
+   Huffman tables.  Returns 0 on success, a huft_build() error (1-3) if
+   a table can't be built, or 1 if inflate_codes() fails. */
+{
+    int i;                /* temporary variable */
+    struct huft *tl;      /* literal/length code table */
+    struct huft *td;      /* distance code table */
+    int bl;               /* lookup bits for tl */
+    int bd;               /* lookup bits for td */
+    unsigned *l;          /* length list for huft_build */
+
+    DEBG("<fix");
+
+    l = malloc(sizeof(*l) * 288);
+    if (l == NULL)
+        return 3;   /* out of memory */
+
+    /* set up literal table (fixed code lengths: 8,9,7,8 runs) */
+    for (i = 0; i < 144; i++)
+        l[i] = 8;
+    for (; i < 256; i++)
+        l[i] = 9;
+    for (; i < 280; i++)
+        l[i] = 7;
+    for (; i < 288; i++)          /* make a complete, but wrong code set */
+        l[i] = 8;
+    bl = 7;
+    if ((i = huft_build(l, 288, 257, cplens, cplext, &tl, &bl)) != 0) {
+        free(l);
+        return i;
+    }
+
+    /* set up distance table */
+    for (i = 0; i < 30; i++)      /* make an incomplete code set */
+        l[i] = 5;
+    bd = 5;
+    if ((i = huft_build(l, 30, 0, cpdist, cpdext, &td, &bd)) > 1)
+    {
+        huft_free(tl);
+        free(l);
+
+        DEBG(">");
+        return i;
+    }
+
+
+    /* decompress until an end-of-block code */
+    if (inflate_codes(tl, td, bl, bd)) {
+        /* Fix: free both Huffman tables on this error path too; the
+         * previous code released only `l', leaking tl and td. */
+        huft_free(tl);
+        huft_free(td);
+        free(l);
+        return 1;
+    }
+
+    /* free the decoding tables, return */
+    free(l);
+    huft_free(tl);
+    huft_free(td);
+    return 0;
+}
+
+
+/*
+ * We use `noinline' here to prevent gcc-3.5 from using too much stack space
+ */
+STATIC int noinline INIT inflate_dynamic(void)
+/* decompress an inflated type 2 (dynamic Huffman codes) block. */
+/* Reads the code-length, literal/length and distance code descriptions
+   from the input bit stream (globals bb/bk, cached locally in b/k),
+   builds the Huffman tables with huft_build() and decodes the block via
+   inflate_codes().  Returns 0 on success, 1 on malformed data or
+   allocation failure, other huft_build() codes for bad code sets, and 4
+   on input underrun (reached through the NEEDBITS macros' goto). */
+{
+    int i;                /* temporary variables */
+    unsigned j;
+    unsigned l;           /* last length */
+    unsigned m;           /* mask for bit lengths table */
+    unsigned n;           /* number of lengths to get */
+    struct huft *tl;      /* literal/length code table */
+    struct huft *td;      /* distance code table */
+    int bl;               /* lookup bits for tl */
+    int bd;               /* lookup bits for td */
+    unsigned nb;          /* number of bit length codes */
+    unsigned nl;          /* number of literal/length codes */
+    unsigned nd;          /* number of distance codes */
+    unsigned *ll;         /* literal/length and distance code lengths */
+    register ulg b;       /* bit buffer */
+    register unsigned k;  /* number of bits in bit buffer */
+    int ret;
+
+    DEBG("<dyn");
+
+#ifdef PKZIP_BUG_WORKAROUND
+    ll = malloc(sizeof(*ll) * (288+32));  /* literal/length and distance code lengths */
+#else
+    ll = malloc(sizeof(*ll) * (286+30));  /* literal/length and distance code lengths */
+#endif
+
+    if (ll == NULL)
+        return 1;
+
+    /* make local bit buffer */
+    b = bb;
+    k = bk;
+
+
+    /* read in table lengths */
+    /* (The odd continuation indentation below is an artifact of the
+       NEEDBITS/DUMPBITS macros not ending in semicolons.) */
+    NEEDBITS(5)
+        nl = 257 + ((unsigned)b & 0x1f);      /* number of literal/length codes */
+    DUMPBITS(5)
+        NEEDBITS(5)
+        nd = 1 + ((unsigned)b & 0x1f);        /* number of distance codes */
+    DUMPBITS(5)
+        NEEDBITS(4)
+        nb = 4 + ((unsigned)b & 0xf);         /* number of bit length codes */
+    DUMPBITS(4)
+#ifdef PKZIP_BUG_WORKAROUND
+        if (nl > 288 || nd > 32)
+#else
+            if (nl > 286 || nd > 30)
+#endif
+            {
+                ret = 1;             /* bad lengths */
+                goto out;
+            }
+
+    DEBG("dyn1 ");
+
+    /* read in bit-length-code lengths, in the permuted order given by
+       border[]; unsent trailing codes get length 0 */
+    for (j = 0; j < nb; j++)
+    {
+        NEEDBITS(3)
+            ll[border[j]] = (unsigned)b & 7;
+        DUMPBITS(3)
+            }
+    for (; j < 19; j++)
+        ll[border[j]] = 0;
+
+    DEBG("dyn2 ");
+
+    /* build decoding table for trees--single level, 7 bit lookup */
+    bl = 7;
+    if ((i = huft_build(ll, 19, 19, NULL, NULL, &tl, &bl)) != 0)
+    {
+        if (i == 1)
+            huft_free(tl);
+        ret = i;                   /* incomplete code set */
+        goto out;
+    }
+
+    DEBG("dyn3 ");
+
+    /* read in literal and distance code lengths, decoding them with the
+       just-built code-length tree; codes 16/17/18 are run-length repeats */
+    n = nl + nd;
+    m = mask_bits[bl];
+    i = l = 0;
+    while ((unsigned)i < n)
+    {
+        NEEDBITS((unsigned)bl)
+            j = (td = tl + ((unsigned)b & m))->b;
+        DUMPBITS(j)
+            j = td->v.n;
+        if (j < 16)                 /* length of code in bits (0..15) */
+            ll[i++] = l = j;          /* save last length in l */
+        else if (j == 16)           /* repeat last length 3 to 6 times */
+        {
+            NEEDBITS(2)
+                j = 3 + ((unsigned)b & 3);
+            DUMPBITS(2)
+                if ((unsigned)i + j > n) {
+                    ret = 1;
+                    goto out;
+                }
+            while (j--)
+                ll[i++] = l;
+        }
+        else if (j == 17)           /* 3 to 10 zero length codes */
+        {
+            NEEDBITS(3)
+                j = 3 + ((unsigned)b & 7);
+            DUMPBITS(3)
+                if ((unsigned)i + j > n) {
+                    ret = 1;
+                    goto out;
+                }
+            while (j--)
+                ll[i++] = 0;
+            l = 0;
+        }
+        else                        /* j == 18: 11 to 138 zero length codes */
+        {
+            NEEDBITS(7)
+                j = 11 + ((unsigned)b & 0x7f);
+            DUMPBITS(7)
+                if ((unsigned)i + j > n) {
+                    ret = 1;
+                    goto out;
+                }
+            while (j--)
+                ll[i++] = 0;
+            l = 0;
+        }
+    }
+
+    DEBG("dyn4 ");
+
+    /* free decoding table for trees */
+    huft_free(tl);
+
+    DEBG("dyn5 ");
+
+    /* restore the global bit buffer */
+    bb = b;
+    bk = k;
+
+    DEBG("dyn5a ");
+
+    /* build the decoding tables for literal/length and distance codes */
+    bl = lbits;
+    if ((i = huft_build(ll, nl, 257, cplens, cplext, &tl, &bl)) != 0)
+    {
+        DEBG("dyn5b ");
+        if (i == 1) {
+            error("incomplete literal tree");
+            huft_free(tl);
+        }
+        ret = i;                   /* incomplete code set */
+        goto out;
+    }
+    DEBG("dyn5c ");
+    bd = dbits;
+    /* NOTE(review): the brace pairing below intentionally differs between
+       the two preprocessor branches.  With PKZIP_BUG_WORKAROUND an
+       incomplete distance tree (i == 1) is downgraded to success and
+       decoding continues; without it, any failure frees the tables and
+       bails out. */
+    if ((i = huft_build(ll + nl, nd, 0, cpdist, cpdext, &td, &bd)) != 0)
+    {
+        DEBG("dyn5d ");
+        if (i == 1) {
+            error("incomplete distance tree");
+#ifdef PKZIP_BUG_WORKAROUND
+            i = 0;
+        }
+#else
+        huft_free(td);
+    }
+    huft_free(tl);
+    ret = i;                   /* incomplete code set */
+    goto out;
+#endif
+}
+
+DEBG("dyn6 ");
+
+  /* decompress until an end-of-block code */
+/* NOTE(review): tl and td are not freed on this failure path -- a leak
+   that is presumably tolerable in one-shot boot-time decompression, but
+   worth confirming. */
+if (inflate_codes(tl, td, bl, bd)) {
+    ret = 1;
+    goto out;
+}
+
+DEBG("dyn7 ");
+
+  /* free the decoding tables, return */
+huft_free(tl);
+huft_free(td);
+
+DEBG(">");
+ret = 0;
+out:
+free(ll);
+return ret;
+
+underrun:
+/* jumped to from NEEDBITS when the input is exhausted */
+ret = 4;   /* Input underrun */
+goto out;
+}
+
+
+
+STATIC int INIT inflate_block(
+int *e                  /* last block flag */
+)
+/* decompress an inflated block */
+/* Reads the 3-bit block header (1-bit final-block flag stored in *e,
+   then a 2-bit type) and dispatches to the stored/fixed/dynamic decoder.
+   Returns that decoder's result, 2 for an unknown block type, or 4 on
+   input underrun (reached via the NEEDBITS macros' goto). */
+{
+unsigned t;           /* block type */
+register ulg b;       /* bit buffer */
+register unsigned k;  /* number of bits in bit buffer */
+
+DEBG("<blk");
+
+/* make local bit buffer */
+b = bb;
+k = bk;
+
+
+/* read in last block bit */
+NEEDBITS(1)
+    *e = (int)b & 1;
+    DUMPBITS(1)
+
+
+    /* read in block type */
+    NEEDBITS(2)
+    t = (unsigned)b & 3;
+    DUMPBITS(2)
+
+
+    /* restore the global bit buffer */
+    bb = b;
+    bk = k;
+
+    /* inflate that block type: 0 = stored, 1 = fixed, 2 = dynamic */
+    if (t == 2)
+    return inflate_dynamic();
+    if (t == 0)
+    return inflate_stored();
+    if (t == 1)
+    return inflate_fixed();
+
+    DEBG(">");
+
+    /* bad block type */
+    return 2;
+
+    underrun:
+    return 4;   /* Input underrun */
+}
+
+
+
+STATIC int INIT inflate(void)
+/* decompress an inflated entry */
+{
+    int e;                /* last block flag */
+    int r;                /* result code */
+    unsigned h;           /* maximum struct huft's malloc'ed */
+
+    /* initialize window, bit buffer */
+    wp = 0;
+    bk = 0;
+    bb = 0;
+
+
+    /* decompress until the last block */
+    h = 0;
+    do {
+        hufts = 0;
+#ifdef ARCH_HAS_DECOMP_WDOG
+        arch_decomp_wdog();
+#endif
+        r = inflate_block(&e);
+        if (r)
+            return r;
+        if (hufts > h)
+            h = hufts;
+    } while (!e);
+
+    /* Undo too much lookahead. The next read will be byte aligned so we
+     * can discard unused bits in the last meaningful byte.
+     */
+    while (bk >= 8) {
+        bk -= 8;
+        inptr--;
+    }
+
+    /* flush out slide */
+    flush_output(wp);
+
+
+    /* return success */
+#ifdef DEBUG
+    fprintf(stderr, "<%u> ", h);
+#endif /* DEBUG */
+    return 0;
+}
+
+/**********************************************************************
+ *
+ * The following are support routines for inflate.c
+ *
+ **********************************************************************/
+
+/* CRC-32 lookup table and running shift register; both are filled in by
+   makecrc() so that they live in bss. */
+static ulg crc_32_tab[256];
+static ulg crc;  /* initialized in makecrc() so it'll reside in bss */
+/* final CRC value: the shift register with the standard closing XOR */
+#define CRC_VALUE (crc ^ 0xffffffffUL)
+
+/*
+ * Build the CRC-32 lookup table and prime the shift register.
+ * Code borrowed from gzip-1.0.3/makecrc.c.
+ */
+
+static void INIT
+makecrc(void)
+{
+/* Not copyrighted 1990 Mark Adler */
+
+    unsigned long rem;    /* crc shift register */
+    unsigned long poly;   /* polynomial exclusive-or pattern */
+    int n;                /* table index / eight-bit value */
+    int bits;             /* byte being shifted through the apparatus */
+
+    /* terms of polynomial defining this crc (except x^32): */
+    static const int p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
+
+    /* Fold the polynomial terms into a single exclusive-or pattern */
+    poly = 0;
+    for (n = 0; n < sizeof(p)/sizeof(p[0]); n++)
+        poly |= 1L << (31 - p[n]);
+
+    crc_32_tab[0] = 0;
+
+    for (n = 1; n < 256; n++)
+    {
+        rem = 0;
+        bits = n | 256;
+        while (bits != 1)
+        {
+            if (rem & 1)
+                rem = (rem >> 1) ^ poly;
+            else
+                rem >>= 1;
+            if (bits & 1)
+                rem ^= poly;
+            bits >>= 1;
+        }
+        crc_32_tab[n] = rem;
+    }
+
+    /* this is initialized here so this code could reside in ROM */
+    crc = (ulg)0xffffffffUL; /* shift register contents */
+}
+
+/* gzip flag byte (the FLG field of the RFC 1952 member header) */
+#define ASCII_FLAG   0x01 /* bit 0 set: file probably ASCII text */
+#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */
+#define EXTRA_FIELD  0x04 /* bit 2 set: extra field present */
+#define ORIG_NAME    0x08 /* bit 3 set: original file name present */
+#define COMMENT      0x10 /* bit 4 set: file comment present */
+#define ENCRYPTED    0x20 /* bit 5 set: file is encrypted */
+#define RESERVED     0xC0 /* bit 6,7:   reserved */
+
+/*
+ * Do the uncompression!
+ *
+ * Parse the gzip member header (RFC 1952), inflate the DEFLATE payload,
+ * and verify the trailing CRC-32 and uncompressed-length words.
+ * Returns 0 on success, -1 on any error (reported through error()).
+ * NEXTBYTE() jumps to the underrun label when the input runs out.
+ */
+static int INIT gunzip(void)
+{
+    uch flags;
+    unsigned char magic[2]; /* magic header */
+    char method;
+    ulg orig_crc = 0;       /* original crc */
+    ulg orig_len = 0;       /* original uncompressed length */
+    int res;
+
+    magic[0] = NEXTBYTE();
+    magic[1] = NEXTBYTE();
+    method   = NEXTBYTE();
+
+    /* 037 0213 is the standard gzip magic; 0236 is presumably an older
+       gzip variant -- TODO confirm which producer emits it. */
+    if (magic[0] != 037 ||
+        ((magic[1] != 0213) && (magic[1] != 0236))) {
+        error("bad gzip magic numbers");
+        return -1;
+    }
+
+    /* We only support method #8, DEFLATED */
+    if (method != 8)  {
+        error("internal error, invalid method");
+        return -1;
+    }
+
+    flags  = (uch)get_byte();
+    if ((flags & ENCRYPTED) != 0) {
+        error("Input is encrypted");
+        return -1;
+    }
+    if ((flags & CONTINUATION) != 0) {
+        error("Multi part input");
+        return -1;
+    }
+    if ((flags & RESERVED) != 0) {
+        error("Input has invalid flags");
+        return -1;
+    }
+    NEXTBYTE(); /* Get timestamp */
+    NEXTBYTE();
+    NEXTBYTE();
+    NEXTBYTE();
+
+    (void)NEXTBYTE();  /* Ignore extra flags for the moment */
+    (void)NEXTBYTE();  /* Ignore OS type for the moment */
+
+    /* skip the optional extra field: 2-byte little-endian length,
+       then that many payload bytes */
+    if ((flags & EXTRA_FIELD) != 0) {
+        unsigned len = (unsigned)NEXTBYTE();
+        len |= ((unsigned)NEXTBYTE())<<8;
+        while (len--) (void)NEXTBYTE();
+    }
+
+    /* Get original file name if it was truncated */
+    if ((flags & ORIG_NAME) != 0) {
+        /* Discard the old name */
+        while (NEXTBYTE() != 0) /* null */ ;
+    } 
+
+    /* Discard file comment if any */
+    if ((flags & COMMENT) != 0) {
+        while (NEXTBYTE() != 0) /* null */ ;
+    }
+
+    /* Decompress */
+    if ((res = inflate())) {
+        switch (res) {
+        case 0:
+            break;
+        case 1:
+            error("invalid compressed format (err=1)");
+            break;
+        case 2:
+            error("invalid compressed format (err=2)");
+            break;
+        case 3:
+            error("out of memory");
+            break;
+        case 4:
+            error("out of input data");
+            break;
+        default:
+            error("invalid compressed format (other)");
+        }
+        return -1;
+    }
+     
+    /* Get the crc and original length */
+    /* crc32  (see algorithm.doc)
+     * uncompressed input size modulo 2^32
+     * (both stored little-endian in the gzip trailer)
+     */
+    orig_crc = (ulg) NEXTBYTE();
+    orig_crc |= (ulg) NEXTBYTE() << 8;
+    orig_crc |= (ulg) NEXTBYTE() << 16;
+    orig_crc |= (ulg) NEXTBYTE() << 24;
+    
+    orig_len = (ulg) NEXTBYTE();
+    orig_len |= (ulg) NEXTBYTE() << 8;
+    orig_len |= (ulg) NEXTBYTE() << 16;
+    orig_len |= (ulg) NEXTBYTE() << 24;
+    
+    /* Validate decompression */
+    if (orig_crc != CRC_VALUE) {
+        error("crc error");
+        return -1;
+    }
+    if (orig_len != bytes_out) {
+        error("length error");
+        return -1;
+    }
+    return 0;
+
+ underrun:   /* NEXTBYTE() goto's here if needed */
+    error("out of input data");
+    return -1;
+}
index 2a88602e518b7cf352c2ba946b90a62c069717fd..91376718176e042f62f7acbe420f050f196c7c41 100644 (file)
@@ -221,7 +221,9 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDLE(void) arg)
                 fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
 #ifdef CONFIG_X86
             if ( !is_hvm_vcpu(current) )
-                fi.submap |= 1U << XENFEAT_mmu_pt_update_preserve_ad;
+                fi.submap |= (1U << XENFEAT_mmu_pt_update_preserve_ad) |
+                             (1U << XENFEAT_highmem_assist) |
+                             (1U << XENFEAT_gnttab_map_avail_bits);
 #endif
             break;
         default:
index 2eb73e94b5399cd7d126a3d51f33a849d6d890b8..aa7b5ee577f67370778ac6c6c6e68335a38bfdc8 100644 (file)
@@ -336,7 +336,6 @@ static void crash_save_vmcoreinfo(void)
     VMCOREINFO_SYMBOL(frame_table);
     VMCOREINFO_SYMBOL(alloc_bitmap);
     VMCOREINFO_SYMBOL(max_page);
-    VMCOREINFO_SYMBOL(xenheap_phys_end);
 
     VMCOREINFO_STRUCT_SIZE(page_info);
     VMCOREINFO_STRUCT_SIZE(domain);
index e24a6d335f172acdadb12b7ed6342c592b2b16be..c481df0f3f87585981f314da4f9ae17c31ac0771 100644 (file)
@@ -183,10 +183,10 @@ static void dump_domains(unsigned char key)
     {
         printk("General information for domain %u:\n", d->domain_id);
         cpuset_print(tmpstr, sizeof(tmpstr), d->domain_dirty_cpumask);
-        printk("    refcnt=%d nr_pages=%d xenheap_pages=%d "
-               "dirty_cpus=%s\n",
-               atomic_read(&d->refcnt),
-               d->tot_pages, d->xenheap_pages, tmpstr);
+        printk("    refcnt=%d dying=%d nr_pages=%d xenheap_pages=%d "
+               "dirty_cpus=%s max_pages=%u\n",
+               atomic_read(&d->refcnt), d->is_dying,
+               d->tot_pages, d->xenheap_pages, tmpstr, d->max_pages);
         printk("    handle=%02x%02x%02x%02x-%02x%02x-%02x%02x-"
                "%02x%02x-%02x%02x%02x%02x%02x%02x vm_assist=%08lx\n",
                d->handle[ 0], d->handle[ 1], d->handle[ 2], d->handle[ 3],
@@ -225,7 +225,7 @@ static void dump_domains(unsigned char key)
                    test_bit(v->virq_to_evtchn[VIRQ_DEBUG], 
                             &shared_info(d, evtchn_mask)),
                    test_bit(v->virq_to_evtchn[VIRQ_DEBUG] /
-                            BITS_PER_GUEST_LONG(d),
+                            BITS_PER_EVTCHN_WORD(d),
                             &vcpu_info(v, evtchn_pending_sel)));
             send_guest_vcpu_virq(v, VIRQ_DEBUG);
         }
index 6b1a6379fc93611709d2beb907f3f9bfa1fcbd8d..ff7084c11fb61930cafac03dd39c0c3795ef134c 100644 (file)
@@ -90,6 +90,7 @@ int elf_xen_parse_note(struct elf_binary *elf,
         [XEN_ELFNOTE_ENTRY] = { "ENTRY", 0},
         [XEN_ELFNOTE_HYPERCALL_PAGE] = { "HYPERCALL_PAGE", 0},
         [XEN_ELFNOTE_VIRT_BASE] = { "VIRT_BASE", 0},
+        [XEN_ELFNOTE_INIT_P2M] = { "INIT_P2M", 0},
         [XEN_ELFNOTE_PADDR_OFFSET] = { "PADDR_OFFSET", 0},
         [XEN_ELFNOTE_HV_START_LOW] = { "HV_START_LOW", 0},
         [XEN_ELFNOTE_XEN_VERSION] = { "XEN_VERSION", 1},
@@ -164,6 +165,9 @@ int elf_xen_parse_note(struct elf_binary *elf,
     case XEN_ELFNOTE_ENTRY:
         parms->virt_entry = val;
         break;
+    case XEN_ELFNOTE_INIT_P2M:
+        parms->p2m_base = val;
+        break;
     case XEN_ELFNOTE_PADDR_OFFSET:
         parms->elf_paddr_offset = val;
         break;
@@ -392,6 +396,7 @@ static int elf_xen_addr_calc_check(struct elf_binary *elf,
     elf_msg(elf, "    virt_kstart      = 0x%" PRIx64 "\n", parms->virt_kstart);
     elf_msg(elf, "    virt_kend        = 0x%" PRIx64 "\n", parms->virt_kend);
     elf_msg(elf, "    virt_entry       = 0x%" PRIx64 "\n", parms->virt_entry);
+    elf_msg(elf, "    p2m_base         = 0x%" PRIx64 "\n", parms->p2m_base);
 
     if ( (parms->virt_kstart > parms->virt_kend) ||
          (parms->virt_entry < parms->virt_kstart) ||
@@ -403,6 +408,15 @@ static int elf_xen_addr_calc_check(struct elf_binary *elf,
         return -1;
     }
 
+    if ( (parms->p2m_base != UNSET_ADDR) &&
+         (parms->p2m_base >= parms->virt_kstart) &&
+         (parms->p2m_base < parms->virt_kend) )
+    {
+        elf_err(elf, "%s: ERROR: P->M table base is out of bounds.\n",
+                __FUNCTION__);
+        return -1;
+    }
+
     return 0;
 }
 
@@ -422,6 +436,7 @@ int elf_xen_parse(struct elf_binary *elf,
     parms->virt_entry = UNSET_ADDR;
     parms->virt_hypercall = UNSET_ADDR;
     parms->virt_hv_start_low = UNSET_ADDR;
+    parms->p2m_base = UNSET_ADDR;
     parms->elf_paddr_offset = UNSET_ADDR;
 
     /* Find and parse elf notes. */
index 4ae587cd5dd03db743fed86135c70d3cf4214532..e207b690c428c014ec0d0dfc83e02523684a5b8c 100644 (file)
@@ -7,9 +7,9 @@
 #include <xen/types.h>
 #include <xen/string.h>
 #include <xen/lib.h>
+#include <xen/libelf.h>
 #include <asm/byteorder.h>
 #include <public/elfnote.h>
-#include <public/libelf.h>
 
 #define elf_msg(elf, fmt, args ... ) \
    if (elf->verbose) printk(fmt, ## args )
@@ -49,7 +49,7 @@
 #error Unsupported OS
 #endif
 #include <xen/elfnote.h>
-#include <xen/libelf.h>
+#include <xen/libelf/libelf.h>
 
 #include "xenctrl.h"
 #include "xc_private.h"
index d39c2f59c0204d070f4ed76f078faba25cafc9b2..55e2d8a046ed6116d6d37a6731b1e0d3d3992449 100644 (file)
@@ -111,31 +111,40 @@ static void populate_physmap(struct memop_args *a)
         if ( unlikely(__copy_from_guest_offset(&gpfn, a->extent_list, i, 1)) )
             goto out;
 
-        page = alloc_domheap_pages(d, a->extent_order, a->memflags);
-        if ( unlikely(page == NULL) ) 
+        if ( a->memflags & MEMF_populate_on_demand )
         {
-            gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
-                     "id=%d memflags=%x (%ld of %d)\n",
-                     a->extent_order, d->domain_id, a->memflags,
-                     i, a->nr_extents);
-            goto out;
+            if ( guest_physmap_mark_populate_on_demand(d, gpfn,
+                                                       a->extent_order) < 0 )
+                goto out;
         }
+        else
+        {
+            page = alloc_domheap_pages(d, a->extent_order, a->memflags);
+            if ( unlikely(page == NULL) ) 
+            {
+                gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
+                         "id=%d memflags=%x (%ld of %d)\n",
+                         a->extent_order, d->domain_id, a->memflags,
+                         i, a->nr_extents);
+                goto out;
+            }
 
-        mfn = page_to_mfn(page);
-        guest_physmap_add_page(d, gpfn, mfn, a->extent_order);
+            mfn = page_to_mfn(page);
+            guest_physmap_add_page(d, gpfn, mfn, a->extent_order);
 
-        if ( !paging_mode_translate(d) )
-        {
-            for ( j = 0; j < (1 << a->extent_order); j++ )
-                set_gpfn_from_mfn(mfn + j, gpfn + j);
+            if ( !paging_mode_translate(d) )
+            {
+                for ( j = 0; j < (1 << a->extent_order); j++ )
+                    set_gpfn_from_mfn(mfn + j, gpfn + j);
 
-            /* Inform the domain of the new page's machine address. */ 
-            if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
-                goto out;
+                /* Inform the domain of the new page's machine address. */ 
+                if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
+                    goto out;
+            }
         }
     }
 
- out:
+out:
     a->nr_done = i;
 }
 
@@ -192,6 +201,11 @@ static void decrease_reservation(struct memop_args *a)
         if ( unlikely(__copy_from_guest_offset(&gmfn, a->extent_list, i, 1)) )
             goto out;
 
+        /* See if populate-on-demand wants to handle this */
+        if ( is_hvm_domain(a->domain)
+             && p2m_pod_decrease_reservation(a->domain, gmfn, a->extent_order) )
+            continue;
+
         for ( j = 0; j < (1 << a->extent_order); j++ )
             if ( !guest_remove_page(a->domain, gmfn + j) )
                 goto out;
@@ -201,77 +215,11 @@ static void decrease_reservation(struct memop_args *a)
     a->nr_done = i;
 }
 
-static long translate_gpfn_list(
-    XEN_GUEST_HANDLE(xen_translate_gpfn_list_t) uop, unsigned long *progress)
-{
-    struct xen_translate_gpfn_list op;
-    unsigned long i;
-    xen_pfn_t gpfn;
-    xen_pfn_t mfn;
-    struct domain *d;
-    int rc;
-
-    if ( copy_from_guest(&op, uop, 1) )
-        return -EFAULT;
-
-    /* Is size too large for us to encode a continuation? */
-    if ( op.nr_gpfns > (ULONG_MAX >> MEMOP_EXTENT_SHIFT) )
-        return -EINVAL;
-
-    if ( !guest_handle_subrange_okay(op.gpfn_list, *progress, op.nr_gpfns-1) ||
-         !guest_handle_subrange_okay(op.mfn_list, *progress, op.nr_gpfns-1) )
-        return -EFAULT;
-
-    rc = rcu_lock_target_domain_by_id(op.domid, &d);
-    if ( rc )
-        return rc;
-
-    if ( !paging_mode_translate(d) )
-    {
-        rcu_unlock_domain(d);
-        return -EINVAL;
-    }
-
-    for ( i = *progress; i < op.nr_gpfns; i++ )
-    {
-        if ( hypercall_preempt_check() )
-        {
-            rcu_unlock_domain(d);
-            *progress = i;
-            return -EAGAIN;
-        }
-
-        if ( unlikely(__copy_from_guest_offset(&gpfn, op.gpfn_list, i, 1)) )
-        {
-            rcu_unlock_domain(d);
-            return -EFAULT;
-        }
-
-        mfn = gmfn_to_mfn(d, gpfn);
-
-        rc = xsm_translate_gpfn_list(current->domain, mfn);
-        if ( rc )
-        {
-            rcu_unlock_domain(d);
-            return rc;
-        }
-
-        if ( unlikely(__copy_to_guest_offset(op.mfn_list, i, &mfn, 1)) )
-        {
-            rcu_unlock_domain(d);
-            return -EFAULT;
-        }
-    }
-
-    rcu_unlock_domain(d);
-    return 0;
-}
-
 static long memory_exchange(XEN_GUEST_HANDLE(xen_memory_exchange_t) arg)
 {
     struct xen_memory_exchange exch;
-    LIST_HEAD(in_chunk_list);
-    LIST_HEAD(out_chunk_list);
+    PAGE_LIST_HEAD(in_chunk_list);
+    PAGE_LIST_HEAD(out_chunk_list);
     unsigned long in_chunk_order, out_chunk_order;
     xen_pfn_t     gpfn, gmfn, mfn;
     unsigned long i, j, k;
@@ -377,7 +325,7 @@ static long memory_exchange(XEN_GUEST_HANDLE(xen_memory_exchange_t) arg)
                     goto fail;
                 }
 
-                list_add(&page->list, &in_chunk_list);
+                page_list_add(page, &in_chunk_list);
             }
         }
 
@@ -391,7 +339,7 @@ static long memory_exchange(XEN_GUEST_HANDLE(xen_memory_exchange_t) arg)
                 goto fail;
             }
 
-            list_add(&page->list, &out_chunk_list);
+            page_list_add(page, &out_chunk_list);
         }
 
         /*
@@ -399,10 +347,8 @@ static long memory_exchange(XEN_GUEST_HANDLE(xen_memory_exchange_t) arg)
          */
 
         /* Destroy final reference to each input page. */
-        while ( !list_empty(&in_chunk_list) )
+        while ( (page = page_list_remove_head(&in_chunk_list)) )
         {
-            page = list_entry(in_chunk_list.next, struct page_info, list);
-            list_del(&page->list);
             if ( !test_and_clear_bit(_PGC_allocated, &page->count_info) )
                 BUG();
             mfn = page_to_mfn(page);
@@ -412,10 +358,8 @@ static long memory_exchange(XEN_GUEST_HANDLE(xen_memory_exchange_t) arg)
 
         /* Assign each output page to the domain. */
         j = 0;
-        while ( !list_empty(&out_chunk_list) )
+        while ( (page = page_list_remove_head(&out_chunk_list)) )
         {
-            page = list_entry(out_chunk_list.next, struct page_info, list);
-            list_del(&page->list);
             if ( assign_pages(d, page, exch.out.extent_order,
                               MEMF_no_refcount) )
                 BUG();
@@ -451,21 +395,13 @@ static long memory_exchange(XEN_GUEST_HANDLE(xen_memory_exchange_t) arg)
      */
  fail:
     /* Reassign any input pages we managed to steal. */
-    while ( !list_empty(&in_chunk_list) )
-    {
-        page = list_entry(in_chunk_list.next, struct page_info, list);
-        list_del(&page->list);
+    while ( (page = page_list_remove_head(&in_chunk_list)) )
         if ( assign_pages(d, page, 0, MEMF_no_refcount) )
             BUG();
-    }
 
     /* Free any output pages we managed to allocate. */
-    while ( !list_empty(&out_chunk_list) )
-    {
-        page = list_entry(out_chunk_list.next, struct page_info, list);
-        list_del(&page->list);
+    while ( (page = page_list_remove_head(&out_chunk_list)) )
         free_domheap_pages(page, exch.out.extent_order);
-    }
 
     exch.nr_exchanged = i << in_chunk_order;
 
@@ -480,7 +416,7 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE(void) arg)
     struct domain *d;
     int rc, op;
     unsigned int address_bits;
-    unsigned long start_extent, progress;
+    unsigned long start_extent;
     struct xen_memory_reservation reservation;
     struct memop_args args;
     domid_t domid;
@@ -522,6 +458,10 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE(void) arg)
 
         args.memflags |= MEMF_node(XENMEMF_get_node(reservation.mem_flags));
 
+        if ( op == XENMEM_populate_physmap
+             && (reservation.mem_flags & XENMEMF_populate_on_demand) )
+            args.memflags |= MEMF_populate_on_demand;
+
         if ( likely(reservation.domid == DOMID_SELF) )
         {
             d = rcu_lock_current_domain();
@@ -612,17 +552,6 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE(void) arg)
 
         break;
 
-    case XENMEM_translate_gpfn_list:
-        progress = cmd >> MEMOP_EXTENT_SHIFT;
-        rc = translate_gpfn_list(
-            guest_handle_cast(arg, xen_translate_gpfn_list_t),
-            &progress);
-        if ( rc == -EAGAIN )
-            return hypercall_create_continuation(
-                __HYPERVISOR_memory_op, "lh",
-                op | (progress << MEMOP_EXTENT_SHIFT), arg);
-        break;
-
     default:
         rc = arch_memory_op(op, arg);
         break;
@@ -631,6 +560,17 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE(void) arg)
     return rc;
 }
 
+/* Temporary placeholder. */
+int do_tmem_op(void *tmem_op)
+{
+    static bool_t warned;
+
+    if ( !test_and_set_bool(warned) )
+        printk("tmem: not implemented\n");
+
+    return -ENOSYS;
+}
+
 /*
  * Local variables:
  * mode: C
index d9e0a2946ac738c4b9cc4179c03d43254cec89b5..830e44cdb47e3a4c707ee3c10a999463e29368f1 100644 (file)
@@ -35,6 +35,7 @@
 #include <xen/perfc.h>
 #include <xen/numa.h>
 #include <xen/nodemask.h>
+#include <public/sysctl.h>
 #include <asm/page.h>
 #include <asm/numa.h>
 #include <asm/flushtlb.h>
@@ -71,9 +72,14 @@ integer_param("dma_bits", dma_bitsize);
 #endif
 
 static DEFINE_SPINLOCK(page_scrub_lock);
-LIST_HEAD(page_scrub_list);
+PAGE_LIST_HEAD(page_scrub_list);
 static unsigned long scrub_pages;
 
+/* Offlined page list, protected by heap_lock. */
+PAGE_LIST_HEAD(page_offlined_list);
+/* Broken page list, protected by heap_lock. */
+PAGE_LIST_HEAD(page_broken_list);
+
 /*********************
  * ALLOCATION BITMAP
  *  One bit per page of memory. Bit set => page is allocated.
@@ -260,9 +266,11 @@ unsigned long __init alloc_boot_pages(
 #define MEMZONE_XEN 0
 #define NR_ZONES    (PADDR_BITS - PAGE_SHIFT)
 
-#define pfn_dom_zone_type(_pfn) (fls(_pfn) - 1)
+#define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 0 : ((b) - PAGE_SHIFT - 1))
+#define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN :  \
+                          (fls(page_to_mfn(pg)) - 1))
 
-typedef struct list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
+typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
 #define heap(node, zone, order) ((*_heap[node])[zone][order])
 
@@ -270,13 +278,16 @@ static unsigned long *avail[MAX_NUMNODES];
 
 static DEFINE_SPINLOCK(heap_lock);
 
-static void init_node_heap(int node)
+static unsigned long init_node_heap(int node, unsigned long mfn,
+                                    unsigned long nr)
 {
     /* First node to be discovered has its heap metadata statically alloced. */
     static heap_by_zone_and_order_t _heap_static;
     static unsigned long avail_static[NR_ZONES];
     static int first_node_initialised;
-
+    unsigned long needed = (sizeof(**_heap) +
+                            sizeof(**avail) * NR_ZONES +
+                            PAGE_SIZE - 1) >> PAGE_SHIFT;
     int i, j;
 
     if ( !first_node_initialised )
@@ -284,19 +295,40 @@ static void init_node_heap(int node)
         _heap[node] = &_heap_static;
         avail[node] = avail_static;
         first_node_initialised = 1;
+        needed = 0;
+    }
+#ifdef DIRECTMAP_VIRT_END
+    else if ( nr >= needed &&
+              (mfn + needed) <= (virt_to_mfn(DIRECTMAP_VIRT_END - 1) + 1) )
+    {
+        _heap[node] = mfn_to_virt(mfn);
+        avail[node] = mfn_to_virt(mfn + needed) - sizeof(**avail) * NR_ZONES;
+    }
+#endif
+    else if ( get_order_from_bytes(sizeof(**_heap)) ==
+              get_order_from_pages(needed) )
+    {
+        _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
+        BUG_ON(!_heap[node]);
+        avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
+                      sizeof(**avail) * NR_ZONES;
+        needed = 0;
     }
     else
     {
         _heap[node] = xmalloc(heap_by_zone_and_order_t);
         avail[node] = xmalloc_array(unsigned long, NR_ZONES);
         BUG_ON(!_heap[node] || !avail[node]);
+        needed = 0;
     }
 
     memset(avail[node], 0, NR_ZONES * sizeof(long));
 
     for ( i = 0; i < NR_ZONES; i++ )
         for ( j = 0; j <= MAX_ORDER; j++ )
-            INIT_LIST_HEAD(&(*_heap[node])[i][j]);
+            INIT_PAGE_LIST_HEAD(&(*_heap[node])[i][j]);
+
+    return needed;
 }
 
 /* Allocate 2^@order contiguous pages. */
@@ -338,7 +370,7 @@ static struct page_info *alloc_heap_pages(
 
             /* Find smallest order which can satisfy the request. */
             for ( j = order; j <= MAX_ORDER; j++ )
-                if ( !list_empty(&heap(node, zone, j)) )
+                if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
                     goto found;
         } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
 
@@ -352,14 +384,11 @@ static struct page_info *alloc_heap_pages(
     return NULL;
 
  found: 
-    pg = list_entry(heap(node, zone, j).next, struct page_info, list);
-    list_del(&pg->list);
-
     /* We may have to halve the chunk a number of times. */
     while ( j != order )
     {
         PFN_ORDER(pg) = --j;
-        list_add_tail(&pg->list, &heap(node, zone, j));
+        page_list_add_tail(pg, &heap(node, zone, j));
         pg += 1 << j;
     }
     
@@ -376,10 +405,13 @@ static struct page_info *alloc_heap_pages(
         /* Reference count must continuously be zero for free pages. */
         BUG_ON(pg[i].count_info != 0);
 
-        /* Add in any extra CPUs that need flushing because of this page. */
-        cpus_andnot(extra_cpus_mask, pg[i].u.free.cpumask, mask);
-        tlbflush_filter(extra_cpus_mask, pg[i].tlbflush_timestamp);
-        cpus_or(mask, mask, extra_cpus_mask);
+        if ( pg[i].u.free.need_tlbflush )
+        {
+            /* Add in extra CPUs that need flushing because of this page. */
+            cpus_andnot(extra_cpus_mask, cpu_online_map, mask);
+            tlbflush_filter(extra_cpus_mask, pg[i].tlbflush_timestamp);
+            cpus_or(mask, mask, extra_cpus_mask);
+        }
 
         /* Initialise fields which have other uses for free pages. */
         pg[i].u.inuse.type_info = 0;
@@ -395,15 +427,89 @@ static struct page_info *alloc_heap_pages(
     return pg;
 }
 
+/* Remove any offlined page in the buddy pointed to by head. */
+static int reserve_offlined_page(struct page_info *head)
+{
+    unsigned int node = phys_to_nid(page_to_maddr(head));
+    int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
+    struct page_info *cur_head;
+    int cur_order;
+
+    ASSERT(spin_is_locked(&heap_lock));
+
+    cur_head = head;
+
+    page_list_del(head, &heap(node, zone, head_order));
+
+    while ( cur_head < (head + (1 << head_order)) )
+    {
+        struct page_info *pg;
+        int next_order;
+
+        if ( test_bit(_PGC_offlined, &cur_head->count_info) )
+        {
+            cur_head++;
+            continue;
+        }
+
+        next_order = cur_order = 0;
+
+        while ( cur_order < head_order )
+        {
+            next_order = cur_order + 1;
+
+            if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
+                goto merge;
+
+            for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order );
+                  i < (1 << next_order);
+                  i++, pg++ )
+                if ( test_bit(_PGC_offlined, &pg->count_info) )
+                    break;
+            if ( i == ( 1 << next_order) )
+            {
+                cur_order = next_order;
+                continue;
+            }
+            else
+            {
+            merge:
+                /* We don't consider merging outside the head_order. */
+                page_list_add_tail(cur_head, &heap(node, zone, cur_order));
+                PFN_ORDER(cur_head) = cur_order;
+                cur_head += (1 << cur_order);
+                break;
+            }
+        }
+    }
+
+    for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ )
+    {
+        if ( !test_bit(_PGC_offlined, &cur_head->count_info) )
+            continue;
+
+        avail[node][zone]--;
+
+        map_alloc(page_to_mfn(cur_head), 1);
+
+        page_list_add_tail(cur_head,
+                           test_bit(_PGC_broken, &cur_head->count_info) ?
+                           &page_broken_list : &page_offlined_list);
+
+        count++;
+    }
+
+    return count;
+}
+
 /* Free 2^@order set of pages. */
 static void free_heap_pages(
-    unsigned int zone, struct page_info *pg, unsigned int order)
+    struct page_info *pg, unsigned int order)
 {
     unsigned long mask;
-    unsigned int i, node = phys_to_nid(page_to_maddr(pg));
-    struct domain *d;
+    unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
+    unsigned int zone = page_to_zone(pg);
 
-    ASSERT(zone < NR_ZONES);
     ASSERT(order <= MAX_ORDER);
     ASSERT(node >= 0);
     ASSERT(node < num_online_nodes());
@@ -421,17 +527,19 @@ static void free_heap_pages(
          *     in its pseudophysical address space).
          * In all the above cases there can be no guest mappings of this page.
          */
-        pg[i].count_info = 0;
-
-        if ( (d = page_get_owner(&pg[i])) != NULL )
-        {
-            pg[i].tlbflush_timestamp = tlbflush_current_time();
-            pg[i].u.free.cpumask     = d->domain_dirty_cpumask;
-        }
-        else
+        ASSERT(!(pg[i].count_info & PGC_offlined));
+        pg[i].count_info &= PGC_offlining | PGC_broken;
+        if ( pg[i].count_info & PGC_offlining )
         {
-            cpus_clear(pg[i].u.free.cpumask);
+            pg[i].count_info &= ~PGC_offlining;
+            pg[i].count_info |= PGC_offlined;
+            tainted = 1;
         }
+
+        /* If a page has no owner it will need no safety TLB flush. */
+        pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
+        if ( pg[i].u.free.need_tlbflush )
+            pg[i].tlbflush_timestamp = tlbflush_current_time();
     }
 
     spin_lock(&heap_lock);
@@ -450,8 +558,8 @@ static void free_heap_pages(
             if ( allocated_in_map(page_to_mfn(pg)-mask) ||
                  (PFN_ORDER(pg-mask) != order) )
                 break;
-            list_del(&(pg-mask)->list);
             pg -= mask;
+            page_list_del(pg, &heap(node, zone, order));
         }
         else
         {
@@ -459,9 +567,9 @@ static void free_heap_pages(
             if ( allocated_in_map(page_to_mfn(pg)+mask) ||
                  (PFN_ORDER(pg+mask) != order) )
                 break;
-            list_del(&(pg+mask)->list);
+            page_list_del(pg + mask, &heap(node, zone, order));
         }
-        
+
         order++;
 
         /* After merging, pg should remain in the same node. */
@@ -469,50 +577,288 @@ static void free_heap_pages(
     }
 
     PFN_ORDER(pg) = order;
-    list_add_tail(&pg->list, &heap(node, zone, order));
+    page_list_add_tail(pg, &heap(node, zone, order));
+
+    if ( tainted )
+        reserve_offlined_page(pg);
 
     spin_unlock(&heap_lock);
 }
 
+
+/*
+ * Following possible status for a page:
+ * free and online; free and offlined; free and offlined and broken;
+ * assigned and online; assigned and offlining; assigned and offlining and broken.
+ *
+ * The following rules apply to page offlining:
+ * Once a page is broken, it cannot be assigned anymore.
+ * A page will be marked offlined only if it is free.
+ * Returns the original count_info.
+ *
+ */
+static unsigned long mark_page_offline(struct page_info *pg, int broken)
+{
+    unsigned long nx, x, y = pg->count_info;
+
+    ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
+    ASSERT(spin_is_locked(&heap_lock));
+
+    do {
+        nx = x = y;
+
+        if ( ((x & PGC_offlined_broken) == PGC_offlined_broken) )
+            return y;
+
+        if ( x & PGC_offlined )
+        {
+            /* PGC_offlined means it is a free page. */
+            if ( broken && !(nx & PGC_broken) )
+                nx |= PGC_broken;
+            else
+                return y;
+        }
+        else
+        {
+            /* It is neither an offlined page nor a reserved page. */
+            nx |= (allocated_in_map(page_to_mfn(pg)) ?
+                   PGC_offlining : PGC_offlined);
+        }
+
+        if ( broken )
+            nx |= PGC_broken;
+    } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
+
+    return y;
+}
+
+static int reserve_heap_page(struct page_info *pg)
+{
+    struct page_info *head = NULL;
+    unsigned int i, node = phys_to_nid(page_to_maddr(pg));
+    unsigned int zone = page_to_zone(pg);
+
+    for ( i = 0; i <= MAX_ORDER; i++ )
+    {
+        struct page_info *tmp;
+
+        if ( page_list_empty(&heap(node, zone, i)) )
+            continue;
+
+        page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
+        {
+            if ( (head <= pg) &&
+                 (head + (1UL << i) > pg) )
+                return reserve_offlined_page(head);
+        }
+    }
+
+    return -EINVAL;
+
+}
+
+int offline_page(unsigned long mfn, int broken, uint32_t *status)
+{
+    unsigned long old_info = 0;
+    struct domain *owner;
+    int ret = 0;
+    struct page_info *pg;
+
+    if ( mfn > max_page )
+    {
+        dprintk(XENLOG_WARNING,
+                "try to offline page out of range %lx\n", mfn);
+        return -EINVAL;
+    }
+
+    *status = 0;
+    pg = mfn_to_page(mfn);
+
+#if defined(__x86_64__)
+     /* Xen's TXT mfn on x86_64 is reserved in the e820 map. */
+    if ( is_xen_fixed_mfn(mfn) )
+#elif defined(__i386__)
+    if ( is_xen_heap_mfn(mfn) )
+#endif
+    {
+        *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
+          (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
+        return -EPERM;
+    }
+
+    /*
+     * N.B. Xen's TXT range on x86_64 is marked reserved and already handled;
+     * the kexec range is reserved as well.
+     */
+     if ( !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
+     {
+        *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
+        return -EINVAL;
+     }
+
+    spin_lock(&heap_lock);
+
+    old_info = mark_page_offline(pg, broken);
+
+    if ( !allocated_in_map(mfn) )
+    {
+        /* Free pages are reserved directly. */
+        reserve_heap_page(pg);
+        *status = PG_OFFLINE_OFFLINED;
+    }
+    else if ( test_bit(_PGC_offlined, &pg->count_info) )
+    {
+        *status = PG_OFFLINE_OFFLINED;
+    }
+    else if ( (owner = page_get_owner_and_reference(pg)) )
+    {
+            *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
+              (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
+            /* Release the reference since it will not be allocated anymore */
+            put_page(pg);
+    }
+    else if ( old_info & PGC_xen_heap)
+    {
+        *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
+          (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
+    }
+    else
+    {
+        /*
+         * assign_pages() does not hold heap_lock, so there is a small window
+         * in which the owner may be set later; note, however, that the owner
+         * can only change from NULL to non-NULL, never the reverse, since the
+         * page is offlining now. There is no such window if called from the
+         * #MC handler, since all CPUs are then in softirq context. If called
+         * from user space (e.g. CE handling), tools can wait a while and
+         * simply call again.
+         */
+        *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
+                  (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
+    }
+
+    if ( broken )
+        *status |= PG_OFFLINE_BROKEN;
+
+    spin_unlock(&heap_lock);
+
+    return ret;
+}
+
+/*
+ * Online the memory.
+ *   The caller should make sure end_pfn <= max_page,
+ *   if not, expand_pages() should be called prior to online_page().
+ */
+unsigned int online_page(unsigned long mfn, uint32_t *status)
+{
+    struct page_info *pg;
+    int ret = 0, free = 0;
+
+    if ( mfn > max_page )
+    {
+        dprintk(XENLOG_WARNING, "call expand_pages() first\n");
+        return -EINVAL;
+    }
+
+    pg = mfn_to_page(mfn);
+
+    *status = 0;
+
+    spin_lock(&heap_lock);
+
+    if ( unlikely(is_page_broken(pg)) )
+    {
+        ret = -EINVAL;
+        *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN;
+    }
+    else if ( pg->count_info & PGC_offlined )
+    {
+        clear_bit(_PGC_offlined, &pg->count_info);
+        page_list_del(pg, &page_offlined_list);
+        *status = PG_ONLINE_ONLINED;
+        free = 1;
+    }
+    else if ( pg->count_info & PGC_offlining )
+    {
+        clear_bit(_PGC_offlining, &pg->count_info);
+        *status = PG_ONLINE_ONLINED;
+    }
+    spin_unlock(&heap_lock);
+
+    if ( free )
+        free_heap_pages(pg, 0);
+
+    return ret;
+}
+
+int query_page_offline(unsigned long mfn, uint32_t *status)
+{
+    struct page_info *pg;
+
+    if ( (mfn > max_page) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
+    {
+        dprintk(XENLOG_WARNING, "call expand_pages() first\n");
+        return -EINVAL;
+    }
+
+    *status = 0;
+    spin_lock(&heap_lock);
+
+    pg = mfn_to_page(mfn);
+
+    if (pg->count_info & PGC_offlining)
+        *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
+    if (pg->count_info & PGC_broken)
+        *status |= PG_OFFLINE_STATUS_BROKEN;
+    if (pg->count_info & PGC_offlined)
+        *status |= PG_OFFLINE_STATUS_OFFLINED;
+
+    spin_unlock(&heap_lock);
+
+    return 0;
+}
+
 /*
  * Hand the specified arbitrary page range to the specified heap zone
  * checking the node_id of the previous page.  If they differ and the
  * latter is not on a MAX_ORDER boundary, then we reserve the page by
  * not freeing it to the buddy allocator.
  */
-#define MAX_ORDER_ALIGNED (1UL << (MAX_ORDER))
 static void init_heap_pages(
-    unsigned int zone, struct page_info *pg, unsigned long nr_pages)
+    struct page_info *pg, unsigned long nr_pages)
 {
     unsigned int nid_curr, nid_prev;
     unsigned long i;
 
-    ASSERT(zone < NR_ZONES);
-
-    if ( likely(page_to_mfn(pg) != 0) )
-        nid_prev = phys_to_nid(page_to_maddr(pg-1));
-    else
-        nid_prev = phys_to_nid(page_to_maddr(pg));
+    nid_prev = phys_to_nid(page_to_maddr(pg-1));
 
-    for ( i = 0; i < nr_pages; i++ )
+    for ( i = 0; i < nr_pages; nid_prev = nid_curr, i++ )
     {
         nid_curr = phys_to_nid(page_to_maddr(pg+i));
 
         if ( unlikely(!avail[nid_curr]) )
-            init_node_heap(nid_curr);
+        {
+            unsigned long n;
+
+            n = init_node_heap(nid_curr, page_to_mfn(pg+i), nr_pages - i);
+            if ( n )
+            {
+                BUG_ON(i + n > nr_pages);
+                i += n - 1;
+                continue;
+            }
+        }
 
         /*
-         * free pages of the same node, or if they differ, but are on a
-         * MAX_ORDER alignement boundary (which already get reserved)
+         * Free pages of the same node, or if they differ, but are on a
+         * MAX_ORDER alignment boundary (which already get reserved).
          */
-         if ( (nid_curr == nid_prev) || (page_to_maddr(pg+i) &
-                                         MAX_ORDER_ALIGNED) )
-             free_heap_pages(zone, pg+i, 0);
-         else
-             printk("Reserving non-aligned node boundary @ mfn %lu\n",
-                    page_to_mfn(pg+i));
-
-        nid_prev = nid_curr;
+        if ( (nid_curr == nid_prev) ||
+             !(page_to_mfn(pg+i) & ((1UL << MAX_ORDER) - 1)) )
+            free_heap_pages(pg+i, 0);
+        else
+            printk("Reserving non-aligned node boundary @ mfn %#lx\n",
+                   page_to_mfn(pg+i));
     }
 }
 
@@ -540,7 +886,7 @@ static unsigned long avail_heap_pages(
 #define avail_for_domheap(mfn) !(allocated_in_map(mfn) || is_xen_heap_mfn(mfn))
 void __init end_boot_allocator(void)
 {
-    unsigned long i;
+    unsigned long i, nr = 0;
     int curr_free, next_free;
 
     /* Pages that are free now go to the domain sub-allocator. */
@@ -553,8 +899,15 @@ void __init end_boot_allocator(void)
         if ( next_free )
             map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */
         if ( curr_free )
-            init_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 1);
+            ++nr;
+        else if ( nr )
+        {
+            init_heap_pages(mfn_to_page(i - nr), nr);
+            nr = 0;
+        }
     }
+    if ( nr )
+        init_heap_pages(mfn_to_page(i - nr), nr);
 
     if ( !dma_bitsize && (num_online_nodes() > 1) )
     {
@@ -634,6 +987,8 @@ void __init scrub_heap_pages(void)
  * XEN-HEAP SUB-ALLOCATOR
  */
 
+#if !defined(__x86_64__) && !defined(__ia64__)
+
 void init_xenheap_pages(paddr_t ps, paddr_t pe)
 {
     ps = round_pgup(ps);
@@ -652,11 +1007,11 @@ void init_xenheap_pages(paddr_t ps, paddr_t pe)
     if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) )
         pe -= PAGE_SIZE;
 
-    init_heap_pages(MEMZONE_XEN, maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
+    init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
 }
 
 
-void *alloc_xenheap_pages(unsigned int order)
+void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
 {
     struct page_info *pg;
 
@@ -665,15 +1020,11 @@ void *alloc_xenheap_pages(unsigned int order)
     pg = alloc_heap_pages(
         MEMZONE_XEN, MEMZONE_XEN, cpu_to_node(smp_processor_id()), order);
     if ( unlikely(pg == NULL) )
-        goto no_memory;
+        return NULL;
 
     memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));
 
     return page_to_virt(pg);
-
- no_memory:
-    printk("Cannot handle page request order %d!\n", order);
-    return NULL;
 }
 
 
@@ -686,9 +1037,53 @@ void free_xenheap_pages(void *v, unsigned int order)
 
     memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
 
-    free_heap_pages(MEMZONE_XEN, virt_to_page(v), order);
+    free_heap_pages(virt_to_page(v), order);
 }
 
+#else
+
+void init_xenheap_pages(paddr_t ps, paddr_t pe)
+{
+    init_domheap_pages(ps, pe);
+}
+
+void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
+{
+    struct page_info *pg;
+    unsigned int i;
+
+    ASSERT(!in_irq());
+
+    pg = alloc_domheap_pages(NULL, order, memflags);
+    if ( unlikely(pg == NULL) )
+        return NULL;
+
+    for ( i = 0; i < (1u << order); i++ )
+        pg[i].count_info |= PGC_xen_heap;
+
+    return page_to_virt(pg);
+}
+
+void free_xenheap_pages(void *v, unsigned int order)
+{
+    struct page_info *pg;
+    unsigned int i;
+
+    ASSERT(!in_irq());
+
+    if ( v == NULL )
+        return;
+
+    pg = virt_to_page(v);
+
+    for ( i = 0; i < (1u << order); i++ )
+        pg[i].count_info &= ~PGC_xen_heap;
+
+    free_heap_pages(pg, order);
+}
+
+#endif
+
 
 
 /*************************
@@ -697,26 +1092,14 @@ void free_xenheap_pages(void *v, unsigned int order)
 
 void init_domheap_pages(paddr_t ps, paddr_t pe)
 {
-    unsigned long s_tot, e_tot;
-    unsigned int zone;
+    unsigned long smfn, emfn;
 
     ASSERT(!in_irq());
 
-    s_tot = round_pgup(ps) >> PAGE_SHIFT;
-    e_tot = round_pgdown(pe) >> PAGE_SHIFT;
-
-    zone = fls(s_tot);
-    BUG_ON(zone <= MEMZONE_XEN + 1);
-    for ( --zone; s_tot < e_tot; ++zone )
-    {
-        unsigned long end = e_tot;
+    smfn = round_pgup(ps) >> PAGE_SHIFT;
+    emfn = round_pgdown(pe) >> PAGE_SHIFT;
 
-        BUILD_BUG_ON(NR_ZONES > BITS_PER_LONG);
-        if ( zone < BITS_PER_LONG - 1 && end > 1UL << (zone + 1) )
-            end = 1UL << (zone + 1);
-        init_heap_pages(zone, mfn_to_page(s_tot), end - s_tot);
-        s_tot = end;
-    }
+    init_heap_pages(mfn_to_page(smfn), emfn - smfn);
 }
 
 
@@ -759,7 +1142,7 @@ int assign_pages(
         page_set_owner(&pg[i], d);
         wmb(); /* Domain pointer must be visible before updating refcnt. */
         pg[i].count_info = PGC_allocated | 1;
-        list_add_tail(&pg[i].list, &d->page_list);
+        page_list_add_tail(&pg[i], &d->page_list);
     }
 
     spin_unlock(&d->page_alloc_lock);
@@ -776,7 +1159,7 @@ struct page_info *alloc_domheap_pages(
 {
     struct page_info *pg = NULL;
     unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
-    unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1);
+    unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1), dma_zone;
 
     ASSERT(!in_irq());
 
@@ -784,16 +1167,11 @@ struct page_info *alloc_domheap_pages(
         node = domain_to_node(d);
 
     bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
-    if ( bits <= (PAGE_SHIFT + 1) )
+    if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
         return NULL;
 
-    bits -= PAGE_SHIFT + 1;
-    if ( bits < zone_hi )
-        zone_hi = bits;
-
-    if ( (dma_bitsize > PAGE_SHIFT) &&
-         ((zone_hi + PAGE_SHIFT) >= dma_bitsize) )
-        pg = alloc_heap_pages(dma_bitsize - PAGE_SHIFT, zone_hi, node, order);
+    if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
+        pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order);
 
     if ( (pg == NULL) &&
          ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
@@ -802,7 +1180,7 @@ struct page_info *alloc_domheap_pages(
 
     if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
     {
-        free_heap_pages(pfn_dom_zone_type(page_to_mfn(pg)), pg, order);
+        free_heap_pages(pg, order);
         return NULL;
     }
     
@@ -822,7 +1200,7 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
         spin_lock_recursive(&d->page_alloc_lock);
 
         for ( i = 0; i < (1 << order); i++ )
-            list_del(&pg[i].list);
+            page_list_del2(&pg[i], &d->xenpage_list, &d->arch.relmem_list);
 
         d->xenheap_pages -= 1 << order;
         drop_dom_ref = (d->xenheap_pages == 0);
@@ -837,7 +1215,7 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
         for ( i = 0; i < (1 << order); i++ )
         {
             BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
-            list_del(&pg[i].list);
+            page_list_del2(&pg[i], &d->page_list, &d->arch.relmem_list);
         }
 
         d->tot_pages -= 1 << order;
@@ -847,7 +1225,7 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
 
         if ( likely(!d->is_dying) )
         {
-            free_heap_pages(pfn_dom_zone_type(page_to_mfn(pg)), pg, order);
+            free_heap_pages(pg, order);
         }
         else
         {
@@ -860,7 +1238,7 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
             {
                 page_set_owner(&pg[i], NULL);
                 spin_lock(&page_scrub_lock);
-                list_add(&pg[i].list, &page_scrub_list);
+                page_list_add(&pg[i], &page_scrub_list);
                 scrub_pages++;
                 spin_unlock(&page_scrub_lock);
             }
@@ -869,7 +1247,7 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
     else
     {
         /* Freeing anonymous domain-heap pages. */
-        free_heap_pages(pfn_dom_zone_type(page_to_mfn(pg)), pg, order);
+        free_heap_pages(pg, order);
         drop_dom_ref = 0;
     }
 
@@ -882,13 +1260,11 @@ unsigned long avail_domheap_pages_region(
 {
     int zone_lo, zone_hi;
 
-    zone_lo = min_width ? (min_width - (PAGE_SHIFT + 1)) : (MEMZONE_XEN + 1);
-    zone_lo = max_t(int, MEMZONE_XEN + 1, zone_lo);
-    zone_lo = min_t(int, NR_ZONES - 1, zone_lo);
+    zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
+    zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));
 
-    zone_hi = max_width ? (max_width - (PAGE_SHIFT + 1)) : (NR_ZONES - 1);
-    zone_hi = max_t(int, MEMZONE_XEN + 1, zone_hi);
-    zone_hi = min_t(int, NR_ZONES - 1, zone_hi);
+    zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
+    zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));
 
     return avail_heap_pages(zone_lo, zone_hi, node);
 }
@@ -945,7 +1321,7 @@ static DEFINE_PER_CPU(struct timer, page_scrub_timer);
 
 static void page_scrub_softirq(void)
 {
-    struct list_head *ent;
+    PAGE_LIST_HEAD(list);
     struct page_info  *pg;
     void             *p;
     int               i;
@@ -963,36 +1339,30 @@ static void page_scrub_softirq(void)
     do {
         spin_lock(&page_scrub_lock);
 
-        if ( unlikely((ent = page_scrub_list.next) == &page_scrub_list) )
-        {
-            spin_unlock(&page_scrub_lock);
-            goto out;
-        }
-        
         /* Peel up to 16 pages from the list. */
         for ( i = 0; i < 16; i++ )
         {
-            if ( ent->next == &page_scrub_list )
+            if ( !(pg = page_list_remove_head(&page_scrub_list)) )
                 break;
-            ent = ent->next;
+            page_list_add_tail(pg, &list);
         }
         
-        /* Remove peeled pages from the list. */
-        ent->next->prev = &page_scrub_list;
-        page_scrub_list.next = ent->next;
-        scrub_pages -= (i+1);
+        if ( unlikely(i == 0) )
+        {
+            spin_unlock(&page_scrub_lock);
+            goto out;
+        }
+
+        scrub_pages -= i;
 
         spin_unlock(&page_scrub_lock);
 
-        /* Working backwards, scrub each page in turn. */
-        while ( ent != &page_scrub_list )
-        {
-            pg = list_entry(ent, struct page_info, list);
-            ent = ent->prev;
+        /* Scrub each page in turn. */
+        while ( (pg = page_list_remove_head(&list)) ) {
             p = map_domain_page(page_to_mfn(pg));
             scrub_page(p);
             unmap_domain_page(p);
-            free_heap_pages(pfn_dom_zone_type(page_to_mfn(pg)), pg, 0);
+            free_heap_pages(pg, 0);
         }
     } while ( (NOW() - start) < MILLISECS(1) );
 
index 3ba7d3e8cd0cdc8b01288d69bd2dbc23b187a66a..d724293bc164496c59289bc58763f5ac4d6c4e8f 100644 (file)
 /*
  * CSCHED_STATS
  *
- * Manage very basic counters and stats.
+ * Manage very basic per-vCPU counters and stats.
  *
  * Useful for debugging live systems. The stats are displayed
  * with runq dumps ('r' on the Xen console).
  */
+#ifdef PERF_COUNTERS
 #define CSCHED_STATS
+#endif
 
 
 /*
 /*
  * Stats
  */
-#ifdef CSCHED_STATS
-
-#define CSCHED_STAT(_X)         (csched_priv.stats._X)
-#define CSCHED_STAT_DEFINE(_X)  uint32_t _X;
-#define CSCHED_STAT_PRINTK(_X)                                  \
-    do                                                          \
-    {                                                           \
-        printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X));  \
-    } while ( 0 );
-
-/*
- * Try and keep often cranked stats on top so they'll fit on one
- * cache line.
- */
-#define CSCHED_STATS_EXPAND_SCHED(_MACRO)   \
-    _MACRO(schedule)                        \
-    _MACRO(acct_run)                        \
-    _MACRO(acct_no_work)                    \
-    _MACRO(acct_balance)                    \
-    _MACRO(acct_reorder)                    \
-    _MACRO(acct_min_credit)                 \
-    _MACRO(acct_vcpu_active)                \
-    _MACRO(acct_vcpu_idle)                  \
-    _MACRO(vcpu_sleep)                      \
-    _MACRO(vcpu_wake_running)               \
-    _MACRO(vcpu_wake_onrunq)                \
-    _MACRO(vcpu_wake_runnable)              \
-    _MACRO(vcpu_wake_not_runnable)          \
-    _MACRO(vcpu_park)                       \
-    _MACRO(vcpu_unpark)                     \
-    _MACRO(tickle_local_idler)              \
-    _MACRO(tickle_local_over)               \
-    _MACRO(tickle_local_under)              \
-    _MACRO(tickle_local_other)              \
-    _MACRO(tickle_idlers_none)              \
-    _MACRO(tickle_idlers_some)              \
-    _MACRO(load_balance_idle)               \
-    _MACRO(load_balance_over)               \
-    _MACRO(load_balance_other)              \
-    _MACRO(steal_trylock_failed)            \
-    _MACRO(steal_peer_idle)                 \
-    _MACRO(migrate_queued)                  \
-    _MACRO(migrate_running)                 \
-    _MACRO(dom_init)                        \
-    _MACRO(dom_destroy)                     \
-    _MACRO(vcpu_init)                       \
-    _MACRO(vcpu_destroy)
-
-#ifndef NDEBUG
-#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
-    _MACRO(vcpu_check)
-#else
-#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
-#endif
-
-#define CSCHED_STATS_EXPAND(_MACRO)         \
-    CSCHED_STATS_EXPAND_CHECKS(_MACRO)      \
-    CSCHED_STATS_EXPAND_SCHED(_MACRO)
-
-#define CSCHED_STATS_RESET()                                        \
-    do                                                              \
-    {                                                               \
-        memset(&csched_priv.stats, 0, sizeof(csched_priv.stats));   \
-    } while ( 0 )
-
-#define CSCHED_STATS_DEFINE()                   \
-    struct                                      \
-    {                                           \
-        CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
-    } stats;
-
-#define CSCHED_STATS_PRINTK()                   \
-    do                                          \
-    {                                           \
-        printk("stats:\n");                     \
-        CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
-    } while ( 0 )
+#define CSCHED_STAT_CRANK(_X)               (perfc_incr(_X))
 
-#define CSCHED_STAT_CRANK(_X)               (CSCHED_STAT(_X)++)
+#ifdef CSCHED_STATS
 
 #define CSCHED_VCPU_STATS_RESET(_V)                     \
     do                                                  \
 
 #else /* CSCHED_STATS */
 
-#define CSCHED_STATS_RESET()                do {} while ( 0 )
-#define CSCHED_STATS_DEFINE()
-#define CSCHED_STATS_PRINTK()               do {} while ( 0 )
-#define CSCHED_STAT_CRANK(_X)               do {} while ( 0 )
 #define CSCHED_VCPU_STATS_RESET(_V)         do {} while ( 0 )
 #define CSCHED_VCPU_STAT_CRANK(_V, _X)      do {} while ( 0 )
 #define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    do {} while ( 0 )
@@ -238,7 +160,6 @@ struct csched_private {
     uint32_t credit;
     int credit_balance;
     uint32_t runq_sort;
-    CSCHED_STATS_DEFINE()
 };
 
 
@@ -249,15 +170,6 @@ static struct csched_private csched_priv;
 
 static void csched_tick(void *_cpu);
 
-static inline int
-__cycle_cpu(int cpu, const cpumask_t *mask)
-{
-    int nxt = next_cpu(cpu, *mask);
-    if (nxt == NR_CPUS)
-        nxt = first_cpu(*mask);
-    return nxt;
-}
-
 static inline int
 __vcpu_on_runq(struct csched_vcpu *svc)
 {
@@ -404,14 +316,37 @@ __csched_vcpu_check(struct vcpu *vc)
 #define CSCHED_VCPU_CHECK(_vc)
 #endif
 
+/*
+ * Delay, in microseconds, between migrations of a VCPU between PCPUs.
+ * This prevents rapid fluttering of a VCPU between CPUs, and reduces the
+ * implicit overheads such as cache-warming. 1ms (1000) has been measured
+ * as a good value.
+ */
+static unsigned int vcpu_migration_delay;
+integer_param("vcpu_migration_delay", vcpu_migration_delay);
+
+static inline int
+__csched_vcpu_is_cache_hot(struct vcpu *v)
+{
+    int hot = ((NOW() - v->last_run_time) <
+               ((uint64_t)vcpu_migration_delay * 1000u));
+
+    if ( hot )
+        CSCHED_STAT_CRANK(vcpu_hot);
+
+    return hot;
+}
+
 static inline int
 __csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
 {
     /*
-     * Don't pick up work that's in the peer's scheduling tail. Also only pick
-     * up work that's allowed to run on our CPU.
+     * Don't pick up work that's in the peer's scheduling tail or hot on
+     * peer PCPU. Only pick up work that's allowed to run on our CPU.
      */
-    return !vc->is_running && cpu_isset(dest_cpu, vc->cpu_affinity);
+    return !vc->is_running &&
+           !__csched_vcpu_is_cache_hot(vc) &&
+           cpu_isset(dest_cpu, vc->cpu_affinity);
 }
 
 static int
@@ -428,7 +363,7 @@ csched_cpu_pick(struct vcpu *vc)
     cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
     cpu = cpu_isset(vc->processor, cpus)
             ? vc->processor
-            : __cycle_cpu(vc->processor, &cpus);
+            : cycle_cpu(vc->processor, cpus);
     ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );
 
     /*
@@ -452,9 +387,9 @@ csched_cpu_pick(struct vcpu *vc)
     {
         cpumask_t cpu_idlers;
         cpumask_t nxt_idlers;
-        int nxt;
+        int nxt, weight_cpu, weight_nxt;
 
-        nxt = __cycle_cpu(cpu, &cpus);
+        nxt = cycle_cpu(cpu, cpus);
 
         if ( cpu_isset(cpu, cpu_core_map[nxt]) )
         {
@@ -469,7 +404,10 @@ csched_cpu_pick(struct vcpu *vc)
             cpus_and(nxt_idlers, idlers, cpu_core_map[nxt]);
         }
 
-        if ( cpus_weight(cpu_idlers) < cpus_weight(nxt_idlers) )
+        weight_cpu = cpus_weight(cpu_idlers);
+        weight_nxt = cpus_weight(nxt_idlers);
+        if ( ( (weight_cpu < weight_nxt) ^ sched_smt_power_savings )
+                && (weight_cpu != weight_nxt) )
         {
             cpu = nxt;
             cpu_clear(cpu, cpus);
@@ -1128,7 +1066,7 @@ csched_load_balance(int cpu, struct csched_vcpu *snext)
 
     while ( !cpus_empty(workers) )
     {
-        peer_cpu = __cycle_cpu(peer_cpu, &workers);
+        peer_cpu = cycle_cpu(peer_cpu, workers);
         cpu_clear(peer_cpu, workers);
 
         /*
@@ -1306,7 +1244,8 @@ csched_dump(void)
            "\tmsecs per tick     = %dms\n"
            "\tcredits per tick   = %d\n"
            "\tticks per tslice   = %d\n"
-           "\tticks per acct     = %d\n",
+           "\tticks per acct     = %d\n"
+           "\tmigration delay    = %uus\n",
            csched_priv.ncpus,
            csched_priv.master,
            csched_priv.credit,
@@ -1317,13 +1256,12 @@ csched_dump(void)
            CSCHED_MSECS_PER_TICK,
            CSCHED_CREDITS_PER_TICK,
            CSCHED_TICKS_PER_TSLICE,
-           CSCHED_TICKS_PER_ACCT);
+           CSCHED_TICKS_PER_ACCT,
+           vcpu_migration_delay);
 
     cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
     printk("idlers: %s\n", idlers_buf);
 
-    CSCHED_STATS_PRINTK();
-
     printk("active vcpus:\n");
     loop = 0;
     list_for_each( iter_sdom, &csched_priv.active_sdom )
@@ -1354,7 +1292,6 @@ csched_init(void)
     csched_priv.credit = 0U;
     csched_priv.credit_balance = 0;
     csched_priv.runq_sort = 0U;
-    CSCHED_STATS_RESET();
 }
 
 /* Tickers cannot be kicked until SMP subsystem is alive. */
index 04b09e2168841150f2072543aa06dc37ff6ed792..5e91f6c85d818744f981f29010925241bb471b8b 100644 (file)
 static char opt_sched[10] = "credit";
 string_param("sched", opt_sched);
 
+/* if sched_smt_power_savings is set,
+ * scheduler will give preferrence to partially idle package compared to
+ * the full idle package, when picking pCPU to schedule vCPU.
+ */
+int sched_smt_power_savings = 0;
+boolean_param("sched_smt_power_savings", sched_smt_power_savings);
+
 #define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */
 
 /* Various timer handlers. */
@@ -81,36 +88,66 @@ static inline void trace_runstate_change(struct vcpu *v, int new_state)
     __trace_var(event, 1/*tsc*/, sizeof(d), (unsigned char *)&d);
 }
 
+static inline void trace_continue_running(struct vcpu *v)
+{
+    struct { uint32_t vcpu:16, domain:16; } d;
+
+    if ( likely(!tb_init_done) )
+        return;
+
+    d.vcpu = v->vcpu_id;
+    d.domain = v->domain->domain_id;
+
+    __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d),
+                (unsigned char *)&d);
+}
+
 static inline void vcpu_runstate_change(
     struct vcpu *v, int new_state, s_time_t new_entry_time)
 {
+    s_time_t delta;
+
     ASSERT(v->runstate.state != new_state);
     ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock));
 
     trace_runstate_change(v, new_state);
 
-    v->runstate.time[v->runstate.state] +=
-        new_entry_time - v->runstate.state_entry_time;
-    v->runstate.state_entry_time = new_entry_time;
+    delta = new_entry_time - v->runstate.state_entry_time;
+    if ( delta > 0 )
+    {
+        v->runstate.time[v->runstate.state] += delta;
+        v->runstate.state_entry_time = new_entry_time;
+    }
+
     v->runstate.state = new_state;
 }
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
 {
-    if ( likely(v == current) )
-    {
-        /* Fast lock-free path. */
-        memcpy(runstate, &v->runstate, sizeof(*runstate));
-        ASSERT(runstate->state == RUNSTATE_running);
-        runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time;
-    }
-    else
-    {
+    s_time_t delta;
+
+    if ( unlikely(v != current) )
         vcpu_schedule_lock_irq(v);
-        memcpy(runstate, &v->runstate, sizeof(*runstate));
-        runstate->time[runstate->state] += NOW() - runstate->state_entry_time;
+
+    memcpy(runstate, &v->runstate, sizeof(*runstate));
+    delta = NOW() - runstate->state_entry_time;
+    if ( delta > 0 )
+        runstate->time[runstate->state] += delta;
+
+    if ( unlikely(v != current) )
         vcpu_schedule_unlock_irq(v);
-    }
+}
+
+uint64_t get_cpu_idle_time(unsigned int cpu)
+{
+    struct vcpu_runstate_info state;
+    struct vcpu *v;
+
+    if ( (v = idle_vcpu[cpu]) == NULL )
+        return 0;
+
+    vcpu_runstate_get(v, &state);
+    return state.time[RUNSTATE_running];
 }
 
 int sched_init_vcpu(struct vcpu *v, unsigned int processor) 
@@ -787,6 +824,7 @@ static void schedule(void)
     if ( unlikely(prev == next) )
     {
         spin_unlock_irq(&sd->schedule_lock);
+        trace_continue_running(next);
         return continue_running(prev);
     }
 
@@ -805,6 +843,7 @@ static void schedule(void)
         (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
          (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
         now);
+    prev->last_run_time = now;
 
     ASSERT(next->runstate.state != RUNSTATE_running);
     vcpu_runstate_change(next, RUNSTATE_running, now);
@@ -910,6 +949,8 @@ void dump_runq(unsigned char key)
 
     printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
     SCHED_OP(dump_settings);
+    printk("sched_smt_power_savings: %s\n",
+            sched_smt_power_savings? "enabled":"disabled");
     printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);
 
     for_each_online_cpu ( i )
diff --git a/xen/common/spinlock.c b/xen/common/spinlock.c
new file mode 100644 (file)
index 0000000..002f82e
--- /dev/null
@@ -0,0 +1,223 @@
+#include <xen/config.h>
+#include <xen/irq.h>
+#include <xen/smp.h>
+#include <xen/spinlock.h>
+
+#ifndef NDEBUG
+
+static atomic_t spin_debug __read_mostly = ATOMIC_INIT(0);
+
+static void check_lock(struct lock_debug *debug)
+{
+    int irq_safe = !local_irq_is_enabled();
+
+    if ( unlikely(atomic_read(&spin_debug) <= 0) )
+        return;
+
+    /* A few places take liberties with this. */
+    /* BUG_ON(in_irq() && !irq_safe); */
+
+    if ( unlikely(debug->irq_safe != irq_safe) )
+    {
+        int seen = cmpxchg(&debug->irq_safe, -1, irq_safe);
+        BUG_ON(seen == !irq_safe);
+    }
+}
+
+void spin_debug_enable(void)
+{
+    atomic_inc(&spin_debug);
+}
+
+void spin_debug_disable(void)
+{
+    atomic_dec(&spin_debug);
+}
+
+#else /* defined(NDEBUG) */
+
+#define check_lock(l) ((void)0)
+
+#endif
+
+void _spin_lock(spinlock_t *lock)
+{
+    check_lock(&lock->debug);
+    _raw_spin_lock(&lock->raw);
+}
+
+void _spin_lock_irq(spinlock_t *lock)
+{
+    ASSERT(local_irq_is_enabled());
+    local_irq_disable();
+    check_lock(&lock->debug);
+    _raw_spin_lock(&lock->raw);
+}
+
+unsigned long _spin_lock_irqsave(spinlock_t *lock)
+{
+    unsigned long flags;
+    local_irq_save(flags);
+    check_lock(&lock->debug);
+    _raw_spin_lock(&lock->raw);
+    return flags;
+}
+
+void _spin_unlock(spinlock_t *lock)
+{
+    _raw_spin_unlock(&lock->raw);
+}
+
+void _spin_unlock_irq(spinlock_t *lock)
+{
+    _raw_spin_unlock(&lock->raw);
+    local_irq_enable();
+}
+
+void _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+{
+    _raw_spin_unlock(&lock->raw);
+    local_irq_restore(flags);
+}
+
+int _spin_is_locked(spinlock_t *lock)
+{
+    check_lock(&lock->debug);
+    return _raw_spin_is_locked(&lock->raw);
+}
+
+int _spin_trylock(spinlock_t *lock)
+{
+    check_lock(&lock->debug);
+    return _raw_spin_trylock(&lock->raw);
+}
+
+void _spin_barrier(spinlock_t *lock)
+{
+    check_lock(&lock->debug);
+    do { mb(); } while ( _raw_spin_is_locked(&lock->raw) );
+    mb();
+}
+
+void _spin_barrier_irq(spinlock_t *lock)
+{
+    unsigned long flags;
+    local_irq_save(flags);
+    _spin_barrier(lock);
+    local_irq_restore(flags);
+}
+
+void _spin_lock_recursive(spinlock_t *lock)
+{
+    int cpu = smp_processor_id();
+
+    /* Don't allow overflow of recurse_cpu field. */
+    BUILD_BUG_ON(NR_CPUS > 0xfffu);
+
+    check_lock(&lock->debug);
+
+    if ( likely(lock->recurse_cpu != cpu) )
+    {
+        spin_lock(lock);
+        lock->recurse_cpu = cpu;
+    }
+
+    /* We support only fairly shallow recursion, else the counter overflows. */
+    ASSERT(lock->recurse_cnt < 0xfu);
+    lock->recurse_cnt++;
+}
+
+void _spin_unlock_recursive(spinlock_t *lock)
+{
+    if ( likely(--lock->recurse_cnt == 0) )
+    {
+        lock->recurse_cpu = 0xfffu;
+        spin_unlock(lock);
+    }
+}
+
+void _read_lock(rwlock_t *lock)
+{
+    check_lock(&lock->debug);
+    _raw_read_lock(&lock->raw);
+}
+
+void _read_lock_irq(rwlock_t *lock)
+{
+    ASSERT(local_irq_is_enabled());
+    local_irq_disable();
+    check_lock(&lock->debug);
+    _raw_read_lock(&lock->raw);
+}
+
+unsigned long _read_lock_irqsave(rwlock_t *lock)
+{
+    unsigned long flags;
+    local_irq_save(flags);
+    check_lock(&lock->debug);
+    _raw_read_lock(&lock->raw);
+    return flags;
+}
+
+void _read_unlock(rwlock_t *lock)
+{
+    _raw_read_unlock(&lock->raw);
+}
+
+void _read_unlock_irq(rwlock_t *lock)
+{
+    _raw_read_unlock(&lock->raw);
+    local_irq_enable();
+}
+
+void _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+{
+    _raw_read_unlock(&lock->raw);
+    local_irq_restore(flags);
+}
+
+void _write_lock(rwlock_t *lock)
+{
+    check_lock(&lock->debug);
+    _raw_write_lock(&lock->raw);
+}
+
+void _write_lock_irq(rwlock_t *lock)
+{
+    ASSERT(local_irq_is_enabled());
+    local_irq_disable();
+    check_lock(&lock->debug);
+    _raw_write_lock(&lock->raw);
+}
+
+unsigned long _write_lock_irqsave(rwlock_t *lock)
+{
+    unsigned long flags;
+    local_irq_save(flags);
+    check_lock(&lock->debug);
+    _raw_write_lock(&lock->raw);
+    return flags;
+}
+
+void _write_unlock(rwlock_t *lock)
+{
+    _raw_write_unlock(&lock->raw);
+}
+
+void _write_unlock_irq(rwlock_t *lock)
+{
+    _raw_write_unlock(&lock->raw);
+    local_irq_enable();
+}
+
+void _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+{
+    _raw_write_unlock(&lock->raw);
+    local_irq_restore(flags);
+}
+
+int _rw_is_locked(rwlock_t *lock)
+{
+    check_lock(&lock->debug);
+    return _raw_rw_is_locked(&lock->raw);
+}
index c3f1b427167d7372dd4ff94b788637b9a37b1247..70da6264165da870f1efe9dbb7e89cc5e7e61499 100644 (file)
@@ -26,6 +26,7 @@
 #include <xsm/xsm.h>
 
 extern int do_get_pm_info(struct xen_sysctl_get_pmstat *op);
+extern int do_pm_op(struct xen_sysctl_pm_op *op);
 
 extern long arch_do_sysctl(
     struct xen_sysctl *op, XEN_GUEST_HANDLE(xen_sysctl_t) u_sysctl);
@@ -166,7 +167,6 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysctl_t) u_sysctl)
     {
         uint32_t i, nr_cpus;
         struct xen_sysctl_cpuinfo cpuinfo;
-        struct vcpu *v;
 
         nr_cpus = min_t(uint32_t, op->u.getcpuinfo.max_cpus, NR_CPUS);
 
@@ -176,13 +176,7 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysctl_t) u_sysctl)
 
         for ( i = 0; i < nr_cpus; i++ )
         {
-            /* Assume no holes in idle-vcpu map. */
-            if ( (v = idle_vcpu[i]) == NULL )
-                break;
-
-            cpuinfo.idletime = v->runstate.time[RUNSTATE_running];
-            if ( v->is_running )
-                cpuinfo.idletime += NOW() - v->runstate.state_entry_time;
+            cpuinfo.idletime = get_cpu_idle_time(i);
 
             ret = -EFAULT;
             if ( copy_to_guest_offset(op->u.getcpuinfo.info, i, &cpuinfo, 1) )
@@ -224,6 +218,77 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysctl_t) u_sysctl)
     }
     break;
 
+    case XEN_SYSCTL_pm_op:
+    {
+        ret = do_pm_op(&op->u.pm_op);
+        if ( ret && (ret != -EAGAIN) )
+            break;
+
+        if ( copy_to_guest(u_sysctl, op, 1) )
+        {
+            ret = -EFAULT;
+            break;
+        }
+    }
+    break;
+
+    case XEN_SYSCTL_page_offline_op:
+    {
+        uint32_t *status, *ptr;
+        unsigned long pfn;
+
+        ptr = status = xmalloc_bytes( sizeof(uint32_t) *
+                                (op->u.page_offline.end -
+                                  op->u.page_offline.start + 1));
+        if ( !status )
+        {
+            dprintk(XENLOG_WARNING, "Out of memory for page offline op\n");
+            ret = -ENOMEM;
+            break;
+        }
+
+        memset(status, PG_OFFLINE_INVALID, sizeof(uint32_t) *
+                      (op->u.page_offline.end - op->u.page_offline.start + 1));
+
+        for ( pfn = op->u.page_offline.start;
+              pfn <= op->u.page_offline.end;
+              pfn ++ )
+        {
+            switch ( op->u.page_offline.cmd )
+            {
+                /* Shall revert her if failed, or leave caller do it? */
+                case sysctl_page_offline:
+                    ret = offline_page(pfn, 0, ptr++);
+                    break;
+                case sysctl_page_online:
+                    ret = online_page(pfn, ptr++);
+                    break;
+                case sysctl_query_page_offline:
+                    ret = query_page_offline(pfn, ptr++);
+                    break;
+                default:
+                    gdprintk(XENLOG_WARNING, "invalid page offline op %x\n",
+                            op->u.page_offline.cmd);
+                    ret = -EINVAL;
+                    break;
+            }
+
+            if (ret)
+                break;
+        }
+
+        if ( copy_to_guest(
+            op->u.page_offline.status, status,
+            op->u.page_offline.end - op->u.page_offline.start + 1) )
+        {
+            ret = -EFAULT;
+            break;
+        }
+
+        xfree(status);
+    }
+    break;
+
     default:
         ret = arch_do_sysctl(op, u_sysctl);
         break;
index 8bfef0d760f530fb4224fe4c3d0800e6905128b4..eca25b3fb75664f0778daf7987fbf1dc97fa1509 100644 (file)
  * We pull handlers off the timer list this far in future,
  * rather than reprogramming the time hardware.
  */
-#define TIMER_SLOP (50*1000) /* ns */
+static unsigned int timer_slop __read_mostly = 50000; /* 50 us */
+integer_param("timer_slop", timer_slop);
 
 struct timers {
     spinlock_t     lock;
+    bool_t         overflow;
     struct timer **heap;
     struct timer  *list;
     struct timer  *running;
@@ -114,34 +116,19 @@ static int remove_from_heap(struct timer **heap, struct timer *t)
 
 
 /* Add new entry @t to @heap. Return TRUE if new top of heap. */
-static int add_to_heap(struct timer ***pheap, struct timer *t)
+static int add_to_heap(struct timer **heap, struct timer *t)
 {
-    struct timer **heap = *pheap;
     int sz = GET_HEAP_SIZE(heap);
 
-    /* Copy the heap if it is full. */
+    /* Fail if the heap is full. */
     if ( unlikely(sz == GET_HEAP_LIMIT(heap)) )
-    {
-        /* old_limit == (2^n)-1; new_limit == (2^(n+4))-1 */
-        int old_limit = GET_HEAP_LIMIT(heap);
-        int new_limit = ((old_limit + 1) << 4) - 1;
-        if ( in_irq() )
-            goto out;
-        heap = xmalloc_array(struct timer *, new_limit + 1);
-        if ( heap == NULL )
-            goto out;
-        memcpy(heap, *pheap, (old_limit + 1) * sizeof(*heap));
-        SET_HEAP_LIMIT(heap, new_limit);
-        if ( old_limit != 0 )
-            xfree(*pheap);
-        *pheap = heap;
-    }
+        return 0;
 
     SET_HEAP_SIZE(heap, ++sz);
     heap[sz] = t;
     t->heap_offset = sz;
     up_heap(heap, sz);
- out:
+
     return (t->heap_offset == 1);
 }
 
@@ -210,11 +197,12 @@ static int add_entry(struct timers *timers, struct timer *t)
     /* Try to add to heap. t->heap_offset indicates whether we succeed. */
     t->heap_offset = 0;
     t->status = TIMER_STATUS_in_heap;
-    rc = add_to_heap(&timers->heap, t);
+    rc = add_to_heap(timers->heap, t);
     if ( t->heap_offset != 0 )
         return rc;
 
     /* Fall back to adding to the slower linked list. */
+    timers->overflow = 1;
     t->status = TIMER_STATUS_in_list;
     return add_to_list(&timers->list, t);
 }
@@ -273,6 +261,7 @@ void set_timer(struct timer *timer, s_time_t expires)
         __stop_timer(timer);
 
     timer->expires = expires;
+    timer->expires_end = expires + timer_slop;
 
     if ( likely(timer->status != TIMER_STATUS_killed) )
         __add_timer(timer);
@@ -359,19 +348,70 @@ void kill_timer(struct timer *timer)
 }
 
 
+static void execute_timer(struct timers *ts, struct timer *t)
+{
+    void (*fn)(void *) = t->function;
+    void *data = t->data;
+
+    ts->running = t;
+    spin_unlock_irq(&ts->lock);
+    (*fn)(data);
+    spin_lock_irq(&ts->lock);
+    ts->running = NULL;
+}
+
+
 static void timer_softirq_action(void)
 {
     struct timer  *t, **heap, *next;
     struct timers *ts;
-    s_time_t       now, deadline;
-    void         (*fn)(void *);
-    void          *data;
+    s_time_t       now;
 
     ts = &this_cpu(timers);
+    heap = ts->heap;
+
+    /* If we overflowed the heap, try to allocate a larger heap. */
+    if ( unlikely(ts->overflow) )
+    {
+        /* old_limit == (2^n)-1; new_limit == (2^(n+4))-1 */
+        int old_limit = GET_HEAP_LIMIT(heap);
+        int new_limit = ((old_limit + 1) << 4) - 1;
+        struct timer **newheap = xmalloc_array(struct timer *, new_limit + 1);
+        if ( newheap != NULL )
+        {
+            spin_lock_irq(&ts->lock);
+            memcpy(newheap, heap, (old_limit + 1) * sizeof(*heap));
+            SET_HEAP_LIMIT(newheap, new_limit);
+            ts->heap = newheap;
+            spin_unlock_irq(&ts->lock);
+            if ( old_limit != 0 )
+                xfree(heap);
+            heap = newheap;
+        }
+    }
 
     spin_lock_irq(&ts->lock);
 
-    /* Try to move timers from overflow linked list to more efficient heap. */
+    now = NOW();
+
+    /* Execute ready heap timers. */
+    while ( (GET_HEAP_SIZE(heap) != 0) &&
+            ((t = heap[1])->expires < now) )
+    {
+        remove_from_heap(heap, t);
+        t->status = TIMER_STATUS_inactive;
+        execute_timer(ts, t);
+    }
+
+    /* Execute ready list timers. */
+    while ( ((t = ts->list) != NULL) && (t->expires < now) )
+    {
+        ts->list = t->list_next;
+        t->status = TIMER_STATUS_inactive;
+        execute_timer(ts, t);
+    }
+
+    /* Try to move timers from linked list to more efficient heap. */
     next = ts->list;
     ts->list = NULL;
     while ( unlikely((t = next) != NULL) )
@@ -380,56 +420,45 @@ static void timer_softirq_action(void)
         t->status = TIMER_STATUS_inactive;
         add_entry(ts, t);
     }
-    
-    heap = ts->heap;
-    now  = NOW();
 
-    while ( (GET_HEAP_SIZE(heap) != 0) &&
-            ((t = heap[1])->expires < (now + TIMER_SLOP)) )
+    ts->overflow = (ts->list != NULL);
+    if ( unlikely(ts->overflow) )
     {
-        remove_entry(ts, t);
-
-        ts->running = t;
-
-        fn   = t->function;
-        data = t->data;
-
-        spin_unlock_irq(&ts->lock);
-        (*fn)(data);
-        spin_lock_irq(&ts->lock);
-
-        /* Heap may have grown while the lock was released. */
-        heap = ts->heap;
+        /* Find earliest deadline at head of list or top of heap. */
+        this_cpu(timer_deadline) = ts->list->expires;
+        if ( (GET_HEAP_SIZE(heap) != 0) &&
+             ((t = heap[1])->expires < this_cpu(timer_deadline)) )
+            this_cpu(timer_deadline) = t->expires;
     }
-
-    deadline = GET_HEAP_SIZE(heap) ? heap[1]->expires : 0;
-
-    while ( unlikely((t = ts->list) != NULL) )
+    else
     {
-        if ( t->expires >= (now + TIMER_SLOP) )
+        /*
+         * Find the earliest deadline that encompasses largest number of timers
+         * on the heap. To do this we take timers from the heap while their
+         * valid deadline ranges continue to intersect.
+         */
+        s_time_t start = 0, end = STIME_MAX;
+        struct timer **list_tail = &ts->list;
+
+        while ( (GET_HEAP_SIZE(heap) != 0) &&
+                ((t = heap[1])->expires <= end) )
         {
-            if ( (deadline == 0) || (deadline > t->expires) )
-                deadline = t->expires;
-            break;
-        }
-
-        ts->list = t->list_next;
-        t->status = TIMER_STATUS_inactive;
+            remove_entry(ts, t);
 
-        ts->running = t;
+            t->status = TIMER_STATUS_in_list;
+            t->list_next = NULL;
+            *list_tail = t;
+            list_tail = &t->list_next;
 
-        fn   = t->function;
-        data = t->data;
+            start = t->expires;
+            if ( end > t->expires_end )
+                end = t->expires_end;
+        }
 
-        spin_unlock_irq(&ts->lock);
-        (*fn)(data);
-        spin_lock_irq(&ts->lock);
+        this_cpu(timer_deadline) = start;
     }
 
-    ts->running = NULL;
-
-    this_cpu(timer_deadline) = deadline;
-    if ( !reprogram_timer(deadline) )
+    if ( !reprogram_timer(this_cpu(timer_deadline)) )
         raise_softirq(TIMER_SOFTIRQ);
 
     spin_unlock_irq(&ts->lock);
@@ -444,6 +473,13 @@ void process_pending_timers(void)
         timer_softirq_action();
 }
 
+s_time_t align_timer(s_time_t firsttick, uint64_t period)
+{
+    if ( !period )
+        return firsttick;
+
+    return firsttick + (period - 1) - ((firsttick - 1) % period);
+}
 
 static void dump_timerq(unsigned char key)
 {
@@ -465,12 +501,14 @@ static void dump_timerq(unsigned char key)
         for ( j = 1; j <= GET_HEAP_SIZE(ts->heap); j++ )
         {
             t = ts->heap[j];
-            printk ("  %d : %p ex=0x%08X%08X %p\n",
-                    j, t, (u32)(t->expires>>32), (u32)t->expires, t->data);
+            printk ("  %d : %p ex=0x%08X%08X %p %p\n",
+                    j, t, (u32)(t->expires>>32), (u32)t->expires,
+                    t->data, t->function);
         }
         for ( t = ts->list, j = 0; t != NULL; t = t->list_next, j++ )
-            printk (" L%d : %p ex=0x%08X%08X %p\n",
-                    j, t, (u32)(t->expires>>32), (u32)t->expires, t->data);
+            printk (" L%d : %p ex=0x%08X%08X %p %p\n",
+                    j, t, (u32)(t->expires>>32), (u32)t->expires,
+                    t->data, t->function);
         spin_unlock_irqrestore(&ts->lock, flags);
         printk("\n");
     }
index e41cfb90a6054da08aceb95ae692264c9937a89e..38113d21df994ce3875e67b6623a869c3f130680 100644 (file)
 #define xen_t_buf t_buf
 CHECK_t_buf;
 #undef xen_t_buf
-#define TB_COMPAT IS_COMPAT(dom0)
 #else
 #define compat_t_rec t_rec
-#define TB_COMPAT 0
 #endif
 
 /* opt_tbuf_size: trace buffer size (in pages) */
@@ -94,7 +92,7 @@ static int alloc_trace_bufs(void)
     order    = get_order_from_pages(nr_pages);
     data_size  = (opt_tbuf_size * PAGE_SIZE - sizeof(struct t_buf));
     
-    if ( (rawbuf = alloc_xenheap_pages(order)) == NULL )
+    if ( (rawbuf = alloc_xenheap_pages(order, 0)) == NULL )
     {
         printk("Xen trace buffers: memory allocation failed\n");
         opt_tbuf_size = 0;
index c1e32bb0de2b97d94d9cdfd76ba575936d607870..247545494d3c0036c9c1b4fa4479b00d3073804a 100644 (file)
@@ -51,24 +51,16 @@ xencomm_get_page(unsigned long paddr, struct page_info **page)
         return -EFAULT;
         
     *page = maddr_to_page(maddr);
-    if ( get_page(*page, current->domain) == 0 )
+    if ( !get_page(*page, current->domain) )
     {
-        if ( page_get_owner(*page) != current->domain )
-        {
-            /*
-             * This page might be a page granted by another domain, or
-             * this page is freed with decrease reservation hypercall at
-             * the same time.
-             */
-            gdprintk(XENLOG_WARNING,
-                     "bad page is passed. paddr 0x%lx maddr 0x%lx\n",
-                     paddr, maddr);
-            return -EFAULT;
-        }
-
-        /* Try again. */
-        cpu_relax();
-        return -EAGAIN;
+        /*
+         * This page might be a page granted by another domain, or this page 
+         * is freed with decrease reservation hypercall at the same time.
+         */
+        gdprintk(XENLOG_WARNING,
+                 "bad page is passed. paddr 0x%lx maddr 0x%lx\n",
+                 paddr, maddr);
+        return -EFAULT;
     }
 
     return 0;
index f41ebb90699bbe510ab9a2acd568a2bd2701b648..0a33613caaf2d3902ebbc0bf649ccd22504f903a 100644 (file)
@@ -85,7 +85,7 @@ int is_active(struct domain *d)
     return ((x != NULL) && (x->domain_type == XENOPROF_DOMAIN_ACTIVE));
 }
 
-static int is_passive(struct domain *d)
+int is_passive(struct domain *d)
 {
     struct xenoprof *x = d->xenoprof;
     return ((x != NULL) && (x->domain_type == XENOPROF_DOMAIN_PASSIVE));
@@ -136,23 +136,23 @@ share_xenoprof_page_with_guest(struct domain *d, unsigned long mfn, int npages)
 {
     int i;
 
-   /* Check if previous page owner has released the page. */
-   for ( i = 0; i < npages; i++ )
-   {
-       struct page_info *page = mfn_to_page(mfn + i);
-       if ( (page->count_info & (PGC_allocated|PGC_count_mask)) != 0 )
-       {
-           gdprintk(XENLOG_INFO, "mfn 0x%lx page->count_info 0x%x\n",
-                    mfn + i, page->count_info);
-           return -EBUSY;
-       }
-       page_set_owner(page, NULL);
-   }
-
-   for ( i = 0; i < npages; i++ )
-       share_xen_page_with_guest(mfn_to_page(mfn + i), d, XENSHARE_writable);
-
-   return 0;
+    /* Check if previous page owner has released the page. */
+    for ( i = 0; i < npages; i++ )
+    {
+        struct page_info *page = mfn_to_page(mfn + i);
+        if ( (page->count_info & (PGC_allocated|PGC_count_mask)) != 0 )
+        {
+            gdprintk(XENLOG_INFO, "mfn 0x%lx page->count_info 0x%lx\n",
+                     mfn + i, (unsigned long)page->count_info);
+            return -EBUSY;
+        }
+        page_set_owner(page, NULL);
+    }
+
+    for ( i = 0; i < npages; i++ )
+        share_xen_page_with_guest(mfn_to_page(mfn + i), d, XENSHARE_writable);
+
+    return 0;
 }
 
 static void
@@ -208,7 +208,7 @@ static int alloc_xenoprof_struct(
     bufsize = sizeof(struct xenoprof_buf);
     i = sizeof(struct event_log);
 #ifdef CONFIG_COMPAT
-    d->xenoprof->is_compat = IS_COMPAT(is_passive ? dom0 : d);
+    d->xenoprof->is_compat = is_pv_32on64_domain(is_passive ? dom0 : d);
     if ( XENOPROF_COMPAT(d->xenoprof) )
     {
         bufsize = sizeof(struct compat_oprof_buf);
@@ -225,7 +225,7 @@ static int alloc_xenoprof_struct(
     bufsize += (max_samples - 1) * i;
     npages = (nvcpu * bufsize - 1) / PAGE_SIZE + 1;
 
-    d->xenoprof->rawbuf = alloc_xenheap_pages(get_order_from_pages(npages));
+    d->xenoprof->rawbuf = alloc_xenheap_pages(get_order_from_pages(npages), 0);
     if ( d->xenoprof->rawbuf == NULL )
     {
         xfree(d->xenoprof);
@@ -681,6 +681,8 @@ int do_xenoprof_op(int op, XEN_GUEST_HANDLE(void) arg)
     {
     case XENOPROF_init:
         ret = xenoprof_op_init(arg);
+        if ( !ret )
+            xenoprof_state = XENOPROF_INITIALIZED;
         break;
 
     case XENOPROF_get_buffer:
@@ -693,21 +695,19 @@ int do_xenoprof_op(int op, XEN_GUEST_HANDLE(void) arg)
         break;
 
     case XENOPROF_reset_active_list:
-    {
         reset_active_list();
         ret = 0;
         break;
-    }
+
     case XENOPROF_reset_passive_list:
-    {
         reset_passive_list();
         ret = 0;
         break;
-    }
+
     case XENOPROF_set_active:
     {
         domid_t domid;
-        if ( xenoprof_state != XENOPROF_IDLE )
+        if ( xenoprof_state != XENOPROF_INITIALIZED )
         {
             ret = -EPERM;
             break;
@@ -720,18 +720,18 @@ int do_xenoprof_op(int op, XEN_GUEST_HANDLE(void) arg)
         ret = add_active_list(domid);
         break;
     }
+
     case XENOPROF_set_passive:
-    {
-        if ( xenoprof_state != XENOPROF_IDLE )
+        if ( xenoprof_state != XENOPROF_INITIALIZED )
         {
             ret = -EPERM;
             break;
         }
         ret = add_passive_list(arg);
         break;
-    }
+
     case XENOPROF_reserve_counters:
-        if ( xenoprof_state != XENOPROF_IDLE )
+        if ( xenoprof_state != XENOPROF_INITIALIZED )
         {
             ret = -EPERM;
             break;
@@ -748,7 +748,6 @@ int do_xenoprof_op(int op, XEN_GUEST_HANDLE(void) arg)
             ret = -EPERM;
             break;
         }
-
         ret = xenoprof_arch_counter(arg);
         break;
 
@@ -766,8 +765,14 @@ int do_xenoprof_op(int op, XEN_GUEST_HANDLE(void) arg)
     case XENOPROF_enable_virq:
     {
         int i;
+
         if ( current->domain == xenoprof_primary_profiler )
         {
+            if ( xenoprof_state != XENOPROF_READY )
+            {
+                ret = -EPERM;
+                break;
+            }
             xenoprof_arch_enable_virq();
             xenoprof_reset_stat();
             for ( i = 0; i < pdomains; i++ )
@@ -835,7 +840,7 @@ int do_xenoprof_op(int op, XEN_GUEST_HANDLE(void) arg)
         if ( (xenoprof_state == XENOPROF_COUNTERS_RESERVED) ||
              (xenoprof_state == XENOPROF_READY) )
         {
-            xenoprof_state = XENOPROF_IDLE;
+            xenoprof_state = XENOPROF_INITIALIZED;
             xenoprof_arch_release_counters();
             xenoprof_arch_disable_virq();
             reset_passive_list();
@@ -845,7 +850,7 @@ int do_xenoprof_op(int op, XEN_GUEST_HANDLE(void) arg)
 
     case XENOPROF_shutdown:
         ret = -EPERM;
-        if ( xenoprof_state == XENOPROF_IDLE )
+        if ( xenoprof_state == XENOPROF_INITIALIZED )
         {
             activated = 0;
             adomains=0;
index 032755b74c764266f604656b00639746a2cff747..7a476e8fb7facefb68225faa9e21968cdbb97804 100644 (file)
@@ -300,7 +300,7 @@ struct xmem_pool *xmem_pool_create(
     pool_bytes = ROUNDUP_SIZE(sizeof(*pool));
     pool_order = get_order_from_bytes(pool_bytes);
 
-    pool = (void *)alloc_xenheap_pages(pool_order);
+    pool = (void *)alloc_xenheap_pages(pool_order, 0);
     if ( pool == NULL )
         return NULL;
     memset(pool, 0, pool_bytes);
@@ -505,12 +505,12 @@ static struct xmem_pool *xenpool;
 static void *xmalloc_pool_get(unsigned long size)
 {
     ASSERT(size == PAGE_SIZE);
-    return alloc_xenheap_pages(0);
+    return alloc_xenheap_page();
 }
 
 static void xmalloc_pool_put(void *p)
 {
-    free_xenheap_pages(p,0);
+    free_xenheap_page(p);
 }
 
 static void *xmalloc_whole_pages(unsigned long size)
@@ -518,7 +518,7 @@ static void *xmalloc_whole_pages(unsigned long size)
     struct bhdr *b;
     unsigned int pageorder = get_order_from_bytes(size + BHDR_OVERHEAD);
 
-    b = alloc_xenheap_pages(pageorder);
+    b = alloc_xenheap_pages(pageorder, 0);
     if ( b == NULL )
         return NULL;
 
diff --git a/xen/crypto/Makefile b/xen/crypto/Makefile
new file mode 100644 (file)
index 0000000..db29655
--- /dev/null
@@ -0,0 +1,2 @@
+obj-y += rijndael.o
+obj-y += vmac.o
diff --git a/xen/crypto/rijndael.c b/xen/crypto/rijndael.c
new file mode 100644 (file)
index 0000000..f749618
--- /dev/null
@@ -0,0 +1,1269 @@
+/*     $OpenBSD: rijndael.c,v 1.19 2008/06/09 07:49:45 djm Exp $ */
+
+/**
+ * rijndael-alg-fst.c
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* start for Xen */
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/lib.h>
+#include <crypto/rijndael.h>
+/* end for Xen */
+
+#undef FULL_UNROLL
+
+/*
+Te0[x] = S [x].[02, 01, 01, 03];
+Te1[x] = S [x].[03, 02, 01, 01];
+Te2[x] = S [x].[01, 03, 02, 01];
+Te3[x] = S [x].[01, 01, 03, 02];
+Te4[x] = S [x].[01, 01, 01, 01];
+
+Td0[x] = Si[x].[0e, 09, 0d, 0b];
+Td1[x] = Si[x].[0b, 0e, 09, 0d];
+Td2[x] = Si[x].[0d, 0b, 0e, 09];
+Td3[x] = Si[x].[09, 0d, 0b, 0e];
+Td4[x] = Si[x].[01, 01, 01, 01];
+*/
+
+static const u32 Te0[256] = {
+    0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
+    0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
+    0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
+    0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
+    0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
+    0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
+    0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
+    0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
+    0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
+    0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
+    0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
+    0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
+    0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
+    0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
+    0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
+    0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
+    0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
+    0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
+    0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
+    0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
+    0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
+    0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
+    0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
+    0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
+    0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
+    0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
+    0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
+    0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
+    0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
+    0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
+    0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
+    0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
+    0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
+    0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
+    0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
+    0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
+    0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
+    0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
+    0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
+    0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
+    0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
+    0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
+    0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
+    0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
+    0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
+    0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
+    0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
+    0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
+    0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
+    0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
+    0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
+    0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
+    0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
+    0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
+    0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
+    0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
+    0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
+    0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
+    0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
+    0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
+    0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
+    0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
+    0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
+    0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
+};
+static const u32 Te1[256] = {
+    0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
+    0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
+    0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
+    0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
+    0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
+    0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
+    0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
+    0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
+    0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
+    0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
+    0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
+    0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
+    0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
+    0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
+    0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
+    0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
+    0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
+    0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
+    0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
+    0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
+    0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
+    0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
+    0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
+    0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
+    0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
+    0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
+    0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
+    0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
+    0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
+    0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
+    0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
+    0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
+    0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
+    0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
+    0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
+    0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
+    0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
+    0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
+    0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
+    0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
+    0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
+    0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
+    0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
+    0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
+    0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
+    0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
+    0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
+    0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
+    0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
+    0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
+    0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
+    0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
+    0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
+    0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
+    0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
+    0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
+    0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
+    0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
+    0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
+    0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
+    0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
+    0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
+    0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
+    0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
+};
+static const u32 Te2[256] = {
+    0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
+    0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
+    0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
+    0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
+    0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
+    0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
+    0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
+    0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
+    0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
+    0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
+    0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
+    0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
+    0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
+    0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
+    0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
+    0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
+    0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
+    0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
+    0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
+    0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
+    0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
+    0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
+    0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
+    0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
+    0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
+    0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
+    0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
+    0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
+    0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
+    0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
+    0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
+    0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
+    0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
+    0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
+    0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
+    0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
+    0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
+    0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
+    0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
+    0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
+    0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
+    0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
+    0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
+    0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
+    0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
+    0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
+    0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
+    0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
+    0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
+    0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
+    0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
+    0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
+    0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
+    0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
+    0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
+    0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
+    0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
+    0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
+    0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
+    0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
+    0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
+    0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
+    0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
+    0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
+};
+static const u32 Te3[256] = {
+    0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
+    0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
+    0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
+    0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
+    0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
+    0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
+    0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
+    0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
+    0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
+    0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
+    0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
+    0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
+    0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
+    0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
+    0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
+    0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
+    0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
+    0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
+    0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
+    0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
+    0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
+    0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
+    0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
+    0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
+    0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
+    0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
+    0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
+    0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
+    0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
+    0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
+    0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
+    0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
+    0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
+    0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
+    0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
+    0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
+    0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
+    0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
+    0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
+    0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
+    0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
+    0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
+    0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
+    0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
+    0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
+    0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
+    0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
+    0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
+    0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
+    0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
+    0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
+    0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
+    0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
+    0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
+    0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
+    0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
+    0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
+    0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
+    0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
+    0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
+    0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
+    0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
+    0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
+    0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
+};
+static const u32 Te4[256] = {
+    0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,
+    0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,
+    0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,
+    0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,
+    0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,
+    0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,
+    0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,
+    0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,
+    0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,
+    0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,
+    0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,
+    0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,
+    0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,
+    0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,
+    0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,
+    0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,
+    0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,
+    0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,
+    0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,
+    0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,
+    0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,
+    0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,
+    0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,
+    0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,
+    0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,
+    0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,
+    0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,
+    0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,
+    0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,
+    0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,
+    0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,
+    0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,
+    0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,
+    0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,
+    0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,
+    0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,
+    0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,
+    0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,
+    0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,
+    0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,
+    0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,
+    0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,
+    0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,
+    0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,
+    0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,
+    0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,
+    0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,
+    0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,
+    0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,
+    0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,
+    0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,
+    0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,
+    0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,
+    0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,
+    0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,
+    0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,
+    0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,
+    0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,
+    0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,
+    0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,
+    0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,
+    0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,
+    0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,
+    0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U,
+};
+static const u32 Td0[256] = {
+    0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
+    0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
+    0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
+    0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
+    0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
+    0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
+    0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
+    0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
+    0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
+    0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
+    0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
+    0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
+    0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
+    0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
+    0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
+    0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
+    0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
+    0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
+    0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
+    0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
+    0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
+    0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
+    0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
+    0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
+    0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
+    0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
+    0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
+    0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
+    0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
+    0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
+    0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
+    0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
+    0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
+    0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
+    0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
+    0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
+    0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
+    0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
+    0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
+    0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
+    0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
+    0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
+    0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
+    0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
+    0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
+    0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
+    0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
+    0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
+    0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
+    0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
+    0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
+    0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
+    0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
+    0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
+    0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
+    0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
+    0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
+    0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
+    0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
+    0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
+    0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
+    0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
+    0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
+    0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
+};
+static const u32 Td1[256] = {
+    0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
+    0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
+    0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
+    0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
+    0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
+    0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
+    0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
+    0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
+    0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
+    0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
+    0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
+    0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
+    0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
+    0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
+    0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
+    0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
+    0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
+    0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
+    0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
+    0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
+    0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
+    0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
+    0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
+    0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
+    0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
+    0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
+    0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
+    0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
+    0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
+    0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
+    0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
+    0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
+    0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
+    0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
+    0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
+    0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
+    0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
+    0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
+    0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
+    0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
+    0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
+    0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
+    0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
+    0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
+    0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
+    0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
+    0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
+    0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
+    0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
+    0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
+    0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
+    0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
+    0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
+    0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
+    0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
+    0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
+    0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
+    0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
+    0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
+    0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
+    0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
+    0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
+    0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
+    0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
+};
+static const u32 Td2[256] = {
+    0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
+    0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
+    0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
+    0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
+    0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
+    0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
+    0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
+    0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
+    0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
+    0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
+    0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
+    0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
+    0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
+    0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
+    0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
+    0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
+    0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
+    0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
+    0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
+    0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
+    0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
+    0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
+    0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
+    0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
+    0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
+    0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
+    0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
+    0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
+    0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
+    0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
+    0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
+    0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
+    0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
+    0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
+    0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
+    0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
+    0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
+    0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
+    0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
+    0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
+    0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
+    0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
+    0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
+    0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
+    0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
+    0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
+    0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
+    0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
+    0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
+    0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
+    0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
+    0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
+    0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
+    0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
+    0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
+    0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
+    0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
+    0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
+    0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
+    0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
+    0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
+    0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
+    0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
+    0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
+};
+/*
+ * Td3: fourth decryption lookup table, indexed by one state byte in
+ * rijndaelDecrypt() and rijndaelKeySetupDec() below.  Entries are byte
+ * rotations of the companion Td0..Td2 tables (the combined inverse
+ * SubBytes/MixColumns step of the equivalent inverse cipher).  These
+ * are standard Rijndael reference constants -- do not edit by hand.
+ */
+static const u32 Td3[256] = {
+    0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
+    0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
+    0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
+    0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
+    0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
+    0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
+    0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
+    0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
+    0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
+    0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
+    0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
+    0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
+    0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
+    0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
+    0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
+    0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
+    0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
+    0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
+    0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
+    0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
+    0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
+    0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
+    0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
+    0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
+    0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
+    0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
+    0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
+    0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
+    0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
+    0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
+    0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
+    0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
+    0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
+    0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
+    0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
+    0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
+    0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
+    0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
+    0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
+    0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
+    0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
+    0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
+    0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
+    0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
+    0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
+    0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
+    0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
+    0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
+    0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
+    0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
+    0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
+    0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
+    0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
+    0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
+    0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
+    0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
+    0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
+    0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
+    0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
+    0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
+    0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
+    0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
+    0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
+    0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
+};
+/*
+ * Td4: final-round decryption table.  Each 32-bit entry repeats a
+ * single substitution byte in all four byte positions (the inverse
+ * S-box replicated); the last round of rijndaelDecrypt() masks out one
+ * byte lane per lookup, so no InvMixColumns is folded in here.
+ * Standard Rijndael reference constants -- do not edit by hand.
+ */
+static const u32 Td4[256] = {
+    0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U,
+    0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U,
+    0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU,
+    0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU,
+    0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U,
+    0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U,
+    0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U,
+    0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU,
+    0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U,
+    0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU,
+    0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU,
+    0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU,
+    0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U,
+    0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U,
+    0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U,
+    0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U,
+    0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U,
+    0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U,
+    0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU,
+    0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U,
+    0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U,
+    0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU,
+    0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U,
+    0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U,
+    0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U,
+    0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU,
+    0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U,
+    0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U,
+    0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU,
+    0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U,
+    0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U,
+    0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU,
+    0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U,
+    0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU,
+    0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU,
+    0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U,
+    0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U,
+    0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U,
+    0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U,
+    0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU,
+    0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U,
+    0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U,
+    0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU,
+    0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU,
+    0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU,
+    0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U,
+    0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU,
+    0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U,
+    0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U,
+    0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U,
+    0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U,
+    0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU,
+    0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U,
+    0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU,
+    0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU,
+    0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU,
+    0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU,
+    0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U,
+    0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU,
+    0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U,
+    0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU,
+    0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U,
+    0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U,
+    0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU,
+};
+/*
+ * Round-constant words for the key schedule; only the top byte of each
+ * entry is non-zero.  Consumed by rijndaelKeySetupEnc() via rcon[i].
+ */
+static const u32 rcon[] = {
+	0x01000000, 0x02000000, 0x04000000, 0x08000000,
+	0x10000000, 0x20000000, 0x40000000, 0x80000000,
+	0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+};
+
+/* Load (GETU32) / store (PUTU32) a 32-bit word from/to a byte array in
+ * big-endian order, independent of host endianness.  Arguments may be
+ * evaluated more than once -- pass side-effect-free expressions only. */
+#define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] <<  8) ^ ((u32)(pt)[3]))
+#define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >>  8); (ct)[3] = (u8)(st); }
+
+/**
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * rk receives 4*(Nr + 1) 32-bit round-key words; the first four words
+ * are the raw cipher key loaded big-endian.  The Te4 table (defined
+ * above) provides the byte substitution for the key-schedule core.
+ *
+ * @return	the number of rounds for the given cipher key size
+ *		(10 for 128-bit, 12 for 192-bit, 14 for 256-bit keys),
+ *		or 0 if keyBits is not one of 128/192/256.
+ */
+int
+rijndaelKeySetupEnc(u32 rk[/*4*(Nr + 1)*/], const u8 cipherKey[], int keyBits)
+{
+	int i = 0;
+	u32 temp;
+
+	rk[0] = GETU32(cipherKey     );
+	rk[1] = GETU32(cipherKey +  4);
+	rk[2] = GETU32(cipherKey +  8);
+	rk[3] = GETU32(cipherKey + 12);
+	if (keyBits == 128) {
+		for (;;) {
+			/* rotate/substitute the previous word, add round constant */
+			temp  = rk[3];
+			rk[4] = rk[0] ^
+				(Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+				(Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^
+				(Te4[(temp      ) & 0xff] & 0x0000ff00) ^
+				(Te4[(temp >> 24)       ] & 0x000000ff) ^
+				rcon[i];
+			rk[5] = rk[1] ^ rk[4];
+			rk[6] = rk[2] ^ rk[5];
+			rk[7] = rk[3] ^ rk[6];
+			if (++i == 10) {
+				return 10;
+			}
+			rk += 4;	/* advance window by one round key */
+		}
+	}
+	rk[4] = GETU32(cipherKey + 16);
+	rk[5] = GETU32(cipherKey + 20);
+	if (keyBits == 192) {
+		for (;;) {
+			temp = rk[ 5];
+			rk[ 6] = rk[ 0] ^
+				(Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+				(Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^
+				(Te4[(temp      ) & 0xff] & 0x0000ff00) ^
+				(Te4[(temp >> 24)       ] & 0x000000ff) ^
+				rcon[i];
+			rk[ 7] = rk[ 1] ^ rk[ 6];
+			rk[ 8] = rk[ 2] ^ rk[ 7];
+			rk[ 9] = rk[ 3] ^ rk[ 8];
+			/* 8 iterations of 6 words fill 4*(12+1) = 52 words */
+			if (++i == 8) {
+				return 12;
+			}
+			rk[10] = rk[ 4] ^ rk[ 9];
+			rk[11] = rk[ 5] ^ rk[10];
+			rk += 6;
+		}
+	}
+	rk[6] = GETU32(cipherKey + 24);
+	rk[7] = GETU32(cipherKey + 28);
+	if (keyBits == 256) {
+		for (;;) {
+			temp = rk[ 7];
+			rk[ 8] = rk[ 0] ^
+				(Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+				(Te4[(temp >>  8) & 0xff] & 0x00ff0000) ^
+				(Te4[(temp      ) & 0xff] & 0x0000ff00) ^
+				(Te4[(temp >> 24)       ] & 0x000000ff) ^
+				rcon[i];
+			rk[ 9] = rk[ 1] ^ rk[ 8];
+			rk[10] = rk[ 2] ^ rk[ 9];
+			rk[11] = rk[ 3] ^ rk[10];
+			if (++i == 7) {
+				return 14;
+			}
+			/* 256-bit schedule: second half-word uses S-box, no rotate */
+			temp = rk[11];
+			rk[12] = rk[ 4] ^
+				(Te4[(temp >> 24)       ] & 0xff000000) ^
+				(Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
+				(Te4[(temp >>  8) & 0xff] & 0x0000ff00) ^
+				(Te4[(temp      ) & 0xff] & 0x000000ff);
+			rk[13] = rk[ 5] ^ rk[12];
+			rk[14] = rk[ 6] ^ rk[13];
+			rk[15] = rk[ 7] ^ rk[14];
+			rk += 8;
+		}
+	}
+	/* unsupported key size */
+	return 0;
+}
+
+/**
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * Builds the encryption schedule first, then reverses the order of the
+ * round keys and applies the inverse MixColumn transform to all round
+ * keys except the first and the last (equivalent-inverse-cipher form,
+ * so rijndaelDecrypt() can use the Td tables for every round).
+ *
+ * @return	the number of rounds for the given cipher key size,
+ *		or 0 if keyBits is unsupported.
+ */
+int
+rijndaelKeySetupDec(u32 rk[/*4*(Nr + 1)*/], const u8 cipherKey[], int keyBits)
+{
+	int Nr, i, j;
+	u32 temp;
+
+	/* expand the cipher key: */
+	Nr = rijndaelKeySetupEnc(rk, cipherKey, keyBits);
+
+	/* invert the order of the round keys: */
+	for (i = 0, j = 4*Nr; i < j; i += 4, j -= 4) {
+		temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
+		temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
+		temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
+		temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+	}
+	/* apply the inverse MixColumn transform to all round keys but the first and the last: */
+	/* (Te4[x] & 0xff recovers the S-box byte; the Td tables then undo it
+	 * while applying InvMixColumns, leaving InvMixColumns alone.) */
+	for (i = 1; i < Nr; i++) {
+		rk += 4;
+		rk[0] =
+			Td0[Te4[(rk[0] >> 24)       ] & 0xff] ^
+			Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^
+			Td2[Te4[(rk[0] >>  8) & 0xff] & 0xff] ^
+			Td3[Te4[(rk[0]      ) & 0xff] & 0xff];
+		rk[1] =
+			Td0[Te4[(rk[1] >> 24)       ] & 0xff] ^
+			Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^
+			Td2[Te4[(rk[1] >>  8) & 0xff] & 0xff] ^
+			Td3[Te4[(rk[1]      ) & 0xff] & 0xff];
+		rk[2] =
+			Td0[Te4[(rk[2] >> 24)       ] & 0xff] ^
+			Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^
+			Td2[Te4[(rk[2] >>  8) & 0xff] & 0xff] ^
+			Td3[Te4[(rk[2]      ) & 0xff] & 0xff];
+		rk[3] =
+			Td0[Te4[(rk[3] >> 24)       ] & 0xff] ^
+			Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^
+			Td2[Te4[(rk[3] >>  8) & 0xff] & 0xff] ^
+			Td3[Te4[(rk[3]      ) & 0xff] & 0xff];
+	}
+	return Nr;
+}
+
+/*
+ * Encrypt one 16-byte block pt into ct, using the expanded schedule rk
+ * (4*(Nr + 1) words from rijndaelKeySetupEnc()) and round count Nr.
+ * Two compile-time variants: a fully unrolled path (FULL_UNROLL,
+ * branching on Nr for 12/14-round keys) and a generic loop processing
+ * two rounds per iteration.  pt and ct may alias.
+ */
+void
+rijndaelEncrypt(const u32 rk[/*4*(Nr + 1)*/], int Nr, const u8 pt[16],
+    u8 ct[16])
+{
+	u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+    int r;
+#endif /* ?FULL_UNROLL */
+
+    /*
+	 * map byte array block to cipher state
+	 * and add initial round key:
+	 */
+	s0 = GETU32(pt     ) ^ rk[0];
+	s1 = GETU32(pt +  4) ^ rk[1];
+	s2 = GETU32(pt +  8) ^ rk[2];
+	s3 = GETU32(pt + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+    /* round 1: */
+	t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
+	t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
+	t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
+	t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
+	/* round 2: */
+	s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
+	s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
+	s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
+	s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
+    /* round 3: */
+	t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
+	t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
+	t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
+	t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
+	/* round 4: */
+	s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
+	s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
+	s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
+	s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
+    /* round 5: */
+	t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
+	t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
+	t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
+	t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
+	/* round 6: */
+	s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
+	s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
+	s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
+	s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
+    /* round 7: */
+	t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
+	t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
+	t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
+	t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
+	/* round 8: */
+	s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
+	s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
+	s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
+	s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
+    /* round 9: */
+	t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
+	t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
+	t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
+	t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];
+    if (Nr > 10) {
+	/* round 10: */
+	s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];
+	s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];
+	s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];
+	s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];
+	/* round 11: */
+	t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];
+	t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];
+	t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];
+	t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];
+	if (Nr > 12) {
+	    /* round 12: */
+	    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];
+	    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];
+	    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];
+	    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];
+	    /* round 13: */
+	    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];
+	    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];
+	    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];
+	    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];
+	}
+    }
+    rk += Nr << 2;		/* point rk at the final round key */
+#else  /* !FULL_UNROLL */
+    /*
+	 * Nr - 1 full rounds:
+	 */
+    r = Nr >> 1;
+    for (;;) {
+	t0 =
+	    Te0[(s0 >> 24)       ] ^
+	    Te1[(s1 >> 16) & 0xff] ^
+	    Te2[(s2 >>  8) & 0xff] ^
+	    Te3[(s3      ) & 0xff] ^
+	    rk[4];
+	t1 =
+	    Te0[(s1 >> 24)       ] ^
+	    Te1[(s2 >> 16) & 0xff] ^
+	    Te2[(s3 >>  8) & 0xff] ^
+	    Te3[(s0      ) & 0xff] ^
+	    rk[5];
+	t2 =
+	    Te0[(s2 >> 24)       ] ^
+	    Te1[(s3 >> 16) & 0xff] ^
+	    Te2[(s0 >>  8) & 0xff] ^
+	    Te3[(s1      ) & 0xff] ^
+	    rk[6];
+	t3 =
+	    Te0[(s3 >> 24)       ] ^
+	    Te1[(s0 >> 16) & 0xff] ^
+	    Te2[(s1 >>  8) & 0xff] ^
+	    Te3[(s2      ) & 0xff] ^
+	    rk[7];
+
+	rk += 8;
+	if (--r == 0) {
+	    break;
+	}
+
+	s0 =
+	    Te0[(t0 >> 24)       ] ^
+	    Te1[(t1 >> 16) & 0xff] ^
+	    Te2[(t2 >>  8) & 0xff] ^
+	    Te3[(t3      ) & 0xff] ^
+	    rk[0];
+	s1 =
+	    Te0[(t1 >> 24)       ] ^
+	    Te1[(t2 >> 16) & 0xff] ^
+	    Te2[(t3 >>  8) & 0xff] ^
+	    Te3[(t0      ) & 0xff] ^
+	    rk[1];
+	s2 =
+	    Te0[(t2 >> 24)       ] ^
+	    Te1[(t3 >> 16) & 0xff] ^
+	    Te2[(t0 >>  8) & 0xff] ^
+	    Te3[(t1      ) & 0xff] ^
+	    rk[2];
+	s3 =
+	    Te0[(t3 >> 24)       ] ^
+	    Te1[(t0 >> 16) & 0xff] ^
+	    Te2[(t1 >>  8) & 0xff] ^
+	    Te3[(t2      ) & 0xff] ^
+	    rk[3];
+    }
+#endif /* ?FULL_UNROLL */
+    /*
+	 * apply last round and
+	 * map cipher state to byte array block:
+	 */
+	s0 =
+		(Te4[(t0 >> 24)       ] & 0xff000000) ^
+		(Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+		(Te4[(t2 >>  8) & 0xff] & 0x0000ff00) ^
+		(Te4[(t3      ) & 0xff] & 0x000000ff) ^
+		rk[0];
+	PUTU32(ct     , s0);
+	s1 =
+		(Te4[(t1 >> 24)       ] & 0xff000000) ^
+		(Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+		(Te4[(t3 >>  8) & 0xff] & 0x0000ff00) ^
+		(Te4[(t0      ) & 0xff] & 0x000000ff) ^
+		rk[1];
+	PUTU32(ct +  4, s1);
+	s2 =
+		(Te4[(t2 >> 24)       ] & 0xff000000) ^
+		(Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+		(Te4[(t0 >>  8) & 0xff] & 0x0000ff00) ^
+		(Te4[(t1      ) & 0xff] & 0x000000ff) ^
+		rk[2];
+	PUTU32(ct +  8, s2);
+	s3 =
+		(Te4[(t3 >> 24)       ] & 0xff000000) ^
+		(Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+		(Te4[(t1 >>  8) & 0xff] & 0x0000ff00) ^
+		(Te4[(t2      ) & 0xff] & 0x000000ff) ^
+		rk[3];
+	PUTU32(ct + 12, s3);
+}
+
+/*
+ * Decrypt one 16-byte block ct into pt, using the inverted schedule rk
+ * (4*(Nr + 1) words from rijndaelKeySetupDec()) and round count Nr.
+ * Mirrors rijndaelEncrypt() with the Td tables and the column rotation
+ * reversed; static because callers go through rijndael_decrypt() below.
+ */
+static void
+rijndaelDecrypt(const u32 rk[/*4*(Nr + 1)*/], int Nr, const u8 ct[16],
+    u8 pt[16])
+{
+	u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+    int r;
+#endif /* ?FULL_UNROLL */
+
+    /*
+	 * map byte array block to cipher state
+	 * and add initial round key:
+	 */
+    s0 = GETU32(ct     ) ^ rk[0];
+    s1 = GETU32(ct +  4) ^ rk[1];
+    s2 = GETU32(ct +  8) ^ rk[2];
+    s3 = GETU32(ct + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+    /* round 1: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];
+    /* round 2: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];
+    /* round 3: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];
+    /* round 4: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];
+    /* round 5: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];
+    /* round 6: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];
+    /* round 7: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];
+    /* round 8: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];
+    /* round 9: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];
+    if (Nr > 10) {
+	/* round 10: */
+	s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];
+	s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];
+	s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];
+	s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];
+	/* round 11: */
+	t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];
+	t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];
+	t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];
+	t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];
+	if (Nr > 12) {
+	    /* round 12: */
+	    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];
+	    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];
+	    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];
+	    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];
+	    /* round 13: */
+	    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];
+	    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];
+	    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];
+	    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];
+	}
+    }
+	rk += Nr << 2;		/* point rk at the final round key */
+#else  /* !FULL_UNROLL */
+    /*
+     * Nr - 1 full rounds:
+     */
+    r = Nr >> 1;
+    for (;;) {
+	t0 =
+	    Td0[(s0 >> 24)       ] ^
+	    Td1[(s3 >> 16) & 0xff] ^
+	    Td2[(s2 >>  8) & 0xff] ^
+	    Td3[(s1      ) & 0xff] ^
+	    rk[4];
+	t1 =
+	    Td0[(s1 >> 24)       ] ^
+	    Td1[(s0 >> 16) & 0xff] ^
+	    Td2[(s3 >>  8) & 0xff] ^
+	    Td3[(s2      ) & 0xff] ^
+	    rk[5];
+	t2 =
+	    Td0[(s2 >> 24)       ] ^
+	    Td1[(s1 >> 16) & 0xff] ^
+	    Td2[(s0 >>  8) & 0xff] ^
+	    Td3[(s3      ) & 0xff] ^
+	    rk[6];
+	t3 =
+	    Td0[(s3 >> 24)       ] ^
+	    Td1[(s2 >> 16) & 0xff] ^
+	    Td2[(s1 >>  8) & 0xff] ^
+	    Td3[(s0      ) & 0xff] ^
+	    rk[7];
+
+	rk += 8;
+	if (--r == 0) {
+	    break;
+	}
+
+	s0 =
+	    Td0[(t0 >> 24)       ] ^
+	    Td1[(t3 >> 16) & 0xff] ^
+	    Td2[(t2 >>  8) & 0xff] ^
+	    Td3[(t1      ) & 0xff] ^
+	    rk[0];
+	s1 =
+	    Td0[(t1 >> 24)       ] ^
+	    Td1[(t0 >> 16) & 0xff] ^
+	    Td2[(t3 >>  8) & 0xff] ^
+	    Td3[(t2      ) & 0xff] ^
+	    rk[1];
+	s2 =
+	    Td0[(t2 >> 24)       ] ^
+	    Td1[(t1 >> 16) & 0xff] ^
+	    Td2[(t0 >>  8) & 0xff] ^
+	    Td3[(t3      ) & 0xff] ^
+	    rk[2];
+	s3 =
+	    Td0[(t3 >> 24)       ] ^
+	    Td1[(t2 >> 16) & 0xff] ^
+	    Td2[(t1 >>  8) & 0xff] ^
+	    Td3[(t0      ) & 0xff] ^
+	    rk[3];
+    }
+#endif /* ?FULL_UNROLL */
+    /*
+	 * apply last round and
+	 * map cipher state to byte array block:
+	 */
+	s0 =
+		(Td4[(t0 >> 24)       ] & 0xff000000) ^
+		(Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+		(Td4[(t2 >>  8) & 0xff] & 0x0000ff00) ^
+		(Td4[(t1      ) & 0xff] & 0x000000ff) ^
+		rk[0];
+	PUTU32(pt     , s0);
+	s1 =
+		(Td4[(t1 >> 24)       ] & 0xff000000) ^
+		(Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+		(Td4[(t3 >>  8) & 0xff] & 0x0000ff00) ^
+		(Td4[(t2      ) & 0xff] & 0x000000ff) ^
+		rk[1];
+	PUTU32(pt +  4, s1);
+	s2 =
+		(Td4[(t2 >> 24)       ] & 0xff000000) ^
+		(Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+		(Td4[(t0 >>  8) & 0xff] & 0x0000ff00) ^
+		(Td4[(t3      ) & 0xff] & 0x000000ff) ^
+		rk[2];
+	PUTU32(pt +  8, s2);
+	s3 =
+		(Td4[(t3 >> 24)       ] & 0xff000000) ^
+		(Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+		(Td4[(t1 >>  8) & 0xff] & 0x0000ff00) ^
+		(Td4[(t0      ) & 0xff] & 0x000000ff) ^
+		rk[3];
+	PUTU32(pt + 12, s3);
+}
+
+/* setup key context for encryption only: fills ctx->ek but not ctx->dk,
+ * so only rijndael_encrypt() may be used afterwards; returns 0 on
+ * success, -1 if the key size is unsupported */
+int
+rijndael_set_key_enc_only(rijndael_ctx *ctx, const u_char *key, int bits)
+{
+	int nrounds;
+
+	if ((nrounds = rijndaelKeySetupEnc(ctx->ek, key, bits)) == 0)
+		return -1;
+
+	ctx->enc_only = 1;
+	ctx->Nr = nrounds;
+	return 0;
+}
+
+/* setup key context for both encryption and decryption: fills ctx->ek
+ * and ctx->dk; returns 0 on success, -1 if the key size is unsupported
+ * or the two expansions disagree on the round count */
+int
+rijndael_set_key(rijndael_ctx *ctx, const u_char *key, int bits)
+{
+	int nrounds;
+
+	nrounds = rijndaelKeySetupEnc(ctx->ek, key, bits);
+	if (nrounds == 0 ||
+	    rijndaelKeySetupDec(ctx->dk, key, bits) != nrounds)
+		return -1;
+
+	ctx->enc_only = 0;
+	ctx->Nr = nrounds;
+	return 0;
+}
+
+/* Decrypt one 16-byte block src into dst using ctx->dk; ctx must have
+ * been initialised with rijndael_set_key() (the enc-only variant does
+ * not fill dk). */
+void
+rijndael_decrypt(rijndael_ctx *ctx, const u_char *src, u_char *dst)
+{
+	rijndaelDecrypt(ctx->dk, ctx->Nr, src, dst);
+}
+
+/* Encrypt one 16-byte block src into dst using ctx->ek; works with a
+ * context from either rijndael_set_key() or
+ * rijndael_set_key_enc_only(). */
+void
+rijndael_encrypt(rijndael_ctx *ctx, const u_char *src, u_char *dst)
+{
+	rijndaelEncrypt(ctx->ek, ctx->Nr, src, dst);
+}
diff --git a/xen/crypto/vmac.c b/xen/crypto/vmac.c
new file mode 100644 (file)
index 0000000..e5558ba
--- /dev/null
@@ -0,0 +1,1220 @@
+/* --------------------------------------------------------------------------
+ * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai.
+ * This implementation is hereby placed in the public domain.
+ * The authors offer no warranty. Use at your own risk.
+ * Please send bug reports to the authors.
+ * Last modified: 17 APR 08, 1700 PDT
+ * ----------------------------------------------------------------------- */
+
+/* start for Xen */
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/lib.h>
+#include <crypto/vmac.h>
+#define UINT64_C(x)  x##ULL
+/* end for Xen */
+
+/* Enable code tuned for 64-bit registers; otherwise tuned for 32-bit */
+#ifndef VMAC_ARCH_64
+#define VMAC_ARCH_64 (__x86_64__ || __ppc64__ || _M_X64)
+#endif
+
+/* Enable code tuned for Intel SSE2 instruction set                   */
+#if ((__SSE2__ || (_M_IX86_FP >= 2)) && ( ! VMAC_ARCH_64))
+#define VMAC_USE_SSE2    1
+#include <emmintrin.h>
+#endif
+
+/* Native word reads. Update (or define via compiler) if incorrect */
+#ifndef VMAC_ARCH_BIG_ENDIAN       /* Assume big-endian unless on the list */
+#define VMAC_ARCH_BIG_ENDIAN \
+    (!(__x86_64__ || __i386__ || _M_IX86 || \
+       _M_X64 || __ARMEL__ || __MIPSEL__))
+#endif
+
+/* ----------------------------------------------------------------------- */
+/* Constants and masks                                                     */
+
+const uint64_t p64   = UINT64_C(0xfffffffffffffeff);  /* 2^64 - 257 prime  */
+const uint64_t m62   = UINT64_C(0x3fffffffffffffff);  /* 62-bit mask       */
+const uint64_t m63   = UINT64_C(0x7fffffffffffffff);  /* 63-bit mask       */
+const uint64_t m64   = UINT64_C(0xffffffffffffffff);  /* 64-bit mask       */
+const uint64_t mpoly = UINT64_C(0x1fffffff1fffffff);  /* Poly key mask     */
+
+/* ----------------------------------------------------------------------- *
+ * The following routines are used in this implementation. They are
+ * written via macros to simulate zero-overhead call-by-reference.
+ * All have default implementations for when they are not defined in an
+ * architecture-specific manner.
+ *
+ * MUL64: 64x64->128-bit multiplication
+ * PMUL64: assumes top bits cleared on inputs
+ * ADD128: 128x128->128-bit addition
+ * GET_REVERSED_64: load and byte-reverse 64-bit word  
+ * ----------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+#if (__GNUC__ && (__x86_64__ || __amd64__))
+/* ----------------------------------------------------------------------- */
+
+#define ADD128(rh,rl,ih,il)                                               \
+    asm ("addq %3, %1 \n\t"                                               \
+         "adcq %2, %0"                                                    \
+    : "+r"(rh),"+r"(rl)                                                   \
+    : "r"(ih),"r"(il) : "cc");
+
+#define MUL64(rh,rl,i1,i2)                                                \
+    asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "r"(i2) : "cc")
+
+#define PMUL64 MUL64
+
+#define GET_REVERSED_64(p)                                                \
+    ({uint64_t x;                                                         \
+     asm ("bswapq %0" : "=r" (x) : "0"(*(uint64_t *)(p))); x;})
+
+/* ----------------------------------------------------------------------- */
+#elif (__GNUC__ && __i386__)
+/* ----------------------------------------------------------------------- */
+
+#define GET_REVERSED_64(p)                                                \
+    ({ uint64_t x;                                                        \
+    uint32_t *tp = (uint32_t *)(p);                                       \
+    asm  ("bswap %%edx\n\t"                                               \
+          "bswap %%eax"                                                   \
+    : "=A"(x)                                                             \
+    : "a"(tp[1]), "d"(tp[0]));                                            \
+    x; })
+
+/* ----------------------------------------------------------------------- */
+#elif (__GNUC__ && __ppc64__)
+/* ----------------------------------------------------------------------- */
+
+#define ADD128(rh,rl,ih,il)                                               \
+    asm volatile (  "addc %1, %1, %3 \n\t"                                \
+                    "adde %0, %0, %2"                                     \
+    : "+r"(rh),"+r"(rl)                                                   \
+    : "r"(ih),"r"(il));
+
+#define MUL64(rh,rl,i1,i2)                                                \
+{ uint64_t _i1 = (i1), _i2 = (i2);                                        \
+    rl = _i1 * _i2;                                                       \
+    asm volatile ("mulhdu %0, %1, %2" : "=r" (rh) : "r" (_i1), "r" (_i2));\
+}
+
+#define PMUL64 MUL64
+
+#define GET_REVERSED_64(p)                                                \
+    ({ uint32_t hi, lo, *_p = (uint32_t *)(p);                            \
+       asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) );  \
+       asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) );  \
+       ((uint64_t)hi << 32) | (uint64_t)lo; } )
+
+/* ----------------------------------------------------------------------- */
+#elif (__GNUC__ && (__ppc__ || __PPC__))
+/* ----------------------------------------------------------------------- */
+
+#define GET_REVERSED_64(p)                                                \
+    ({ uint32_t hi, lo, *_p = (uint32_t *)(p);                            \
+       asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) );  \
+       asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) );  \
+       ((uint64_t)hi << 32) | (uint64_t)lo; } )
+
+/* ----------------------------------------------------------------------- */
+#elif (__GNUC__ && (__ARMEL__ || __ARM__))
+/* ----------------------------------------------------------------------- */
+
+#define bswap32(v)                                                        \
+({ uint32_t tmp,out;                                                      \
+    asm volatile(                                                         \
+        "eor    %1, %2, %2, ror #16\n"                                    \
+        "bic    %1, %1, #0x00ff0000\n"                                    \
+        "mov    %0, %2, ror #8\n"                                         \
+        "eor    %0, %0, %1, lsr #8"                                       \
+    : "=r" (out), "=&r" (tmp)                                             \
+    : "r" (v));                                                           \
+    out;})
+
+/* ----------------------------------------------------------------------- */
+#elif _MSC_VER
+/* ----------------------------------------------------------------------- */
+
+#include <intrin.h>
+
+#if (_M_IA64 || _M_X64) && \
+    (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
+#define MUL64(rh,rl,i1,i2)   (rl) = _umul128(i1,i2,&(rh));
+#pragma intrinsic(_umul128)
+#define PMUL64 MUL64
+#endif
+
+/* MSVC uses add, adc in this version */
+#define ADD128(rh,rl,ih,il)                                          \
+    {   uint64_t _il = (il);                                         \
+        (rl) += (_il);                                               \
+        (rh) += (ih) + ((rl) < (_il));                               \
+    }
+
+#if _MSC_VER >= 1300
+#define GET_REVERSED_64(p) _byteswap_uint64(*(uint64_t *)(p))
+#pragma intrinsic(_byteswap_uint64)
+#endif
+
+#if _MSC_VER >= 1400 && \
+    (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
+#define MUL32(i1,i2)    (__emulu((uint32_t)(i1),(uint32_t)(i2)))
+#pragma intrinsic(__emulu)
+#endif
+
+/* ----------------------------------------------------------------------- */
+#endif
+/* ----------------------------------------------------------------------- */
+
+#if __GNUC__
+#define ALIGN(n)      __attribute__ ((aligned(n))) 
+#define NOINLINE      __attribute__ ((noinline))
+#define FASTCALL
+#elif _MSC_VER
+#define ALIGN(n)      __declspec(align(n))
+#define NOINLINE      __declspec(noinline)
+#define FASTCALL      __fastcall
+#else
+#define ALIGN(n)
+#define NOINLINE
+#define FASTCALL
+#endif
+
+/* ----------------------------------------------------------------------- */
+/* Default implementations, if not defined above                           */
+/* ----------------------------------------------------------------------- */
+
+#ifndef ADD128
+#define ADD128(rh,rl,ih,il)                                              \
+    {   uint64_t _il = (il);                                             \
+        (rl) += (_il);                                                   \
+        if ((rl) < (_il)) (rh)++;                                        \
+        (rh) += (ih);                                                    \
+    }
+#endif
+
+#ifndef MUL32
+#define MUL32(i1,i2)    ((uint64_t)(uint32_t)(i1)*(uint32_t)(i2))
+#endif
+
+#ifndef PMUL64              /* rh may not be same as i1 or i2 */
+#define PMUL64(rh,rl,i1,i2) /* Assumes m doesn't overflow     */         \
+    {   uint64_t _i1 = (i1), _i2 = (i2);                                 \
+        uint64_t m = MUL32(_i1,_i2>>32) + MUL32(_i1>>32,_i2);            \
+        rh         = MUL32(_i1>>32,_i2>>32);                             \
+        rl         = MUL32(_i1,_i2);                                     \
+        ADD128(rh,rl,(m >> 32),(m << 32));                               \
+    }
+#endif
+
+#ifndef MUL64
+#define MUL64(rh,rl,i1,i2)                                               \
+    {   uint64_t _i1 = (i1), _i2 = (i2);                                 \
+        uint64_t m1= MUL32(_i1,_i2>>32);                                 \
+        uint64_t m2= MUL32(_i1>>32,_i2);                                 \
+        rh         = MUL32(_i1>>32,_i2>>32);                             \
+        rl         = MUL32(_i1,_i2);                                     \
+        ADD128(rh,rl,(m1 >> 32),(m1 << 32));                             \
+        ADD128(rh,rl,(m2 >> 32),(m2 << 32));                             \
+    }
+#endif
+
+#ifndef GET_REVERSED_64
+#ifndef bswap64
+#ifndef bswap32
+#define bswap32(x)                                                        \
+  ({ uint32_t bsx = (x);                                                  \
+      ((((bsx) & 0xff000000u) >> 24) | (((bsx) & 0x00ff0000u) >>  8) |    \
+       (((bsx) & 0x0000ff00u) <<  8) | (((bsx) & 0x000000ffu) << 24)); })
+#endif
+#define bswap64(x)                                                        \
+     ({ union { uint64_t ll; uint32_t l[2]; } w, r;                       \
+         w.ll = (x);                                                      \
+         r.l[0] = bswap32 (w.l[1]);                                       \
+         r.l[1] = bswap32 (w.l[0]);                                       \
+         r.ll; })
+#endif
+#define GET_REVERSED_64(p) bswap64(*(uint64_t *)(p)) 
+#endif
+
+/* ----------------------------------------------------------------------- */
+
+#if (VMAC_PREFER_BIG_ENDIAN)
+#  define get64PE get64BE
+#else
+#  define get64PE get64LE
+#endif
+
+#if (VMAC_ARCH_BIG_ENDIAN)
+#  define get64BE(ptr) (*(uint64_t *)(ptr))
+#  define get64LE(ptr) GET_REVERSED_64(ptr)
+#else /* assume little-endian */
+#  define get64BE(ptr) GET_REVERSED_64(ptr)
+#  define get64LE(ptr) (*(uint64_t *)(ptr))
+#endif
+
+
+/* --------------------------------------------------------------------- *
+ * For highest performance the L1 NH and L2 polynomial hashes should be
+ * carefully implemented to take advantage of one's target architecture.
+ * Here these two hash functions are defined multiple times; once for
+ * 64-bit architectures, once for 32-bit SSE2 architectures, and once
+ * for the rest (32-bit) architectures.
+ * For each, nh_16 *must* be defined (works on multiples of 16 bytes).
+ * Optionally, nh_vmac_nhbytes can be defined (for multiples of
+ * VMAC_NHBYTES), and nh_16_2 and nh_vmac_nhbytes_2 (versions that do two
+ * NH computations at once).
+ * --------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+#if VMAC_ARCH_64
+/* ----------------------------------------------------------------------- */
+
+#define nh_16(mp, kp, nw, rh, rl)                                            \
+{   int i; uint64_t th, tl;                                                  \
+    rh = rl = 0;                                                             \
+    for (i = 0; i < nw; i+= 2) {                                             \
+        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
+        ADD128(rh,rl,th,tl);                                                 \
+    }                                                                        \
+}
+#define nh_16_2(mp, kp, nw, rh, rl, rh1, rl1)                                \
+{   int i; uint64_t th, tl;                                                  \
+    rh1 = rl1 = rh = rl = 0;                                                 \
+    for (i = 0; i < nw; i+= 2) {                                             \
+        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
+        ADD128(rh,rl,th,tl);                                                 \
+        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
+        ADD128(rh1,rl1,th,tl);                                               \
+    }                                                                        \
+}
+
+#if (VMAC_NHBYTES >= 64) /* These versions do 64-bytes of message at a time */
+#define nh_vmac_nhbytes(mp, kp, nw, rh, rl)                                  \
+{   int i; uint64_t th, tl;                                                  \
+    rh = rl = 0;                                                             \
+    for (i = 0; i < nw; i+= 8) {                                             \
+        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
+        ADD128(rh,rl,th,tl);                                                 \
+        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
+        ADD128(rh,rl,th,tl);                                                 \
+        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
+        ADD128(rh,rl,th,tl);                                                 \
+        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
+        ADD128(rh,rl,th,tl);                                                 \
+    }                                                                        \
+}
+#define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh1, rl1)                      \
+{   int i; uint64_t th, tl;                                                  \
+    rh1 = rl1 = rh = rl = 0;                                                 \
+    for (i = 0; i < nw; i+= 8) {                                             \
+        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
+        ADD128(rh,rl,th,tl);                                                 \
+        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
+        ADD128(rh1,rl1,th,tl);                                               \
+        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
+        ADD128(rh,rl,th,tl);                                                 \
+        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+4],get64PE((mp)+i+3)+(kp)[i+5]);\
+        ADD128(rh1,rl1,th,tl);                                               \
+        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
+        ADD128(rh,rl,th,tl);                                                 \
+        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+6],get64PE((mp)+i+5)+(kp)[i+7]);\
+        ADD128(rh1,rl1,th,tl);                                               \
+        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
+        ADD128(rh,rl,th,tl);                                                 \
+        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+8],get64PE((mp)+i+7)+(kp)[i+9]);\
+        ADD128(rh1,rl1,th,tl);                                               \
+    }                                                                        \
+}
+#endif
+
+#define poly_step(ah, al, kh, kl, mh, ml)                   \
+{   uint64_t t1h, t1l, t2h, t2l, t3h, t3l, z=0;             \
+    /* compute ab*cd, put bd into result registers */       \
+    PMUL64(t3h,t3l,al,kh);                                  \
+    PMUL64(t2h,t2l,ah,kl);                                  \
+    PMUL64(t1h,t1l,ah,2*kh);                                \
+    PMUL64(ah,al,al,kl);                                    \
+    /* add 2 * ac to result */                              \
+    ADD128(ah,al,t1h,t1l);                                  \
+    /* add together ad + bc */                              \
+    ADD128(t2h,t2l,t3h,t3l);                                \
+    /* now (ah,al), (t2l,2*t2h) need summing */             \
+    /* first add the high registers, carrying into t2h */   \
+    ADD128(t2h,ah,z,t2l);                                   \
+    /* double t2h and add top bit of ah */                  \
+    t2h = 2 * t2h + (ah >> 63);                             \
+    ah &= m63;                                              \
+    /* now add the low registers */                         \
+    ADD128(ah,al,mh,ml);                                    \
+    ADD128(ah,al,z,t2h);                                    \
+}
+
+/* ----------------------------------------------------------------------- */
+#elif VMAC_USE_SSE2
+/* ----------------------------------------------------------------------- */
+
+// macros from Crypto++ for sharing inline assembly code between MSVC and GNU C
+#if defined(__GNUC__)
+       // define these in two steps to allow arguments to be expanded
+       #define GNU_AS2(x, y) #x ", " #y ";"
+       #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";"
+       #define GNU_ASL(x) "\n" #x ":"
+       #define GNU_ASJ(x, y, z) #x " " #y #z ";"
+       #define AS2(x, y) GNU_AS2(x, y)
+       #define AS3(x, y, z) GNU_AS3(x, y, z)
+       #define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";"
+       #define ASL(x) GNU_ASL(x)
+       #define ASJ(x, y, z) GNU_ASJ(x, y, z)
+#else
+       #define AS2(x, y) __asm {x, y}
+       #define AS3(x, y, z) __asm {x, y, z}
+       #define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)}
+       #define ASL(x) __asm {label##x:}
+       #define ASJ(x, y, z) __asm {x label##y}
+#endif
+
+/*
+ * NH first-layer hash, 32-bit x86 MMX implementation: hashes nw 64-bit
+ * message words at mp against key words at kp, accumulating the 128-bit
+ * result into (*rh, *rl).  The loop consumes two words per pass, so nw
+ * is assumed to be a positive even count (the asm does "sub ecx, 2").
+ */
+static void NOINLINE nh_16_func(const uint64_t *mp, const uint64_t *kp, size_t nw, uint64_t *rh, uint64_t *rl)
+{
+       // This assembly version, using MMX registers, is just as fast as the
+       // intrinsics version (which uses XMM registers) on the Intel Core 2,
+       // but is much faster on the Pentium 4. In order to schedule multiplies
+       // as early as possible, the loop interleaves operations for the current
+       // block and the next block. To mask out high 32-bits, we use "movd"
+       // to move the lower 32-bits to the stack and then back. Surprisingly,
+       // this is faster than any other method.
+#ifdef __GNUC__
+       __asm__ __volatile__
+       (
+               ".intel_syntax noprefix;"
+#else
+               AS2(    mov             esi, mp)
+               AS2(    mov             edi, kp)
+               AS2(    mov             ecx, nw)
+               AS2(    mov             eax, rl)
+               AS2(    mov             edx, rh)
+#endif
+               AS2(    sub             esp, 12)
+               AS2(    movq    mm6, [esi])
+               AS2(    paddq   mm6, [edi])
+               AS2(    movq    mm5, [esi+8])
+               AS2(    paddq   mm5, [edi+8])
+               AS2(    add             esi, 16)
+               AS2(    add             edi, 16)
+               AS2(    movq    mm4, mm6)
+               ASS(    pshufw  mm2, mm6, 1, 0, 3, 2)
+               AS2(    pmuludq mm6, mm5)
+               ASS(    pshufw  mm3, mm5, 1, 0, 3, 2)
+               AS2(    pmuludq mm5, mm2)
+               AS2(    pmuludq mm2, mm3)
+               AS2(    pmuludq mm3, mm4)
+               AS2(    pxor    mm7, mm7)
+               AS2(    movd    [esp], mm6)
+               AS2(    psrlq   mm6, 32)
+               AS2(    movd    [esp+4], mm5)
+               AS2(    psrlq   mm5, 32)
+               AS2(    sub             ecx, 2)
+               ASJ(    jz,             1, f)
+               ASL(0)
+               AS2(    movq    mm0, [esi])
+               AS2(    paddq   mm0, [edi])
+               AS2(    movq    mm1, [esi+8])
+               AS2(    paddq   mm1, [edi+8])
+               AS2(    add             esi, 16)
+               AS2(    add             edi, 16)
+               AS2(    movq    mm4, mm0)
+               AS2(    paddq   mm5, mm2)
+               ASS(    pshufw  mm2, mm0, 1, 0, 3, 2)
+               AS2(    pmuludq mm0, mm1)
+               AS2(    movd    [esp+8], mm3)
+               AS2(    psrlq   mm3, 32)
+               AS2(    paddq   mm5, mm3)
+               ASS(    pshufw  mm3, mm1, 1, 0, 3, 2)
+               AS2(    pmuludq mm1, mm2)
+               AS2(    pmuludq mm2, mm3)
+               AS2(    pmuludq mm3, mm4)
+               AS2(    movd    mm4, [esp])
+               AS2(    paddq   mm7, mm4)
+               AS2(    movd    mm4, [esp+4])
+               AS2(    paddq   mm6, mm4)
+               AS2(    movd    mm4, [esp+8])
+               AS2(    paddq   mm6, mm4)
+               AS2(    movd    [esp], mm0)
+               AS2(    psrlq   mm0, 32)
+               AS2(    paddq   mm6, mm0)
+               AS2(    movd    [esp+4], mm1)
+               AS2(    psrlq   mm1, 32)
+               AS2(    paddq   mm5, mm1)
+               AS2(    sub             ecx, 2)
+               ASJ(    jnz,    0, b)
+               ASL(1)
+               AS2(    paddq   mm5, mm2)
+               AS2(    movd    [esp+8], mm3)
+               AS2(    psrlq   mm3, 32)
+               AS2(    paddq   mm5, mm3)
+               AS2(    movd    mm4, [esp])
+               AS2(    paddq   mm7, mm4)
+               AS2(    movd    mm4, [esp+4])
+               AS2(    paddq   mm6, mm4)
+               AS2(    movd    mm4, [esp+8])
+               AS2(    paddq   mm6, mm4)
+
+               ASS(    pshufw  mm0, mm7, 3, 2, 1, 0)
+               AS2(    psrlq   mm7, 32)
+               AS2(    paddq   mm6, mm7)
+               AS2(    punpckldq       mm0, mm6)
+               AS2(    psrlq   mm6, 32)
+               AS2(    paddq   mm5, mm6)
+               AS2(    movq    [eax], mm0)
+               AS2(    movq    [edx], mm5)
+               AS2(    add             esp, 12)
+#ifdef __GNUC__
+               ".att_syntax prefix;"
+               :
+               : "S" (mp), "D" (kp), "c" (nw), "a" (rl), "d" (rh)
+               : "memory", "cc"
+       );
+#endif
+}
+/* adapter so callers can use the by-reference macro signature */
+#define nh_16(mp, kp, nw, rh, rl)   nh_16_func(mp, kp, nw, &(rh), &(rl));
+
+/*
+ * One step of the second-layer polynomial hash, 32-bit SSE2/MMX path:
+ * updates the 128-bit accumulator (ahi:alo) with key (kh:kl) and
+ * message words (mh:ml).
+ * NOTE(review): presumably computes a = a*k + m reduced modulo the
+ * same prime as the 64-bit poly_step above -- confirm against the
+ * VMAC/VHASH specification.
+ */
+static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
+               const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)
+{
+       // This code tries to schedule the multiplies as early as possible to overcome
+       // the long latencies on the Pentium 4. It also minimizes "movq" instructions
+       // which are very expensive on the P4.
+
+/* register/memory aliases for the 32-bit halves of a and k */
+#define a0 [eax+0]
+#define a1 [eax+4]
+#define a2 [ebx+0]
+#define a3 [ebx+4]
+#define k0 [ecx+0]
+#define k1 [ecx+4]
+#define k2 [edx+0]
+#define k3 [edx+4]
+
+#ifdef __GNUC__
+       /* ebx is saved/restored by hand: it may be the PIC register */
+       uint32_t temp;
+       __asm__ __volatile__
+       (
+               "mov %%ebx, %0;"
+               "mov %1, %%ebx;"
+               ".intel_syntax noprefix;"
+#else
+               AS2(    mov             ebx, ahi)
+               AS2(    mov             edx, kh)
+               AS2(    mov             eax, alo)
+               AS2(    mov             ecx, kl)
+               AS2(    mov             esi, mh)
+               AS2(    mov             edi, ml)
+#endif
+
+               AS2(    movd    mm0, a3)
+               AS2(    movq    mm4, mm0)
+               AS2(    pmuludq mm0, k3)                // a3*k3
+               AS2(    movd    mm1, a0)
+               AS2(    pmuludq mm1, k2)                // a0*k2
+               AS2(    movd    mm2, a1)
+               AS2(    movd    mm6, k1)
+               AS2(    pmuludq mm2, mm6)               // a1*k1
+               AS2(    movd    mm3, a2)
+               AS2(    movq    mm5, mm3)
+               AS2(    movd    mm7, k0)
+               AS2(    pmuludq mm3, mm7)               // a2*k0
+               AS2(    pmuludq mm4, mm7)               // a3*k0
+               AS2(    pmuludq mm5, mm6)               // a2*k1
+               AS2(    psllq   mm0, 1)
+               AS2(    paddq   mm0, [esi])
+               AS2(    paddq   mm0, mm1)
+               AS2(    movd    mm1, a1)
+               AS2(    paddq   mm4, mm5)
+               AS2(    movq    mm5, mm1)
+               AS2(    pmuludq mm1, k2)                // a1*k2
+               AS2(    paddq   mm0, mm2)
+               AS2(    movd    mm2, a0)
+               AS2(    paddq   mm0, mm3)
+               AS2(    movq    mm3, mm2)
+               AS2(    pmuludq mm2, k3)                // a0*k3
+               AS2(    pmuludq mm3, mm7)               // a0*k0
+               AS2(    movd    esi, mm0)
+               AS2(    psrlq   mm0, 32)
+               AS2(    pmuludq mm7, mm5)               // a1*k0
+               AS2(    pmuludq mm5, k3)                // a1*k3
+               AS2(    paddq   mm0, mm1)
+               AS2(    movd    mm1, a2)
+               AS2(    pmuludq mm1, k2)                // a2*k2
+               AS2(    paddq   mm0, mm2)
+               AS2(    paddq   mm0, mm4)
+               AS2(    movq    mm4, mm0)
+               AS2(    movd    mm2, a3)
+               AS2(    pmuludq mm2, mm6)               // a3*k1
+               AS2(    pmuludq mm6, a0)                // a0*k1
+               AS2(    psrlq   mm0, 31)
+               AS2(    paddq   mm0, mm3)
+               AS2(    movd    mm3, [edi])
+               AS2(    paddq   mm0, mm3)
+               AS2(    movd    mm3, a2)
+               AS2(    pmuludq mm3, k3)                // a2*k3
+               AS2(    paddq   mm5, mm1)
+               AS2(    movd    mm1, a3)
+               AS2(    pmuludq mm1, k2)                // a3*k2
+               AS2(    paddq   mm5, mm2)
+               AS2(    movd    mm2, [edi+4])
+               AS2(    psllq   mm5, 1)
+               AS2(    paddq   mm0, mm5)
+               AS2(    movq    mm5, mm0)
+               AS2(    psllq   mm4, 33)
+               AS2(    psrlq   mm0, 32)
+               AS2(    paddq   mm6, mm7)
+               AS2(    movd    mm7, esi)
+               AS2(    paddq   mm0, mm6)
+               AS2(    paddq   mm0, mm2)
+               AS2(    paddq   mm3, mm1)
+               AS2(    psllq   mm3, 1)
+               AS2(    paddq   mm0, mm3)
+               AS2(    psrlq   mm4, 1)
+               AS2(    punpckldq       mm5, mm0)
+               AS2(    psrlq   mm0, 32)
+               AS2(    por             mm4, mm7)
+               AS2(    paddq   mm0, mm4)
+               AS2(    movq    a0, mm5)
+               AS2(    movq    a2, mm0)
+#ifdef __GNUC__
+               ".att_syntax prefix;"
+               "mov %0, %%ebx;"
+               : "=m" (temp)
+               : "m" (ahi), "D" (ml), "d" (kh), "a" (alo), "S" (mh), "c" (kl)
+               : "memory", "cc"
+       );
+#endif
+
+
+#undef a0
+#undef a1
+#undef a2
+#undef a3
+#undef k0
+#undef k1
+#undef k2
+#undef k3
+}
+
+#define poly_step(ah, al, kh, kl, mh, ml)   \
+        poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))
+
+/* ----------------------------------------------------------------------- */
+#else /* not VMAC_ARCH_64 and not SSE2 */
+/* ----------------------------------------------------------------------- */
+
+#ifndef nh_16
+#define nh_16(mp, kp, nw, rh, rl)                                       \
+{   uint64_t t1,t2,m1,m2,t;                                             \
+    int i;                                                              \
+    rh = rl = t = 0;                                                    \
+    for (i = 0; i < nw; i+=2)  {                                        \
+        t1  = get64PE(mp+i) + kp[i];                                    \
+        t2  = get64PE(mp+i+1) + kp[i+1];                                \
+        m2  = MUL32(t1 >> 32, t2);                                      \
+        m1  = MUL32(t1, t2 >> 32);                                      \
+        ADD128(rh,rl,MUL32(t1 >> 32,t2 >> 32),MUL32(t1,t2));            \
+        rh += (uint64_t)(uint32_t)(m1 >> 32) + (uint32_t)(m2 >> 32);    \
+        t  += (uint64_t)(uint32_t)m1 + (uint32_t)m2;                    \
+    }                                                                   \
+    ADD128(rh,rl,(t >> 32),(t << 32));                                  \
+}
+#endif
+
+/*
+ * One step of the second-layer polynomial hash, portable 32-bit path:
+ * updates the 128-bit accumulator (ahi:alo) with key (kh:kl) and
+ * message (mh:ml) using 32x32->64 multiplies (MUL32).
+ * NOTE(review): presumably computes a = a*k + m under the same modular
+ * reduction as the other poly_step variants -- confirm against the
+ * VMAC/VHASH specification.
+ */
+static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
+               const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)
+{
+
+/* select which 32-bit half of a uint64_t is high/low in memory */
+#if VMAC_ARCH_BIG_ENDIAN
+#define INDEX_HIGH 0
+#define INDEX_LOW 1
+#else
+#define INDEX_HIGH 1
+#define INDEX_LOW 0
+#endif
+
+/* 32-bit halves of the accumulator (a) and key (k) */
+#define a0 *(((uint32_t*)alo)+INDEX_LOW)
+#define a1 *(((uint32_t*)alo)+INDEX_HIGH)
+#define a2 *(((uint32_t*)ahi)+INDEX_LOW)
+#define a3 *(((uint32_t*)ahi)+INDEX_HIGH)
+#define k0 *(((uint32_t*)kl)+INDEX_LOW)
+#define k1 *(((uint32_t*)kl)+INDEX_HIGH)
+#define k2 *(((uint32_t*)kh)+INDEX_LOW)
+#define k3 *(((uint32_t*)kh)+INDEX_HIGH)
+
+    uint64_t p, q, t;
+    uint32_t t2;
+
+    /* accumulate the partial products column by column, carrying the
+     * high half of p forward into the next column */
+    p = MUL32(a3, k3);
+    p += p;
+       p += *(uint64_t *)mh;
+    p += MUL32(a0, k2);
+    p += MUL32(a1, k1);
+    p += MUL32(a2, k0);
+    t = (uint32_t)(p);
+    p >>= 32;
+    p += MUL32(a0, k3);
+    p += MUL32(a1, k2);
+    p += MUL32(a2, k1);
+    p += MUL32(a3, k0);
+    t |= ((uint64_t)((uint32_t)p & 0x7fffffff)) << 32;
+    p >>= 31;
+    p += (uint64_t)(((uint32_t*)ml)[INDEX_LOW]);
+    p += MUL32(a0, k0);
+    q =  MUL32(a1, k3);
+    q += MUL32(a2, k2);
+    q += MUL32(a3, k1);
+    q += q;
+    p += q;
+    t2 = (uint32_t)(p);
+    p >>= 32;
+    p += (uint64_t)(((uint32_t*)ml)[INDEX_HIGH]);
+    p += MUL32(a0, k1);
+    p += MUL32(a1, k0);
+    q =  MUL32(a2, k3);
+    q += MUL32(a3, k2);
+    q += q;
+    p += q;
+    /* write the updated 128-bit accumulator back through the aliases */
+    *(uint64_t *)(alo) = (p << 32) | t2;
+    p >>= 32;
+    *(uint64_t *)(ahi) = p + t;
+
+#undef a0
+#undef a1
+#undef a2
+#undef a3
+#undef k0
+#undef k1
+#undef k2
+#undef k3
+}
+
+#define poly_step(ah, al, kh, kl, mh, ml)   \
+        poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))
+
+/* ----------------------------------------------------------------------- */
+#endif  /* end of specialized NH and poly definitions */
+/* ----------------------------------------------------------------------- */
+
+/* At least nh_16 is defined. Define the others as needed here.           */
+#ifndef nh_16_2
+#define nh_16_2(mp, kp, nw, rh, rl, rh2, rl2)                           \
+    nh_16(mp, kp, nw, rh, rl);                                          \
+    nh_16(mp, ((kp)+2), nw, rh2, rl2);
+#endif
+#ifndef nh_vmac_nhbytes
+#define nh_vmac_nhbytes(mp, kp, nw, rh, rl)                             \
+    nh_16(mp, kp, nw, rh, rl)
+#endif
+#ifndef nh_vmac_nhbytes_2
+#define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh2, rl2)                 \
+    nh_vmac_nhbytes(mp, kp, nw, rh, rl);                                \
+    nh_vmac_nhbytes(mp, ((kp)+2), nw, rh2, rl2);
+#endif
+
+/* ----------------------------------------------------------------------- */
+
+/*
+ * Discard any partially accumulated incremental-hash state: reload the
+ * polynomial accumulator from the per-key initial values and mark the
+ * context as having processed no blocks yet.
+ */
+void vhash_abort(vmac_ctx_t *ctx)
+{
+    ctx->polytmp[0] = ctx->polykey[0] ;
+    ctx->polytmp[1] = ctx->polykey[1] ;
+    #if (VMAC_TAG_LEN == 128)
+    /* 128-bit tags maintain a second, independent polynomial lane. */
+    ctx->polytmp[2] = ctx->polykey[2] ;
+    ctx->polytmp[3] = ctx->polykey[3] ;
+    #endif
+    ctx->first_block_processed = 0;
+}
+
+/* ----------------------------------------------------------------------- */
+/*
+ * Third-level (output) hash: fold the 128-bit polynomial accumulator
+ * (p1,p2), mixed with the message length in bits (len), through the
+ * inner-product key pair (k1,k2) into a single 64-bit value reduced
+ * mod p64.
+ * NOTE(review): the masks/constants m63, m64, p64 and the ADD128/MUL64
+ * macros are defined earlier in this file and not visible in this hunk.
+ */
+static uint64_t l3hash(uint64_t p1, uint64_t p2,
+                       uint64_t k1, uint64_t k2, uint64_t len)
+{
+    uint64_t rh, rl, t, z=0;
+
+    /* fully reduce (p1,p2)+(len,0) mod p127 */
+    t = p1 >> 63;
+    p1 &= m63;
+    ADD128(p1, p2, len, t);
+    /* At this point, (p1,p2) is at most 2^127+(len<<64) */
+    t = (p1 > m63) + ((p1 == m63) && (p2 == m64));
+    ADD128(p1, p2, z, t);
+    p1 &= m63;
+
+    /* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */
+    t = p1 + (p2 >> 32);
+    t += (t >> 32);
+    /* carry correction for the 2^32 folding step */
+    t += (uint32_t)t > 0xfffffffeu;
+    p1 += (t >> 32);
+    p2 += (p1 << 32);
+
+    /* compute (p1+k1)%p64 and (p2+k2)%p64 */
+    p1 += k1;
+    p1 += (0 - (p1 < k1)) & 257;
+    p2 += k2;
+    p2 += (0 - (p2 < k2)) & 257;
+
+    /* compute (p1+k1)*(p2+k2)%p64 */
+    MUL64(rh, rl, p1, p2);
+    t = rh >> 56;
+    ADD128(t, rl, z, rh);
+    rh <<= 8;
+    ADD128(t, rl, z, rh);
+    t += t << 8;
+    rl += t;
+    /* two conditional corrections keep the result below p64 = 2^64-257 */
+    rl += (0 - (rl < t)) & 257;
+    rl += (0 - (rl > p64-1)) & 257;
+    return rl;
+}
+
+/* ----------------------------------------------------------------------- */
+
+/*
+ * Incremental hashing step: absorb mbytes of message (which must be a
+ * positive multiple of VMAC_NHBYTES) into ctx->polytmp.  The first block
+ * of a message is only NH-hashed and added; every subsequent block goes
+ * through a polynomial-evaluation step (poly_step) as well.
+ */
+void vhash_update(unsigned char *m,
+                  unsigned int   mbytes, /* Pos multiple of VMAC_NHBYTES */
+                  vmac_ctx_t    *ctx)
+{
+    uint64_t rh, rl, *mptr;
+    const uint64_t *kptr = (uint64_t *)ctx->nhkey;
+    int i;
+    uint64_t ch, cl;
+    uint64_t pkh = ctx->polykey[0];
+    uint64_t pkl = ctx->polykey[1];
+    #if (VMAC_TAG_LEN == 128)
+    /* second hash lane for 128-bit tags */
+    uint64_t ch2, cl2, rh2, rl2;
+    uint64_t pkh2 = ctx->polykey[2];
+    uint64_t pkl2 = ctx->polykey[3];
+    #endif
+
+    /* NOTE(review): m is cast to uint64_t* — callers presumably pass
+     * suitably aligned buffers; confirm alignment guarantees. */
+    mptr = (uint64_t *)m;
+    i = mbytes / VMAC_NHBYTES;  /* Must be non-zero */
+
+    ch = ctx->polytmp[0];
+    cl = ctx->polytmp[1];
+    #if (VMAC_TAG_LEN == 128)
+    ch2 = ctx->polytmp[2];
+    cl2 = ctx->polytmp[3];
+    #endif
+    
+    /* First block of the whole message: add NH output directly, no
+     * polynomial step yet. */
+    if ( ! ctx->first_block_processed) {
+        ctx->first_block_processed = 1;
+        #if (VMAC_TAG_LEN == 64)
+        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
+        #else
+        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
+        rh2 &= m62;
+        ADD128(ch2,cl2,rh2,rl2);
+        #endif
+        rh &= m62;
+        ADD128(ch,cl,rh,rl);
+        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
+        i--;
+    }
+
+    /* Remaining full blocks: NH-hash then polynomial-accumulate. */
+    while (i--) {
+        #if (VMAC_TAG_LEN == 64)
+        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
+        #else
+        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
+        rh2 &= m62;
+        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
+        #endif
+        rh &= m62;
+        poly_step(ch,cl,pkh,pkl,rh,rl);
+        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
+    }
+
+    /* Persist accumulator for the next incremental call. */
+    ctx->polytmp[0] = ch;
+    ctx->polytmp[1] = cl;
+    #if (VMAC_TAG_LEN == 128)
+    ctx->polytmp[2] = ch2;
+    ctx->polytmp[3] = cl2;
+    #endif
+    #if VMAC_USE_SSE2
+    _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
+    #endif
+}
+
+/* ----------------------------------------------------------------------- */
+
+/*
+ * Finalize an (optionally incremental) hash: feed any whole blocks through
+ * vhash_update, absorb the trailing partial block with nh_16, then run the
+ * l3 output hash.  For 128-bit tags the low half is stored via *tagl and
+ * the high half returned.  Resets the context (vhash_abort) before return.
+ */
+uint64_t xvhash(unsigned char m[],
+          unsigned int mbytes,
+          uint64_t *tagl,
+          vmac_ctx_t *ctx)
+{
+    uint64_t ch, cl, rh, rl, *mptr;
+    #if (VMAC_TAG_LEN == 128)
+    uint64_t ch2, cl2, rh2, rl2;
+    #endif
+    const uint64_t *kptr = (uint64_t *)ctx->nhkey;
+    int i, remaining;
+
+    /* i = byte count of the whole-block prefix; remaining = tail bytes */
+    remaining = mbytes % VMAC_NHBYTES;
+    i = mbytes-remaining;
+    mptr = (uint64_t *)(m+i);
+    if (i) vhash_update(m,i,ctx);
+
+    ch = ctx->polytmp[0];
+    cl = ctx->polytmp[1];
+    #if (VMAC_TAG_LEN == 128)
+    ch2 = ctx->polytmp[2];
+    cl2 = ctx->polytmp[3];
+    #endif
+
+    if (remaining) {
+        /* Partial tail block: hash only the 16-byte units that cover it. */
+        #if (VMAC_TAG_LEN == 128)
+        nh_16_2(mptr,kptr,2*((remaining+15)/16),rh,rl,rh2,rl2);
+        rh2 &= m62;
+        #else
+        nh_16(mptr,kptr,2*((remaining+15)/16),rh,rl);
+        #endif
+        rh &= m62;
+        if (i) {
+            /* Prior blocks exist: fold tail in with a polynomial step. */
+            poly_step(ch,cl,ctx->polykey[0],ctx->polykey[1],rh,rl);
+            #if (VMAC_TAG_LEN == 128)
+            poly_step(ch2,cl2,ctx->polykey[2],ctx->polykey[3],rh2,rl2);
+            #endif
+        } else {
+            /* Tail is the entire message: just add it to the key state. */
+            ADD128(ch,cl,rh,rl);
+            #if (VMAC_TAG_LEN == 128)
+            ADD128(ch2,cl2,rh2,rl2);
+            #endif
+        }
+    }
+
+    #if VMAC_USE_SSE2
+    _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
+    #endif
+    vhash_abort(ctx);
+    /* l3hash mixes in the message length in bits */
+    remaining *= 8;
+#if (VMAC_TAG_LEN == 128)
+    *tagl = l3hash(ch2, cl2, ctx->l3key[2], ctx->l3key[3],remaining);
+#endif
+    return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1],remaining);
+}
+
+/*
+ * All-in-one VHASH over a complete message m of mbytes.  Unlike xvhash,
+ * this path avoids the vhash_update call and handles the first block,
+ * middle blocks and partial tail inline.  Honors state left by a prior
+ * incremental vhash_update (first_block_processed).  Resets the context
+ * via vhash_abort before returning the 64-bit l3 result (low half of a
+ * 128-bit tag goes to *tagl).
+ */
+uint64_t vhash(unsigned char m[],
+          unsigned int mbytes,
+          uint64_t *tagl,
+          vmac_ctx_t *ctx)
+{
+    uint64_t rh, rl, *mptr;
+    const uint64_t *kptr = (uint64_t *)ctx->nhkey;
+    int i, remaining;
+    uint64_t ch, cl;
+    uint64_t pkh = ctx->polykey[0];
+    uint64_t pkl = ctx->polykey[1];
+    #if (VMAC_TAG_LEN == 128)
+        uint64_t ch2, cl2, rh2, rl2;
+        uint64_t pkh2 = ctx->polykey[2];
+        uint64_t pkl2 = ctx->polykey[3];
+    #endif
+
+    mptr = (uint64_t *)m;
+    i = mbytes / VMAC_NHBYTES;
+    remaining = mbytes % VMAC_NHBYTES;
+
+    if (ctx->first_block_processed)
+    {
+        /* Continuation of an incremental hash: resume saved accumulator. */
+        ch = ctx->polytmp[0];
+        cl = ctx->polytmp[1];
+        #if (VMAC_TAG_LEN == 128)
+        ch2 = ctx->polytmp[2];
+        cl2 = ctx->polytmp[3];
+        #endif
+    }
+    else if (i)
+    {
+        /* Fresh message with at least one full block: NH the first block
+         * and add the poly key (no poly_step for block one). */
+        #if (VMAC_TAG_LEN == 64)
+        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,ch,cl);
+        #else
+        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,ch,cl,ch2,cl2);
+        ch2 &= m62;
+        ADD128(ch2,cl2,pkh2,pkl2);
+        #endif
+        ch &= m62;
+        ADD128(ch,cl,pkh,pkl);
+        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
+        i--;
+    }
+    else if (remaining)
+    {
+        /* Message shorter than one block: hash the covering 16-byte units
+         * and go straight to the output stage. */
+        #if (VMAC_TAG_LEN == 64)
+        nh_16(mptr,kptr,2*((remaining+15)/16),ch,cl);
+        #else
+        nh_16_2(mptr,kptr,2*((remaining+15)/16),ch,cl,ch2,cl2);
+        ch2 &= m62;
+        ADD128(ch2,cl2,pkh2,pkl2);
+        #endif
+        ch &= m62;
+        ADD128(ch,cl,pkh,pkl);
+        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
+        goto do_l3;
+    }
+    else /* Empty String */
+    {
+        ch = pkh; cl = pkl;
+        #if (VMAC_TAG_LEN == 128)
+        ch2 = pkh2; cl2 = pkl2;
+        #endif
+        goto do_l3;
+    }
+
+    /* Remaining full blocks: NH then polynomial accumulate. */
+    while (i--) {
+        #if (VMAC_TAG_LEN == 64)
+        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
+        #else
+        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
+        rh2 &= m62;
+        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
+        #endif
+        rh &= m62;
+        poly_step(ch,cl,pkh,pkl,rh,rl);
+        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
+    }
+    /* Partial tail after full blocks, if any. */
+    if (remaining) {
+        #if (VMAC_TAG_LEN == 64)
+        nh_16(mptr,kptr,2*((remaining+15)/16),rh,rl);
+        #else
+        nh_16_2(mptr,kptr,2*((remaining+15)/16),rh,rl,rh2,rl2);
+        rh2 &= m62;
+        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
+        #endif
+        rh &= m62;
+        poly_step(ch,cl,pkh,pkl,rh,rl);
+    }
+
+do_l3:
+    #if VMAC_USE_SSE2
+    _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
+    #endif
+    vhash_abort(ctx);
+    /* l3hash mixes in the tail length in bits */
+    remaining *= 8;
+#if (VMAC_TAG_LEN == 128)
+    *tagl = l3hash(ch2, cl2, ctx->l3key[2], ctx->l3key[3],remaining);
+#endif
+    return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1],remaining);
+}
+
+/* ----------------------------------------------------------------------- */
+
+/*
+ * Compute the VMAC tag for message m under nonce n: tag = VHASH(m) + pad,
+ * where pad is derived by AES-encrypting the nonce.  For 64-bit tags the
+ * low bit of the nonce selects which half of the AES block is used as the
+ * pad; with VMAC_CACHE_NONCES the AES output is cached so consecutive
+ * calls with nonces differing only in that bit reuse one encryption.
+ */
+uint64_t vmac(unsigned char m[],
+         unsigned int mbytes,
+         unsigned char n[16],
+         uint64_t *tagl,
+         vmac_ctx_t *ctx)
+{
+#if (VMAC_TAG_LEN == 64)
+    uint64_t *in_n, *out_p;
+    uint64_t p, h;
+    int i;
+    
+    #if VMAC_CACHE_NONCES
+    in_n = ctx->cached_nonce;
+    out_p = ctx->cached_aes;
+    #else
+    uint64_t tmp[2];
+    in_n = out_p = tmp;
+    #endif
+
+    /* i selects which 64-bit half of the AES output becomes the pad */
+    i = n[15] & 1;
+    #if VMAC_CACHE_NONCES
+    /* Re-encrypt only when the (bit-cleared) nonce differs from cache. */
+    if ((*(uint64_t *)(n+8) != in_n[1]) ||
+        (*(uint64_t *)(n  ) != in_n[0])) {
+    #endif
+    
+        in_n[0] = *(uint64_t *)(n  );
+        in_n[1] = *(uint64_t *)(n+8);
+        /* clear the selector bit before encryption so both halves share
+         * one AES invocation */
+        ((unsigned char *)in_n)[15] &= 0xFE;
+        aes_encryption(in_n, out_p, &ctx->cipher_key);
+
+    #if VMAC_CACHE_NONCES
+        /* NOTE(review): stores 1-i so the cache mismatches when the same
+         * selector bit is reused — presumably to force a fresh pad on
+         * nonce reuse; confirm against the VMAC spec. */
+        ((unsigned char *)in_n)[15] |= (unsigned char)(1-i);
+    }
+    #endif
+    p = get64BE(out_p + i);
+    h = vhash(m, mbytes, (uint64_t *)0, ctx);
+    return p + h;
+#else
+    /* 128-bit tag: whole AES block is the pad; halves are added to the
+     * two VHASH lanes. */
+    uint64_t tmp[2];
+    uint64_t th,tl;
+    aes_encryption(n, (unsigned char *)tmp, &ctx->cipher_key);
+    th = vhash(m, mbytes, &tl, ctx);
+    th += get64BE(tmp);
+    *tagl = tl + get64BE(tmp+1);
+    return th;
+#endif
+}
+
+/* ----------------------------------------------------------------------- */
+
+/*
+ * Derive all VMAC subkeys from the user AES key by encrypting a counter
+ * with distinct domain-separation bytes (0x80 NH key, 0xC0 poly key,
+ * 0xE0 l3 key), then reset the context state.  l3 key words are rejected
+ * and regenerated until both fall below p64.
+ */
+void vmac_set_key(unsigned char user_key[], vmac_ctx_t *ctx)
+{
+    uint64_t in[2] = {0}, out[2];
+    unsigned i;
+    aes_key_setup(user_key, &ctx->cipher_key);
+    
+    /* Fill nh key */
+    ((unsigned char *)in)[0] = 0x80; 
+    for (i = 0; i < sizeof(ctx->nhkey)/8; i+=2) {
+        aes_encryption((unsigned char *)in, (unsigned char *)out,
+                                                         &ctx->cipher_key);
+        ctx->nhkey[i  ] = get64BE(out);
+        ctx->nhkey[i+1] = get64BE(out+1);
+        /* bump counter in the last byte for the next AES block */
+        ((unsigned char *)in)[15] += 1;
+    }
+
+    /* Fill poly key */
+    ((unsigned char *)in)[0] = 0xC0; 
+    in[1] = 0;
+    for (i = 0; i < sizeof(ctx->polykey)/8; i+=2) {
+        aes_encryption((unsigned char *)in, (unsigned char *)out,
+                                                         &ctx->cipher_key);
+        /* mpoly mask keeps poly key words in the valid range */
+        ctx->polytmp[i  ] = ctx->polykey[i  ] = get64BE(out) & mpoly;
+        ctx->polytmp[i+1] = ctx->polykey[i+1] = get64BE(out+1) & mpoly;
+        ((unsigned char *)in)[15] += 1;
+    }
+
+    /* Fill ip key */
+    ((unsigned char *)in)[0] = 0xE0;
+    in[1] = 0;
+    for (i = 0; i < sizeof(ctx->l3key)/8; i+=2) {
+        do {
+            aes_encryption((unsigned char *)in, (unsigned char *)out,
+                                                         &ctx->cipher_key);
+            ctx->l3key[i  ] = get64BE(out);
+            ctx->l3key[i+1] = get64BE(out+1);
+            ((unsigned char *)in)[15] += 1;
+        } while (ctx->l3key[i] >= p64 || ctx->l3key[i+1] >= p64);
+    }
+    
+    /* Invalidate nonce/aes cache and reset other elements */
+    #if (VMAC_TAG_LEN == 64) && (VMAC_CACHE_NONCES)
+    ctx->cached_nonce[0] = (uint64_t)-1; /* Ensure illegal nonce */
+    ctx->cached_nonce[1] = (uint64_t)0;  /* Ensure illegal nonce */
+    #endif
+    ctx->first_block_processed = 0;
+}
+
+/* ----------------------------------------------------------------------- */
+
+
+#if VMAC_RUN_TESTS
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <string.h>
+
+/*
+ * Warm up a variable-speed CPU and return a rough speed estimate in
+ * millions of iterations per half-second window, used to size the speed
+ * test iteration counts below.
+ */
+unsigned prime(void)  /* Wake variable speed cpu, get rough speed estimate */
+{
+    volatile uint64_t i;
+    volatile uint64_t j=1;
+    unsigned cnt=0;
+    volatile clock_t ticks = clock();
+    do {
+        /* busy work the optimizer cannot elide (volatile j) */
+        for (i = 0; i < 500000; i++) {
+            uint64_t x = get64PE(&j);
+            j = x * x + (uint64_t)ticks;
+        }
+        cnt++;
+    } while (clock() - ticks < (CLOCKS_PER_SEC/2));
+    return cnt;  /* cnt is millions of iterations per second */
+}
+
+/*
+ * Self-test driver (compiled only under VMAC_RUN_TESTS): prints known-
+ * answer test vectors for 'abc'-repeated messages and then benchmarks
+ * vmac/vhash over a range of message lengths, reporting cycles per byte.
+ */
+int main(void)
+{
+    ALIGN(16) vmac_ctx_t ctx, ctx_aio, ctx_inc1, ctx_inc2;
+    uint64_t res, tagl;
+    void *p;
+    unsigned char *m;
+    ALIGN(4) unsigned char key[] = "abcdefghijklmnop";
+    ALIGN(4) unsigned char nonce[] = "\0\0\0\0\0\0\0\0bcdefghi";
+    unsigned int  vector_lengths[] = {0,3,48,300,3000000};
+    #if (VMAC_TAG_LEN == 64)
+    ALIGN(4) char *should_be[] = {"2576BE1C56D8B81B","2D376CF5B1813CE5",
+                        "E8421F61D573D298","4492DF6C5CAC1BBE",
+                        "09BA597DD7601113"};
+    #else
+    ALIGN(4) char *should_be[] = {"472766C70F74ED23481D6D7DE4E80DAC",
+                         "4EE815A06A1D71EDD36FC75D51188A42",
+                         "09F2C80C8E1007A0C12FAE19FE4504AE",
+                         "66438817154850C61D8A412164803BCB",
+                         "2B6B02288FFC461B75485DE893C629DC"};
+    #endif
+    unsigned speed_lengths[] = {16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+    unsigned i, j, *speed_iters;
+    clock_t ticks;
+    double cpb;
+    const unsigned int buf_len = 3 * (1 << 20);
+    
+    /* Scale iteration counts to measured CPU speed; shorter messages get
+     * geometrically more iterations. */
+    j = prime();
+    i = sizeof(speed_lengths)/sizeof(speed_lengths[0]);
+    speed_iters = (unsigned *)malloc(i*sizeof(speed_iters[0]));
+    speed_iters[i-1] = j * (1 << 12);
+    while (--i) speed_iters[i-1] = (unsigned)(1.3 * speed_iters[i]);
+    
+    /* Initialize context and message buffer, all 16-byte aligned */
+    /* NOTE(review): malloc result is not checked before use. */
+    p = malloc(buf_len + 32);
+    m = (unsigned char *)(((size_t)p + 16) & ~((size_t)15));
+    memset(m, 0, buf_len + 16);
+    vmac_set_key(key, &ctx);
+    
+    /* Test incremental and all-in-one interfaces for correctness */
+    vmac_set_key(key, &ctx_aio);
+    vmac_set_key(key, &ctx_inc1);
+    vmac_set_key(key, &ctx_inc2);
+    
+    
+    /*
+    for (i = 0; i <= 512; i++) {
+        vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
+        tagh = vmac(m+(i/VMAC_NHBYTES)*VMAC_NHBYTES, i%VMAC_NHBYTES,
+                                                      nonce, &tagl, &ctx);
+        vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
+        for (j = 0; j < vector_lengths[i]; j++)
+            m[j] = (unsigned char)('a'+j%3);
+        
+    }
+    */
+    
+    /* Generate vectors */
+    for (i = 0; i < sizeof(vector_lengths)/sizeof(unsigned int); i++) {
+        for (j = 0; j < vector_lengths[i]; j++)
+            m[j] = (unsigned char)('a'+j%3);
+        res = vmac(m, vector_lengths[i], nonce, &tagl, &ctx);
+        #if (VMAC_TAG_LEN == 64)
+        printf("\'abc\' * %7u: %016llX Should be: %s\n",
+              vector_lengths[i]/3,res,should_be[i]);
+        #else
+        printf("\'abc\' * %7u: %016llX%016llX\nShould be      : %s\n",
+              vector_lengths[i]/3,res,tagl,should_be[i]);
+        #endif
+    }
+
+    /* Speed test */
+    /* NOTE(review): loop bound uses sizeof(unsigned int) as the element
+     * size of speed_lengths (element type unsigned) — works only because
+     * the types coincide; ARRAY_SIZE-style idiom would be safer. */
+    for (i = 0; i < sizeof(speed_lengths)/sizeof(unsigned int); i++) {
+        ticks = clock();
+        for (j = 0; j < speed_iters[i]; j++) {
+            #if HASH_ONLY
+            res = vhash(m, speed_lengths[i], &tagl, &ctx);
+            #else
+            res = vmac(m, speed_lengths[i], nonce, &tagl, &ctx);
+            nonce[7]++;
+            #endif
+        }
+        ticks = clock() - ticks;
+        cpb = ((ticks*VMAC_HZ)/
+              ((double)CLOCKS_PER_SEC*speed_lengths[i]*speed_iters[i]));
+        printf("%4u bytes, %2.2f cpb\n", speed_lengths[i], cpb);
+    }
+    return 1;
+}
+
+#endif
index 14c74611ba70ca422a79645e919be1f1e6a5481e..eb4fb61554e74fb94dca71df6a4d499a2e526005 100644 (file)
@@ -1,6 +1,6 @@
 subdir-y += char
 subdir-y += cpufreq
 subdir-y += pci
-subdir-$(x86) += passthrough
+subdir-y += passthrough
 subdir-$(HAS_ACPI) += acpi
 subdir-$(HAS_VGA) += video
index b3ba34c378bc84cf64fdbe653074cfed442a3911..1ac35c8237082dc691d96e964d43efa6282e0bc0 100644 (file)
@@ -47,12 +47,17 @@ extern uint32_t pmstat_get_cx_nr(uint32_t cpuid);
 extern int pmstat_get_cx_stat(uint32_t cpuid, struct pm_cx_stat *stat);
 extern int pmstat_reset_cx_stat(uint32_t cpuid);
 
+extern struct list_head cpufreq_governor_list;
+
+/*
+ * Get PM statistic info
+ */
 int do_get_pm_info(struct xen_sysctl_get_pmstat *op)
 {
     int ret = 0;
     const struct processor_pminfo *pmpt;
 
-    if ( (op->cpuid >= NR_CPUS) || !cpu_online(op->cpuid) )
+    if ( !op || (op->cpuid >= NR_CPUS) || !cpu_online(op->cpuid) )
         return -EINVAL;
     pmpt = processor_pminfo[op->cpuid];
 
@@ -82,33 +87,34 @@ int do_get_pm_info(struct xen_sysctl_get_pmstat *op)
 
     case PMSTAT_get_pxstat:
     {
-        uint64_t now, ct;
-        uint64_t total_idle_ns;
-        uint64_t tmp_idle_ns;
+        uint32_t ct;
         struct pm_px *pxpt = cpufreq_statistic_data[op->cpuid];
+        spinlock_t *cpufreq_statistic_lock = 
+                   &per_cpu(cpufreq_statistic_lock, op->cpuid);
 
-        if ( !pxpt )
-            return -ENODATA;
+        spin_lock(cpufreq_statistic_lock);
 
-        total_idle_ns = get_cpu_idle_time(op->cpuid);
-        tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
+        if ( !pxpt || !pxpt->u.pt || !pxpt->u.trans_pt )
+        {
+            spin_unlock(cpufreq_statistic_lock);
+            return -ENODATA;
+        }
 
-        now = NOW();
         pxpt->u.usable = pmpt->perf.state_count - pmpt->perf.platform_limit;
-        pxpt->u.pt[pxpt->u.cur].residency += now - pxpt->prev_state_wall;
-        pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns;
-        pxpt->prev_state_wall = now;
-        pxpt->prev_idle_wall = total_idle_ns;
+
+        cpufreq_residency_update(op->cpuid, pxpt->u.cur);
 
         ct = pmpt->perf.state_count;
         if ( copy_to_guest(op->u.getpx.trans_pt, pxpt->u.trans_pt, ct*ct) )
         {
+            spin_unlock(cpufreq_statistic_lock);
             ret = -EFAULT;
             break;
         }
 
         if ( copy_to_guest(op->u.getpx.pt, pxpt->u.pt, ct) )
         {
+            spin_unlock(cpufreq_statistic_lock);
             ret = -EFAULT;
             break;
         }
@@ -118,6 +124,8 @@ int do_get_pm_info(struct xen_sysctl_get_pmstat *op)
         op->u.getpx.last = pxpt->u.last;
         op->u.getpx.cur = pxpt->u.cur;
 
+        spin_unlock(cpufreq_statistic_lock);
+
         break;
     }
 
@@ -156,3 +164,374 @@ int do_get_pm_info(struct xen_sysctl_get_pmstat *op)
 
     return ret;
 }
+
+/*
+ * 1. Get PM parameter
+ * 2. Provide user PM control
+ */
+/*
+ * Render the registered cpufreq governor names into the caller-supplied
+ * buffer as a space-separated, NUL-terminated list.  Returns 0 on
+ * success or -EINVAL on NULL buffer / overflow.
+ * NOTE(review): the i > size check runs after scnprintf has already
+ * written into the buffer, so a too-small buffer can be overrun by up to
+ * CPUFREQ_NAME_LEN bytes before detection — verify caller sizing.
+ */
+static int read_scaling_available_governors(char *scaling_available_governors,
+                                            unsigned int size)
+{
+    unsigned int i = 0;
+    struct cpufreq_governor *t;
+
+    if ( !scaling_available_governors )
+        return -EINVAL;
+
+    list_for_each_entry(t, &cpufreq_governor_list, governor_list)
+    {
+        i += scnprintf(&scaling_available_governors[i],
+                       CPUFREQ_NAME_LEN, "%s ", t->name);
+        if ( i > size )
+            return -EINVAL;
+    }
+    /* replace the trailing space with the terminator */
+    scaling_available_governors[i-1] = '\0';
+
+    return 0;
+}
+
+/*
+ * Fill op->get_para with the cpufreq parameters of op->cpuid: affected
+ * CPUs, available frequencies, available governors, current/min/max
+ * frequencies, driver and governor names, plus governor-specific fields.
+ * Returns -EAGAIN (after writing the correct counts back) when the
+ * caller's array sizes do not match, so the tool can retry.
+ */
+static int get_cpufreq_para(struct xen_sysctl_pm_op *op)
+{
+    uint32_t ret = 0;
+    const struct processor_pminfo *pmpt;
+    struct cpufreq_policy *policy;
+    uint32_t gov_num = 0;
+    uint32_t *affected_cpus;
+    uint32_t *scaling_available_frequencies;
+    char     *scaling_available_governors;
+    struct list_head *pos;
+    uint32_t cpu, i, j = 0;
+
+    if ( !op || !cpu_online(op->cpuid) )
+        return -EINVAL;
+    pmpt = processor_pminfo[op->cpuid];
+    policy = cpufreq_cpu_policy[op->cpuid];
+
+    if ( !pmpt || !pmpt->perf.states ||
+         !policy || !policy->governor )
+        return -EINVAL;
+
+    /* count registered governors */
+    list_for_each(pos, &cpufreq_governor_list)
+        gov_num++;
+
+    /* Caller-supplied array sizes must match; report the real counts and
+     * ask the caller to retry otherwise. */
+    if ( (op->get_para.cpu_num  != cpus_weight(policy->cpus)) ||
+         (op->get_para.freq_num != pmpt->perf.state_count)    ||
+         (op->get_para.gov_num  != gov_num) )
+    {
+        op->get_para.cpu_num =  cpus_weight(policy->cpus);
+        op->get_para.freq_num = pmpt->perf.state_count;
+        op->get_para.gov_num  = gov_num;
+        return -EAGAIN;
+    }
+
+    /* copy out the CPUs governed by this policy */
+    if ( !(affected_cpus = xmalloc_array(uint32_t, op->get_para.cpu_num)) )
+        return -ENOMEM;
+    memset(affected_cpus, 0, op->get_para.cpu_num * sizeof(uint32_t));
+    for_each_cpu_mask(cpu, policy->cpus)
+        affected_cpus[j++] = cpu;
+    ret = copy_to_guest(op->get_para.affected_cpus,
+                       affected_cpus, op->get_para.cpu_num);
+    xfree(affected_cpus);
+    if ( ret )
+        return ret;
+
+    /* copy out available P-state frequencies in kHz */
+    if ( !(scaling_available_frequencies =
+        xmalloc_array(uint32_t, op->get_para.freq_num)) )
+        return -ENOMEM;
+    memset(scaling_available_frequencies, 0,
+           op->get_para.freq_num * sizeof(uint32_t));
+    for ( i = 0; i < op->get_para.freq_num; i++ )
+        scaling_available_frequencies[i] =
+                        pmpt->perf.states[i].core_frequency * 1000;
+    ret = copy_to_guest(op->get_para.scaling_available_frequencies,
+                   scaling_available_frequencies, op->get_para.freq_num);
+    xfree(scaling_available_frequencies);
+    if ( ret )
+        return ret;
+
+    /* copy out governor name list */
+    if ( !(scaling_available_governors =
+        xmalloc_array(char, gov_num * CPUFREQ_NAME_LEN)) )
+        return -ENOMEM;
+    memset(scaling_available_governors, 0,
+                gov_num * CPUFREQ_NAME_LEN * sizeof(char));
+    if ( (ret = read_scaling_available_governors(scaling_available_governors,
+                gov_num * CPUFREQ_NAME_LEN * sizeof(char))) )
+    {
+        xfree(scaling_available_governors);
+        return ret;
+    }
+    ret = copy_to_guest(op->get_para.scaling_available_governors,
+                scaling_available_governors, gov_num * CPUFREQ_NAME_LEN);
+    xfree(scaling_available_governors);
+    if ( ret )
+        return ret;
+
+    /* prefer the driver's live reading when available */
+    op->get_para.cpuinfo_cur_freq =
+        cpufreq_driver->get ? cpufreq_driver->get(op->cpuid) : policy->cur;
+    op->get_para.cpuinfo_max_freq = policy->cpuinfo.max_freq;
+    op->get_para.cpuinfo_min_freq = policy->cpuinfo.min_freq;
+    op->get_para.scaling_cur_freq = policy->cur;
+    op->get_para.scaling_max_freq = policy->max;
+    op->get_para.scaling_min_freq = policy->min;
+
+    if ( cpufreq_driver->name )
+        strlcpy(op->get_para.scaling_driver, 
+            cpufreq_driver->name, CPUFREQ_NAME_LEN);
+    else
+        strlcpy(op->get_para.scaling_driver, "Unknown", CPUFREQ_NAME_LEN);
+
+    if ( policy->governor->name )
+        strlcpy(op->get_para.scaling_governor, 
+            policy->governor->name, CPUFREQ_NAME_LEN);
+    else
+        strlcpy(op->get_para.scaling_governor, "Unknown", CPUFREQ_NAME_LEN);
+
+    /* governor specific para */
+    if ( !strnicmp(op->get_para.scaling_governor, 
+                   "userspace", CPUFREQ_NAME_LEN) )
+    {
+        op->get_para.u.userspace.scaling_setspeed = policy->cur;
+    }
+
+    if ( !strnicmp(op->get_para.scaling_governor, 
+                   "ondemand", CPUFREQ_NAME_LEN) )
+    {
+        ret = get_cpufreq_ondemand_para(
+            &op->get_para.u.ondemand.sampling_rate_max,
+            &op->get_para.u.ondemand.sampling_rate_min,
+            &op->get_para.u.ondemand.sampling_rate,
+            &op->get_para.u.ondemand.up_threshold); 
+    }
+
+    return ret;
+}
+
+/*
+ * Switch the scaling governor for op->cpuid to the one named in
+ * op->set_gov.scaling_governor.  Builds a copy of the current policy
+ * with only the governor changed and hands it to __cpufreq_set_policy.
+ */
+static int set_cpufreq_gov(struct xen_sysctl_pm_op *op)
+{
+    struct cpufreq_policy new_policy, *old_policy;
+
+    if ( !op || !cpu_online(op->cpuid) )
+        return -EINVAL;
+
+    old_policy = cpufreq_cpu_policy[op->cpuid];
+    if ( !old_policy )
+        return -EINVAL;
+
+    memcpy(&new_policy, old_policy, sizeof(struct cpufreq_policy));
+
+    /* unknown governor name -> no change */
+    new_policy.governor = __find_governor(op->set_gov.scaling_governor);
+    if (new_policy.governor == NULL)
+        return -EINVAL;
+
+    return __cpufreq_set_policy(old_policy, &new_policy);
+}
+
+/*
+ * Set one cpufreq tunable on op->cpuid, dispatching on ctrl_type:
+ * min/max frequency (via a policy update), userspace setspeed, and the
+ * ondemand governor's sampling rate / up threshold.  Governor-specific
+ * settings are rejected (-EINVAL) unless the matching governor is active.
+ */
+static int set_cpufreq_para(struct xen_sysctl_pm_op *op)
+{
+    int ret = 0;
+    struct cpufreq_policy *policy;
+
+    if ( !op || !cpu_online(op->cpuid) )
+        return -EINVAL;
+    policy = cpufreq_cpu_policy[op->cpuid];
+
+    if ( !policy || !policy->governor )
+        return -EINVAL;
+
+    switch(op->set_para.ctrl_type)
+    {
+    case SCALING_MAX_FREQ:
+    {
+        struct cpufreq_policy new_policy;
+
+        memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
+        new_policy.max = op->set_para.ctrl_value;
+        ret = __cpufreq_set_policy(policy, &new_policy);
+
+        break;
+    }
+
+    case SCALING_MIN_FREQ:
+    {
+        struct cpufreq_policy new_policy;
+
+        memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
+        new_policy.min = op->set_para.ctrl_value;
+        ret = __cpufreq_set_policy(policy, &new_policy);
+
+        break;
+    }
+
+    case SCALING_SETSPEED:
+    {
+        /* (style: missing space after '=' kept as-is in this diff) */
+        unsigned int freq =op->set_para.ctrl_value;
+
+        /* only meaningful under the userspace governor */
+        if ( !strnicmp(policy->governor->name,
+                       "userspace", CPUFREQ_NAME_LEN) )
+            ret = write_userspace_scaling_setspeed(op->cpuid, freq);
+        else
+            ret = -EINVAL;
+
+        break;
+    }
+
+    case SAMPLING_RATE:
+    {
+        unsigned int sampling_rate = op->set_para.ctrl_value;
+
+        /* only meaningful under the ondemand governor */
+        if ( !strnicmp(policy->governor->name,
+                       "ondemand", CPUFREQ_NAME_LEN) )
+            ret = write_ondemand_sampling_rate(sampling_rate);
+        else
+            ret = -EINVAL;
+
+        break;
+    }
+
+    case UP_THRESHOLD:
+    {
+        unsigned int up_threshold = op->set_para.ctrl_value;
+
+        if ( !strnicmp(policy->governor->name,
+                       "ondemand", CPUFREQ_NAME_LEN) )
+            ret = write_ondemand_up_threshold(up_threshold);
+        else
+            ret = -EINVAL;
+
+        break;
+    }
+
+    default:
+        ret = -EINVAL;
+        break;
+    }
+
+    return ret;
+}
+
+/*
+ * Report the driver's average-frequency estimate for op->cpuid into
+ * op->get_avgfreq.  Returns -EINVAL for a NULL op or offline CPU.
+ */
+static int get_cpufreq_avgfreq(struct xen_sysctl_pm_op *op)
+{
+    if ( !op || !cpu_online(op->cpuid) )
+        return -EINVAL;
+
+    op->get_avgfreq = cpufreq_driver_getavg(op->cpuid, USR_GETAVG);
+
+    return 0;
+}
+
+/*
+ * Copy per-CPU core/socket topology IDs to the guest-supplied arrays.
+ * Offline CPUs get INVALID_TOPOLOGY_ID.  op->get_topo.nr_cpus is set to
+ * one past the highest online CPU index seen within the copied range.
+ */
+static int get_cputopo (struct xen_sysctl_pm_op *op)
+{
+    uint32_t i, nr_cpus;
+    XEN_GUEST_HANDLE_64(uint32) cpu_to_core_arr;
+    XEN_GUEST_HANDLE_64(uint32) cpu_to_socket_arr;
+    int arr_size, ret=0;
+
+    cpu_to_core_arr = op->get_topo.cpu_to_core;
+    cpu_to_socket_arr = op->get_topo.cpu_to_socket;
+    /* never copy beyond what the hypervisor supports */
+    arr_size= min_t(uint32_t, op->get_topo.max_cpus, NR_CPUS);
+
+    if ( guest_handle_is_null( cpu_to_core_arr ) ||
+            guest_handle_is_null(  cpu_to_socket_arr) )
+    {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    nr_cpus = 0;
+    for ( i = 0; i < arr_size; i++ )
+    {
+        uint32_t core, socket;
+        if ( cpu_online(i) )
+        {
+            core = cpu_to_core(i);
+            socket = cpu_to_socket(i);
+            /* track the highest online CPU index */
+            nr_cpus = i;
+        }
+        else
+        {
+            core = socket = INVALID_TOPOLOGY_ID;
+        }
+
+        if ( copy_to_guest_offset(cpu_to_core_arr, i, &core, 1) ||
+                copy_to_guest_offset(cpu_to_socket_arr, i, &socket, 1))
+        {
+            ret = -EFAULT;
+            goto out;
+        }
+    }
+
+    op->get_topo.nr_cpus = nr_cpus + 1;
+out:
+    return ret;
+}
+
+/*
+ * Top-level dispatcher for the XEN_SYSCTL pm_op hypercall: validates the
+ * target CPU and (for cpufreq sub-ops) P-state support, then routes to
+ * the per-command handler.  Returns -ENOSYS for unknown commands.
+ */
+int do_pm_op(struct xen_sysctl_pm_op *op)
+{
+    int ret = 0;
+    const struct processor_pminfo *pmpt;
+
+    if ( !op || !cpu_online(op->cpuid) )
+        return -EINVAL;
+    pmpt = processor_pminfo[op->cpuid];
+
+    /* category-wide precondition checks */
+    switch ( op->cmd & PM_PARA_CATEGORY_MASK )
+    {
+    case CPUFREQ_PARA:
+        if ( !(xen_processor_pmbits & XEN_PROCESSOR_PM_PX) )
+            return -ENODEV;
+        if ( !pmpt || !(pmpt->perf.init & XEN_PX_INIT) )
+            return -EINVAL;
+        break;
+    }
+
+    switch ( op->cmd )
+    {
+    case GET_CPUFREQ_PARA:
+    {
+        ret = get_cpufreq_para(op);
+        break;
+    }
+
+    case SET_CPUFREQ_GOV:
+    {
+        ret = set_cpufreq_gov(op);
+        break;
+    }
+
+    case SET_CPUFREQ_PARA:
+    {
+        ret = set_cpufreq_para(op);
+        break;
+    }
+
+    case GET_CPUFREQ_AVGFREQ:
+    {
+        ret = get_cpufreq_avgfreq(op);
+        break;
+    }
+
+    case XEN_SYSCTL_pm_op_get_cputopo:
+    {
+        ret = get_cputopo(op);
+        break;
+    }
+
+    case XEN_SYSCTL_pm_op_set_sched_opt_smt:
+    {
+        uint32_t saved_value;
+
+        /* exchange: return the previous setting to the caller */
+        saved_value = sched_smt_power_savings;
+        sched_smt_power_savings = !!op->set_sched_opt_smt;
+        op->set_sched_opt_smt = saved_value;
+
+        break;
+    }
+
+    default:
+        printk("not defined sub-hypercall @ do_pm_op\n");
+        ret = -ENOSYS;
+        break;
+    }
+
+    return ret;
+}
index 95edc1eb327e736f09b75950f49e738da0c92aa6..69ea8afeead81eef99bcaa3091b759c935e4b872 100644 (file)
@@ -21,7 +21,7 @@ void acpi_reboot(void)
         * on a device on bus 0. */
        switch (rr->space_id) {
        case ACPI_ADR_SPACE_PCI_CONFIG:
-               printk("Resetting with ACPI PCI RESET_REG.");
+               printk("Resetting with ACPI PCI RESET_REG.\n");
                /* Write the value that resets us. */
                pci_conf_write8(0,
                                (rr->address >> 32) & 31,
@@ -31,7 +31,7 @@ void acpi_reboot(void)
                break;
        case ACPI_ADR_SPACE_SYSTEM_MEMORY:
        case ACPI_ADR_SPACE_SYSTEM_IO:
-               printk("ACPI MEMORY or I/O RESET_REG.");
+               printk("Resetting with ACPI MEMORY or I/O RESET_REG.\n");
                acpi_hw_low_level_write(8, reset_value, rr);
                break;
        }
index 203f8459fc75d95f6731a6c474aee4d0f1ad3bc7..f6ce51aad8ab5a3ff64a8147ef410a3b7eb6a003 100644 (file)
@@ -315,6 +315,12 @@ static void serial_rx(char c, struct cpu_user_regs *regs)
     __serial_rx(c, regs);
 }
 
+static void notify_dom0_con_ring(unsigned long unused)
+{
+    send_guest_global_virq(dom0, VIRQ_CON_RING);
+}
+static DECLARE_TASKLET(notify_dom0_con_ring_tasklet, notify_dom0_con_ring, 0);
+
 static long guest_console_write(XEN_GUEST_HANDLE(char) buffer, int count)
 {
     char kbuf[128], *kptr;
@@ -348,7 +354,7 @@ static long guest_console_write(XEN_GUEST_HANDLE(char) buffer, int count)
         {
             for ( kptr = kbuf; *kptr != '\0'; kptr++ )
                 putchar_console_ring(*kptr);
-            send_guest_global_virq(dom0, VIRQ_CON_RING);
+            tasklet_schedule(&notify_dom0_con_ring_tasklet);
         }
 
         spin_unlock_irq(&console_lock);
@@ -414,6 +420,8 @@ long do_console_io(int cmd, int count, XEN_GUEST_HANDLE(char) buffer)
  * *****************************************************
  */
 
+static bool_t console_locks_busted;
+
 static void __putstr(const char *str)
 {
     int c;
@@ -423,10 +431,12 @@ static void __putstr(const char *str)
     sercon_puts(str);
     vga_puts(str);
 
-    while ( (c = *str++) != '\0' )
-        putchar_console_ring(c);
-
-    send_guest_global_virq(dom0, VIRQ_CON_RING);
+    if ( !console_locks_busted )
+    {
+        while ( (c = *str++) != '\0' )
+            putchar_console_ring(c);
+        tasklet_schedule(&notify_dom0_con_ring_tasklet);
+    }
 }
 
 static int printk_prefix_check(char *p, char **pp)
@@ -659,6 +669,7 @@ void console_force_unlock(void)
 {
     spin_lock_init(&console_lock);
     serial_force_unlock(sercon_handle);
+    console_locks_busted = 1;
     console_start_sync();
 }
 
@@ -885,7 +896,7 @@ static int __init debugtrace_init(void)
         return 0;
 
     order = get_order_from_bytes(bytes);
-    debugtrace_buf = alloc_xenheap_pages(order);
+    debugtrace_buf = alloc_xenheap_pages(order, 0);
     ASSERT(debugtrace_buf != NULL);
 
     memset(debugtrace_buf, '\0', bytes);
@@ -927,7 +938,7 @@ void panic(const char *fmt, ...)
     console_start_sync();
     printk("\n****************************************\n");
     printk("Panic on CPU %d:\n", smp_processor_id());
-    printk(buf);
+    printk("%s", buf);
     printk("****************************************\n\n");
     if ( opt_noreboot )
         printk("Manual reset required ('noreboot' specified)\n");
index a66a4999f222b0aa662a8c03787949aa102e20f5..1f889070083d5443e481bc86d30b60354c407319 100644 (file)
 #include <asm/io.h>
 
 /*
- * Configure serial port with a string <baud>,DPS,<io-base>,<irq>.
+ * Configure serial port with a string:
+ *   <baud>[/<clock_hz>][,DPS[,<io-base>[,<irq>]]].
  * The tail of the string can be omitted if platform defaults are sufficient.
  * If the baud rate is pre-configured, perhaps by a bootloader, then 'auto'
- * can be specified in place of a numeric baud rate.
+ * can be specified in place of a numeric baud rate. Polled mode is specified
+ * by requesting irq 0.
  */
 static char opt_com1[30] = "", opt_com2[30] = "";
 string_param("com1", opt_com1);
 string_param("com2", opt_com2);
 
 static struct ns16550 {
-    int baud, data_bits, parity, stop_bits, irq;
+    int baud, clock_hz, data_bits, parity, stop_bits, irq;
     unsigned long io_base;   /* I/O port or memory-mapped I/O address. */
     char *remapped_io_base;  /* Remapped virtual address of mmap I/O.  */ 
     /* UART with IRQ line: interrupt-driven I/O. */
@@ -192,7 +194,7 @@ static void __devinit ns16550_init_preirq(struct serial_port *port)
     if ( uart->baud != BAUD_AUTO )
     {
         /* Baud rate specified: program it into the divisor latch. */
-        divisor = UART_CLOCK_HZ / (uart->baud * 16);
+        divisor = uart->clock_hz / (uart->baud << 4);
         ns_write_reg(uart, DLL, (char)divisor);
         ns_write_reg(uart, DLM, (char)(divisor >> 8));
     }
@@ -201,7 +203,7 @@ static void __devinit ns16550_init_preirq(struct serial_port *port)
         /* Baud rate already set: read it out from the divisor latch. */
         divisor  = ns_read_reg(uart, DLL);
         divisor |= ns_read_reg(uart, DLM) << 8;
-        uart->baud = UART_CLOCK_HZ / (divisor * 16);
+        uart->baud = uart->clock_hz / (divisor << 4);
     }
     ns_write_reg(uart, LCR, lcr);
 
@@ -300,6 +302,13 @@ static int check_existence(struct ns16550 *uart)
 {
     unsigned char status, scratch, scratch2, scratch3;
 
+    /*
+     * We can't poke MMIO UARTs until they get I/O remapped later. Assume that
+     * if we're getting MMIO UARTs, the arch code knows what it's doing.
+     */
+    if ( uart->io_base >= 0x10000 )
+        return 1;
+
     /*
      * Do a simple existence test first; if we fail this,
      * there's no point trying anything else.
@@ -355,6 +364,12 @@ static void __init ns16550_parse_port_config(
     else if ( (baud = simple_strtoul(conf, &conf, 10)) != 0 )
         uart->baud = baud;
 
+    if ( *conf == '/')
+    {
+        conf++;
+        uart->clock_hz = simple_strtoul(conf, &conf, 0) << 4;
+    }
+
     if ( *conf != ',' )
         goto config_parsed;
     conf++;
@@ -408,6 +423,7 @@ void __init ns16550_init(int index, struct ns16550_defaults *defaults)
     uart->baud      = (defaults->baud ? :
                        console_has((index == 0) ? "com1" : "com2")
                        ? BAUD_AUTO : 0);
+    uart->clock_hz  = UART_CLOCK_HZ;
     uart->data_bits = defaults->data_bits;
     uart->parity    = parse_parity_char(defaults->parity);
     uart->stop_bits = defaults->stop_bits;
index 9628b2c96be1ca7fade19605c3a55fc4558d2201..3d261caa47a2cf2f973394e0df2dbcebe66dec19 100644 (file)
@@ -74,7 +74,7 @@ void serial_tx_interrupt(struct serial_port *port, struct cpu_user_regs *regs)
     while ( !spin_trylock(&port->tx_lock) )
     {
         if ( !port->driver->tx_empty(port) )
-            return;
+            goto out;
         cpu_relax();
     }
 
@@ -89,7 +89,10 @@ void serial_tx_interrupt(struct serial_port *port, struct cpu_user_regs *regs)
         }
     }
 
-    spin_unlock_irqrestore(&port->tx_lock, flags);
+    spin_unlock(&port->tx_lock);
+
+ out:
+    local_irq_restore(flags);
 }
 
 static void __serial_putc(struct serial_port *port, char c)
@@ -468,7 +471,7 @@ void serial_suspend(void)
     int i, irq;
     for ( i = 0; i < ARRAY_SIZE(com); i++ )
         if ( (irq = serial_irq(i)) >= 0 )
-            free_irq(irq);
+            release_irq(irq);
 }
 
 void serial_resume(void)
@@ -492,7 +495,7 @@ void serial_async_transmit(struct serial_port *port)
     BUG_ON(!port->driver->tx_empty);
     if ( port->txbuf == NULL )
         port->txbuf = alloc_xenheap_pages(
-            get_order_from_bytes(serial_txbufsz));
+            get_order_from_bytes(serial_txbufsz), 0);
 }
 
 /*
index c91c25b715c2abf616c2f566567920887539104e..b87d12777fa8a93fe616d37a5edb86aa793b8bb8 100644 (file)
@@ -1,3 +1,4 @@
 obj-y += cpufreq.o
 obj-y += cpufreq_ondemand.o
+obj-y += cpufreq_misc_governors.o
 obj-y += utility.o
index add3f2daf91b8bd6bd2b1faccb6848468a0408b2..efb805b01cda26592055061ab2473b423e6f8981 100644 (file)
 #include <xen/errno.h>
 #include <xen/delay.h>
 #include <xen/cpumask.h>
+#include <xen/list.h>
 #include <xen/sched.h>
+#include <xen/string.h>
 #include <xen/timer.h>
 #include <xen/xmalloc.h>
+#include <xen/guest_access.h>
 #include <xen/domain.h>
 #include <asm/bug.h>
 #include <asm/io.h>
 #include <acpi/acpi.h>
 #include <acpi/cpufreq/cpufreq.h>
 
-/* TODO: change to link list later as domain number may be sparse */
-static cpumask_t cpufreq_dom_map[NR_CPUS];
+static unsigned int usr_max_freq, usr_min_freq;
+static void cpufreq_cmdline_common_para(struct cpufreq_policy *new_policy);
+
+struct cpufreq_dom {
+    unsigned int       dom;
+    cpumask_t          map;
+    struct list_head   node;
+};
+static LIST_HEAD(cpufreq_dom_list_head);
+
+struct cpufreq_governor *cpufreq_opt_governor;
+LIST_HEAD(cpufreq_governor_list);
+
+struct cpufreq_governor *__find_governor(const char *governor)
+{
+    struct cpufreq_governor *t;
+
+    if (!governor)
+        return NULL;
+
+    list_for_each_entry(t, &cpufreq_governor_list, governor_list)
+        if (!strnicmp(governor, t->name, CPUFREQ_NAME_LEN))
+            return t;
+
+    return NULL;
+}
+
+int cpufreq_register_governor(struct cpufreq_governor *governor)
+{
+    if (!governor)
+        return -EINVAL;
+
+    if (__find_governor(governor->name) != NULL)
+        return -EEXIST;
+
+    list_add(&governor->governor_list, &cpufreq_governor_list);
+    return 0;
+}
+
+int cpufreq_unregister_governor(struct cpufreq_governor *governor)
+{
+    int cpu = smp_processor_id();
+    struct cpufreq_policy *policy = cpufreq_cpu_policy[cpu];
+
+    if (!governor || !policy)
+        return -EINVAL;
+
+    /* error if unregister current cpufreq governor */
+    if (governor == policy->governor)
+        return -EBUSY;
+
+    if (__find_governor(governor->name) == NULL)
+        return -ENOENT;
+
+    list_del(&governor->governor_list);
+    return 0;
+}
 
 int cpufreq_limit_change(unsigned int cpu)
 {
@@ -71,48 +129,80 @@ int cpufreq_add_cpu(unsigned int cpu)
 {
     int ret = 0;
     unsigned int firstcpu;
-    unsigned int dom;
+    unsigned int dom, domexist = 0;
     unsigned int j;
+    struct list_head *pos;
+    struct cpufreq_dom *cpufreq_dom = NULL;
     struct cpufreq_policy new_policy;
     struct cpufreq_policy *policy;
     struct processor_performance *perf = &processor_pminfo[cpu]->perf;
 
     /* to protect the case when Px was not controlled by xen */
-    if (!processor_pminfo[cpu] || !(perf->init & XEN_PX_INIT))
-        return 0;
-
-    if (!cpu_online(cpu) || cpufreq_cpu_policy[cpu])
+    if (!processor_pminfo[cpu]      ||
+        !(perf->init & XEN_PX_INIT) ||
+        !cpu_online(cpu))
         return -EINVAL;
 
+    if (cpufreq_cpu_policy[cpu])
+        return 0;
+
     ret = cpufreq_statistic_init(cpu);
     if (ret)
         return ret;
 
     dom = perf->domain_info.domain;
-    if (cpus_weight(cpufreq_dom_map[dom])) {
+
+    list_for_each(pos, &cpufreq_dom_list_head) {
+        cpufreq_dom = list_entry(pos, struct cpufreq_dom, node);
+        if (dom == cpufreq_dom->dom) {
+            domexist = 1;
+            break;
+        }
+    }
+
+    if (domexist) {
         /* share policy with the first cpu since on same boat */
-        firstcpu = first_cpu(cpufreq_dom_map[dom]);
+        firstcpu = first_cpu(cpufreq_dom->map);
         policy = cpufreq_cpu_policy[firstcpu];
 
         cpufreq_cpu_policy[cpu] = policy;
-        cpu_set(cpu, cpufreq_dom_map[dom]);
+        cpu_set(cpu, cpufreq_dom->map);
         cpu_set(cpu, policy->cpus);
 
+        /* domain coordination sanity check */
+        if ((perf->domain_info.coord_type !=
+             processor_pminfo[firstcpu]->perf.domain_info.coord_type) ||
+            (perf->domain_info.num_processors !=
+             processor_pminfo[firstcpu]->perf.domain_info.num_processors)) {
+            ret = -EINVAL;
+            goto err2;
+        }
+
         printk(KERN_EMERG"adding CPU %u\n", cpu);
     } else {
+        cpufreq_dom = xmalloc(struct cpufreq_dom);
+        if (!cpufreq_dom) {
+            cpufreq_statistic_exit(cpu);
+            return -ENOMEM;
+        }
+        memset(cpufreq_dom, 0, sizeof(struct cpufreq_dom));
+        cpufreq_dom->dom = dom;
+        cpu_set(cpu, cpufreq_dom->map);
+        list_add(&cpufreq_dom->node, &cpufreq_dom_list_head);
+
         /* for the first cpu, setup policy and do init work */
         policy = xmalloc(struct cpufreq_policy);
         if (!policy) {
+            list_del(&cpufreq_dom->node);
+            xfree(cpufreq_dom);
             cpufreq_statistic_exit(cpu);
             return -ENOMEM;
         }
         memset(policy, 0, sizeof(struct cpufreq_policy));
-
-        cpufreq_cpu_policy[cpu] = policy;
-        cpu_set(cpu, cpufreq_dom_map[dom]);
+        policy->cpu = cpu;
         cpu_set(cpu, policy->cpus);
+        cpufreq_cpu_policy[cpu] = policy;
 
-        policy->cpu = cpu;
         ret = cpufreq_driver->init(policy);
         if (ret)
             goto err1;
@@ -123,13 +213,28 @@ int cpufreq_add_cpu(unsigned int cpu)
      * After get full cpumap of the coordination domain,
      * we can safely start gov here.
      */
-    if (cpus_weight(cpufreq_dom_map[dom]) ==
+    if (cpus_weight(cpufreq_dom->map) ==
         perf->domain_info.num_processors) {
         memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
         policy->governor = NULL;
+
+        cpufreq_cmdline_common_para(&new_policy);
+
         ret = __cpufreq_set_policy(policy, &new_policy);
-        if (ret)
-            goto err2;
+        if (ret) {
+            if (new_policy.governor == CPUFREQ_DEFAULT_GOVERNOR)
+                /* if default governor fail, cpufreq really meet troubles */
+                goto err2;
+            else {
+                /* grub option governor fail */
+                /* give one more chance to default gov */
+                memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
+                new_policy.governor = CPUFREQ_DEFAULT_GOVERNOR;
+                ret = __cpufreq_set_policy(policy, &new_policy);
+                if (ret)
+                    goto err2;
+            }
+        }
     }
 
     return 0;
@@ -137,58 +242,83 @@ int cpufreq_add_cpu(unsigned int cpu)
 err2:
     cpufreq_driver->exit(policy);
 err1:
-    for_each_cpu_mask(j, cpufreq_dom_map[dom]) {
+    for_each_cpu_mask(j, cpufreq_dom->map) {
         cpufreq_cpu_policy[j] = NULL;
         cpufreq_statistic_exit(j);
     }
 
-    cpus_clear(cpufreq_dom_map[dom]);
+    list_del(&cpufreq_dom->node);
+    xfree(cpufreq_dom);
     xfree(policy);
     return ret;
 }
 
 int cpufreq_del_cpu(unsigned int cpu)
 {
-    unsigned int dom;
+    unsigned int dom, domexist = 0;
+    struct list_head *pos;
+    struct cpufreq_dom *cpufreq_dom = NULL;
     struct cpufreq_policy *policy;
     struct processor_performance *perf = &processor_pminfo[cpu]->perf;
 
     /* to protect the case when Px was not controlled by xen */
-    if (!processor_pminfo[cpu] || !(perf->init & XEN_PX_INIT))
-        return 0;
-
-    if (!cpu_online(cpu) || !cpufreq_cpu_policy[cpu])
+    if (!processor_pminfo[cpu]      ||
+        !(perf->init & XEN_PX_INIT) ||
+        !cpu_online(cpu))
         return -EINVAL;
 
+    if (!cpufreq_cpu_policy[cpu])
+        return 0;
+
     dom = perf->domain_info.domain;
     policy = cpufreq_cpu_policy[cpu];
 
-    printk(KERN_EMERG"deleting CPU %u\n", cpu);
+    list_for_each(pos, &cpufreq_dom_list_head) {
+        cpufreq_dom = list_entry(pos, struct cpufreq_dom, node);
+        if (dom == cpufreq_dom->dom) {
+            domexist = 1;
+            break;
+        }
+    }
+
+    if (!domexist)
+        return -EINVAL;
 
     /* for the first cpu of the domain, stop gov */
-    if (cpus_weight(cpufreq_dom_map[dom]) ==
+    if (cpus_weight(cpufreq_dom->map) ==
         perf->domain_info.num_processors)
         __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
 
     cpufreq_cpu_policy[cpu] = NULL;
     cpu_clear(cpu, policy->cpus);
-    cpu_clear(cpu, cpufreq_dom_map[dom]);
+    cpu_clear(cpu, cpufreq_dom->map);
     cpufreq_statistic_exit(cpu);
 
     /* for the last cpu of the domain, clean room */
     /* It's safe here to free freq_table, drv_data and policy */
-    if (!cpus_weight(cpufreq_dom_map[dom])) {
+    if (!cpus_weight(cpufreq_dom->map)) {
         cpufreq_driver->exit(policy);
+        list_del(&cpufreq_dom->node);
+        xfree(cpufreq_dom);
         xfree(policy);
     }
 
+    printk(KERN_EMERG"deleting CPU %u\n", cpu);
     return 0;
 }
 
+static void print_PCT(struct xen_pct_register *ptr)
+{
+    printk(KERN_INFO "\t_PCT: descriptor=%d, length=%d, space_id=%d, "
+            "bit_width=%d, bit_offset=%d, reserved=%d, address=%"PRId64"\n",
+            ptr->descriptor, ptr->length, ptr->space_id, ptr->bit_width, 
+            ptr->bit_offset, ptr->reserved, ptr->address);
+}
+
 static void print_PSS(struct xen_processor_px *ptr, int count)
 {
     int i;
-    printk(KERN_INFO "\t_PSS:\n");
+    printk(KERN_INFO "\t_PSS: state_count=%d\n", count);
     for (i=0; i<count; i++){
         printk(KERN_INFO "\tState%d: %"PRId64"MHz %"PRId64"mW %"PRId64"us "
                "%"PRId64"us 0x%"PRIx64" 0x%"PRIx64"\n",
@@ -211,20 +341,19 @@ static void print_PSD( struct xen_psd_package *ptr)
             ptr->num_processors);
 }
 
+static void print_PPC(unsigned int platform_limit)
+{
+    printk(KERN_INFO "\t_PPC: %d\n", platform_limit);
+}
+
 int set_px_pminfo(uint32_t acpi_id, struct xen_processor_performance *dom0_px_info)
 {
     int ret=0, cpuid;
     struct processor_pminfo *pmpt;
     struct processor_performance *pxpt;
 
-    if ( !(xen_processor_pmbits & XEN_PROCESSOR_PM_PX) )
-    {
-        ret = -ENOSYS;
-        goto out;
-    }
-
     cpuid = get_cpu_id(acpi_id);
-    if ( cpuid < 0 )
+    if ( cpuid < 0 || !dom0_px_info)
     {
         ret = -EINVAL;
         goto out;
@@ -250,45 +379,83 @@ int set_px_pminfo(uint32_t acpi_id, struct xen_processor_performance *dom0_px_in
 
     if ( dom0_px_info->flags & XEN_PX_PCT )
     {
+        /* space_id check */
+        if (dom0_px_info->control_register.space_id != 
+            dom0_px_info->status_register.space_id)
+        {
+            ret = -EINVAL;
+            goto out;
+        }
+
+#ifdef CONFIG_IA64
+        /* for IA64, currently it only supports FFH */
+        if (dom0_px_info->control_register.space_id !=
+            ACPI_ADR_SPACE_FIXED_HARDWARE)
+        {
+            ret = -EINVAL;
+            goto out;
+        }
+#endif
+
         memcpy ((void *)&pxpt->control_register,
                 (void *)&dom0_px_info->control_register,
                 sizeof(struct xen_pct_register));
         memcpy ((void *)&pxpt->status_register,
                 (void *)&dom0_px_info->status_register,
                 sizeof(struct xen_pct_register));
+        print_PCT(&pxpt->control_register);
+        print_PCT(&pxpt->status_register);
     }
+
     if ( dom0_px_info->flags & XEN_PX_PSS ) 
     {
-        if ( !(pxpt->states = xmalloc_array(struct xen_processor_px,
-                        dom0_px_info->state_count)) )
+        /* capability check */
+        if (dom0_px_info->state_count <= 1)
         {
-            ret = -ENOMEM;
+            ret = -EINVAL;
             goto out;
         }
-        if ( xenpf_copy_px_states(pxpt, dom0_px_info) )
+
+        if ( !(pxpt->states = xmalloc_array(struct xen_processor_px,
+                        dom0_px_info->state_count)) )
         {
-            xfree(pxpt->states);
-            ret = -EFAULT;
+            ret = -ENOMEM;
             goto out;
         }
+        copy_from_guest(pxpt->states, dom0_px_info->states, 
+                                      dom0_px_info->state_count);
         pxpt->state_count = dom0_px_info->state_count;
         print_PSS(pxpt->states,pxpt->state_count);
     }
+
     if ( dom0_px_info->flags & XEN_PX_PSD )
     {
+#ifdef CONFIG_X86
+        /* for X86, check domain coordination */
+        /* for IA64, _PSD is optional for current IA64 cpufreq algorithm */
+        if (dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_ALL &&
+            dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_ANY &&
+            dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_HW)
+        {
+            ret = -EINVAL;
+            goto out;
+        }
+#endif
+
         pxpt->shared_type = dom0_px_info->shared_type;
         memcpy ((void *)&pxpt->domain_info,
                 (void *)&dom0_px_info->domain_info,
                 sizeof(struct xen_psd_package));
         print_PSD(&pxpt->domain_info);
     }
+
     if ( dom0_px_info->flags & XEN_PX_PPC )
     {
         pxpt->platform_limit = dom0_px_info->platform_limit;
+        print_PPC(pxpt->platform_limit);
 
         if ( pxpt->init == XEN_PX_INIT )
         {
-
             ret = cpufreq_limit_change(cpuid); 
             goto out;
         }
@@ -307,3 +474,69 @@ out:
     return ret;
 }
 
+static void cpufreq_cmdline_common_para(struct cpufreq_policy *new_policy)
+{
+    if (usr_max_freq)
+        new_policy->max = usr_max_freq;
+    if (usr_min_freq)
+        new_policy->min = usr_min_freq;
+}
+
+static int __init cpufreq_handle_common_option(const char *name, const char *val)
+{
+    if (!strcmp(name, "maxfreq") && val) {
+        usr_max_freq = simple_strtoul(val, NULL, 0);
+        return 1;
+    }
+
+    if (!strcmp(name, "minfreq") && val) {
+        usr_min_freq = simple_strtoul(val, NULL, 0);
+        return 1;
+    }
+
+    return 0;
+}
+
+void __init cpufreq_cmdline_parse(char *str)
+{
+    static struct cpufreq_governor *__initdata cpufreq_governors[] =
+    {
+        &cpufreq_gov_userspace,
+        &cpufreq_gov_dbs,
+        &cpufreq_gov_performance,
+        &cpufreq_gov_powersave
+    };
+    unsigned int gov_index = 0;
+
+    do {
+        char *val, *end = strchr(str, ',');
+        unsigned int i;
+
+        if (end)
+            *end++ = '\0';
+        val = strchr(str, '=');
+        if (val)
+            *val++ = '\0';
+
+        if (!cpufreq_opt_governor) {
+            if (!val) {
+                for (i = 0; i < ARRAY_SIZE(cpufreq_governors); ++i) {
+                    if (!strcmp(str, cpufreq_governors[i]->name)) {
+                        cpufreq_opt_governor = cpufreq_governors[i];
+                        gov_index = i;
+                        str = NULL;
+                        break;
+                    }
+                }
+            } else {
+                cpufreq_opt_governor = CPUFREQ_DEFAULT_GOVERNOR;
+            }
+        }
+
+        if (str && !cpufreq_handle_common_option(str, val) &&
+            cpufreq_governors[gov_index]->handle_option)
+            cpufreq_governors[gov_index]->handle_option(str, val);
+
+        str = end;
+    } while (str);
+}
diff --git a/xen/drivers/cpufreq/cpufreq_misc_governors.c b/xen/drivers/cpufreq/cpufreq_misc_governors.c
new file mode 100644 (file)
index 0000000..1c63ec1
--- /dev/null
@@ -0,0 +1,200 @@
+/*
+ *  xen/drivers/cpufreq/cpufreq_misc_gov.c
+ *
+ *  Copyright (C)  2001 Russell King
+ *            (C)  2002 - 2004 Dominik Brodowski <linux@brodo.de>
+ *
+ *     Nov 2008 Liu Jinsong <jinsong.liu@intel.com>
+ *     Porting cpufreq_userspace.c, cpufreq_performance.c, and 
+ *     cpufreq_powersave.c from Liunx 2.6.23 to Xen hypervisor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <xen/init.h>
+#include <xen/sched.h>
+#include <acpi/cpufreq/cpufreq.h>
+
+/*
+ * cpufreq userspace governor
+ */
+static unsigned int cpu_set_freq[NR_CPUS];
+
+static int cpufreq_governor_userspace(struct cpufreq_policy *policy,
+                                      unsigned int event)
+{
+    int ret = 0;
+    unsigned int cpu;
+
+    if (unlikely(!policy) || 
+        unlikely(!cpu_online(cpu = policy->cpu)))
+        return -EINVAL;
+
+    switch (event) {
+    case CPUFREQ_GOV_START:
+        if (!cpu_set_freq[cpu])
+            cpu_set_freq[cpu] = policy->cur;
+        break;
+    case CPUFREQ_GOV_STOP:
+        cpu_set_freq[cpu] = 0;
+        break;
+    case CPUFREQ_GOV_LIMITS:
+        if (policy->max < cpu_set_freq[cpu])
+            ret = __cpufreq_driver_target(policy, policy->max,
+                        CPUFREQ_RELATION_H);
+        else if (policy->min > cpu_set_freq[cpu])
+            ret = __cpufreq_driver_target(policy, policy->min,
+                        CPUFREQ_RELATION_L);
+        else
+            ret = __cpufreq_driver_target(policy, cpu_set_freq[cpu],
+                        CPUFREQ_RELATION_L);
+
+        break;
+    default:
+        ret = -EINVAL;
+        break;
+    }
+
+    return ret;
+}
+
+int write_userspace_scaling_setspeed(unsigned int cpu, unsigned int freq)
+{
+    struct cpufreq_policy *policy = cpufreq_cpu_policy[cpu];
+
+    if (!cpu_online(cpu) || !policy)
+        return -EINVAL;
+
+    cpu_set_freq[cpu] = freq;
+
+    if (freq < policy->min)
+        freq = policy->min;
+    if (freq > policy->max)
+        freq = policy->max;
+
+    return __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
+}
+
+static void __init 
+cpufreq_userspace_handle_option(const char *name, const char *val)
+{
+    if (!strcmp(name, "speed") && val) {
+        unsigned int usr_cmdline_freq;
+        unsigned int cpu;
+
+        usr_cmdline_freq = simple_strtoul(val, NULL, 0);
+        for (cpu = 0; cpu < NR_CPUS; cpu++)
+            cpu_set_freq[cpu] = usr_cmdline_freq;
+    }
+}
+
+struct cpufreq_governor cpufreq_gov_userspace = {
+    .name = "userspace",
+    .governor = cpufreq_governor_userspace,
+    .handle_option = cpufreq_userspace_handle_option
+};
+
+static int __init cpufreq_gov_userspace_init(void)
+{
+    return cpufreq_register_governor(&cpufreq_gov_userspace);
+}
+__initcall(cpufreq_gov_userspace_init);
+
+static void __exit cpufreq_gov_userspace_exit(void)
+{
+    cpufreq_unregister_governor(&cpufreq_gov_userspace);
+}
+__exitcall(cpufreq_gov_userspace_exit);
+
+
+/*
+ * cpufreq performance governor
+ */
+static int cpufreq_governor_performance(struct cpufreq_policy *policy,
+                                      unsigned int event)
+{
+    int ret = 0;
+
+    if (!policy)
+        return -EINVAL;
+
+    switch (event) {
+    case CPUFREQ_GOV_START:
+    case CPUFREQ_GOV_STOP:
+        break;
+    case CPUFREQ_GOV_LIMITS:
+        ret = __cpufreq_driver_target(policy, policy->max,
+                        CPUFREQ_RELATION_H);
+        break;
+    default:
+        ret = -EINVAL;
+        break;
+    }
+
+    return ret;
+}
+
+struct cpufreq_governor cpufreq_gov_performance = {
+    .name = "performance",
+    .governor = cpufreq_governor_performance,
+};
+
+static int __init cpufreq_gov_performance_init(void)
+{
+    return cpufreq_register_governor(&cpufreq_gov_performance);
+}
+__initcall(cpufreq_gov_performance_init);
+
+static void __exit cpufreq_gov_performance_exit(void)
+{
+    cpufreq_unregister_governor(&cpufreq_gov_performance);
+}
+__exitcall(cpufreq_gov_performance_exit);
+
+
+/*
+ * cpufreq powersave governor
+ */
+static int cpufreq_governor_powersave(struct cpufreq_policy *policy,
+                                      unsigned int event)
+{
+    int ret = 0;
+
+    if (!policy)
+        return -EINVAL;
+
+    switch (event) {
+    case CPUFREQ_GOV_START:
+    case CPUFREQ_GOV_STOP:
+        break;
+    case CPUFREQ_GOV_LIMITS:
+        ret = __cpufreq_driver_target(policy, policy->min,
+                        CPUFREQ_RELATION_L);
+        break;
+    default:
+        ret = -EINVAL;
+        break;
+    }
+
+    return ret;
+}
+
+struct cpufreq_governor cpufreq_gov_powersave = {
+    .name = "powersave",
+    .governor = cpufreq_governor_powersave,
+};
+
+static int __init cpufreq_gov_powersave_init(void)
+{
+    return cpufreq_register_governor(&cpufreq_gov_powersave);
+}
+__initcall(cpufreq_gov_powersave_init);
+
+static void __exit cpufreq_gov_powersave_exit(void)
+{
+    cpufreq_unregister_governor(&cpufreq_gov_powersave);
+}
+__exitcall(cpufreq_gov_powersave_exit);
index f1b676c2f4c21ba6b03786dab82d02da6426d1f5..b01312d9afda585dfc570d6e800ec271d9966fe4 100644 (file)
 #include <acpi/cpufreq/cpufreq.h>
 
 #define DEF_FREQUENCY_UP_THRESHOLD              (80)
+#define MIN_FREQUENCY_UP_THRESHOLD              (11)
+#define MAX_FREQUENCY_UP_THRESHOLD              (100)
 
 #define MIN_DBS_INTERVAL                        (MICROSECS(100))
-#define MIN_SAMPLING_MILLISECS                  (20)
-#define MIN_STAT_SAMPLING_RATE                   \
+#define MIN_SAMPLING_RATE_RATIO                 (2)
+#define MIN_SAMPLING_MILLISECS                  (MIN_SAMPLING_RATE_RATIO * 10)
+#define MIN_STAT_SAMPLING_RATE                  \
     (MIN_SAMPLING_MILLISECS * MILLISECS(1))
+#define MIN_SAMPLING_RATE                       \
+    (def_sampling_rate / MIN_SAMPLING_RATE_RATIO)
+#define MAX_SAMPLING_RATE                       (500 * def_sampling_rate)
 #define DEF_SAMPLING_RATE_LATENCY_MULTIPLIER    (1000)
 #define TRANSITION_LATENCY_LIMIT                (10 * 1000 )
 
 static uint64_t def_sampling_rate;
+static uint64_t usr_sampling_rate;
 
 /* Sampling types */
 enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE};
@@ -42,29 +49,50 @@ static unsigned int dbs_enable;    /* number of CPUs using this policy */
 static struct dbs_tuners {
     uint64_t     sampling_rate;
     unsigned int up_threshold;
-    unsigned int ignore_nice;
     unsigned int powersave_bias;
 } dbs_tuners_ins = {
+    .sampling_rate = 0,
     .up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
-    .ignore_nice = 0,
     .powersave_bias = 0,
 };
 
 static struct timer dbs_timer[NR_CPUS];
 
-uint64_t get_cpu_idle_time(unsigned int cpu)
+int write_ondemand_sampling_rate(unsigned int sampling_rate)
 {
-    uint64_t idle_ns;
-    struct vcpu *v;
+    if ( (sampling_rate > MAX_SAMPLING_RATE / MICROSECS(1)) ||
+         (sampling_rate < MIN_SAMPLING_RATE / MICROSECS(1)) )
+        return -EINVAL;
 
-    if ((v = idle_vcpu[cpu]) == NULL)
-        return 0;
+    dbs_tuners_ins.sampling_rate = sampling_rate * MICROSECS(1);
+    return 0;
+}
 
-    idle_ns = v->runstate.time[RUNSTATE_running];
-    if (v->is_running)
-        idle_ns += NOW() - v->runstate.state_entry_time;
+int write_ondemand_up_threshold(unsigned int up_threshold)
+{
+    if ( (up_threshold > MAX_FREQUENCY_UP_THRESHOLD) ||
+         (up_threshold < MIN_FREQUENCY_UP_THRESHOLD) )
+        return -EINVAL;
 
-    return idle_ns;
+    dbs_tuners_ins.up_threshold = up_threshold;
+    return 0;
+}
+
+int get_cpufreq_ondemand_para(uint32_t *sampling_rate_max,
+                              uint32_t *sampling_rate_min,
+                              uint32_t *sampling_rate,
+                              uint32_t *up_threshold)
+{
+    if (!sampling_rate_max || !sampling_rate_min ||
+        !sampling_rate || !up_threshold)
+        return -EINVAL;
+
+    *sampling_rate_max = MAX_SAMPLING_RATE/MICROSECS(1);
+    *sampling_rate_min = MIN_SAMPLING_RATE/MICROSECS(1);
+    *sampling_rate = dbs_tuners_ins.sampling_rate / MICROSECS(1);
+    *up_threshold = dbs_tuners_ins.up_threshold;
+
+    return 0;
 }
 
 static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
@@ -133,9 +161,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
     if (load < (dbs_tuners_ins.up_threshold - 10)) {
         unsigned int freq_next, freq_cur;
 
-        freq_cur = __cpufreq_driver_getavg(policy);
-        if (!freq_cur)
-            freq_cur = policy->cur;
+        freq_cur = cpufreq_driver_getavg(policy->cpu, GOV_GETAVG);
 
         freq_next = (freq_cur * load) / (dbs_tuners_ins.up_threshold - 10);
 
@@ -209,14 +235,27 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event)
          * Start the timerschedule work, when this governor
          * is used for first time
          */
-        if (dbs_enable == 1) {
+        if ((dbs_enable == 1) && !dbs_tuners_ins.sampling_rate) {
             def_sampling_rate = policy->cpuinfo.transition_latency *
                 DEF_SAMPLING_RATE_LATENCY_MULTIPLIER;
 
             if (def_sampling_rate < MIN_STAT_SAMPLING_RATE)
                 def_sampling_rate = MIN_STAT_SAMPLING_RATE;
 
-            dbs_tuners_ins.sampling_rate = def_sampling_rate;
+            if (!usr_sampling_rate)
+                dbs_tuners_ins.sampling_rate = def_sampling_rate;
+            else if (usr_sampling_rate < MIN_SAMPLING_RATE) {
+                printk(KERN_WARNING "cpufreq/ondemand: "
+                       "specified sampling rate too low, using %"PRIu64"\n",
+                       MIN_SAMPLING_RATE);
+                dbs_tuners_ins.sampling_rate = MIN_SAMPLING_RATE;
+            } else if (usr_sampling_rate > MAX_SAMPLING_RATE) {
+                printk(KERN_WARNING "cpufreq/ondemand: "
+                       "specified sampling rate too high, using %"PRIu64"\n",
+                       MAX_SAMPLING_RATE);
+                dbs_tuners_ins.sampling_rate = MAX_SAMPLING_RATE;
+            } else
+                dbs_tuners_ins.sampling_rate = usr_sampling_rate;
         }
         dbs_timer_init(this_dbs_info);
 
@@ -240,7 +279,60 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event)
     return 0;
 }
 
+static void __init cpufreq_dbs_handle_option(const char *name, const char *val)
+{
+    if ( !strcmp(name, "rate") && val )
+    {
+        usr_sampling_rate = simple_strtoull(val, NULL, 0) * MICROSECS(1);
+    }
+    else if ( !strcmp(name, "up_threshold") && val )
+    {
+        unsigned long tmp = simple_strtoul(val, NULL, 0);
+
+        if ( tmp < MIN_FREQUENCY_UP_THRESHOLD )
+        {
+            printk(XENLOG_WARNING "cpufreq/ondemand: "
+                   "specified threshold too low, using %d\n",
+                   MIN_FREQUENCY_UP_THRESHOLD);
+            tmp = MIN_FREQUENCY_UP_THRESHOLD;
+        }
+        else if ( tmp > MAX_FREQUENCY_UP_THRESHOLD )
+        {
+            printk(XENLOG_WARNING "cpufreq/ondemand: "
+                   "specified threshold too high, using %d\n",
+                   MAX_FREQUENCY_UP_THRESHOLD);
+            tmp = MAX_FREQUENCY_UP_THRESHOLD;
+        }
+        dbs_tuners_ins.up_threshold = tmp;
+    }
+    else if ( !strcmp(name, "bias") && val )
+    {
+        unsigned long tmp = simple_strtoul(val, NULL, 0);
+
+        if ( tmp > 1000 )
+        {
+            printk(XENLOG_WARNING "cpufreq/ondemand: "
+                   "specified bias too high, using 1000\n");
+            tmp = 1000;
+        }
+        dbs_tuners_ins.powersave_bias = tmp;
+    }
+}
+
 struct cpufreq_governor cpufreq_gov_dbs = {
     .name = "ondemand",
     .governor = cpufreq_governor_dbs,
+    .handle_option = cpufreq_dbs_handle_option
 };
+
+static int __init cpufreq_gov_dbs_init(void)
+{
+    return cpufreq_register_governor(&cpufreq_gov_dbs);
+}
+__initcall(cpufreq_gov_dbs_init);
+
+static void __exit cpufreq_gov_dbs_exit(void)
+{
+    cpufreq_unregister_governor(&cpufreq_gov_dbs);
+}
+__exitcall(cpufreq_gov_dbs_exit);
index a26e2518c35d419c87f051fc20ccb264a9abada1..5daffedf593e958b93bffe81d47472e801f4269f 100644 (file)
@@ -36,35 +36,54 @@ struct cpufreq_driver   *cpufreq_driver;
 struct processor_pminfo *__read_mostly processor_pminfo[NR_CPUS];
 struct cpufreq_policy   *__read_mostly cpufreq_cpu_policy[NR_CPUS];
 
+DEFINE_PER_CPU(spinlock_t, cpufreq_statistic_lock) = SPIN_LOCK_UNLOCKED;
+
 /*********************************************************************
  *                    Px STATISTIC INFO                              *
  *********************************************************************/
 
+void cpufreq_residency_update(unsigned int cpu, uint8_t state)
+{
+    uint64_t now, total_idle_ns;
+    int64_t delta;
+    struct pm_px *pxpt = cpufreq_statistic_data[cpu];
+
+    total_idle_ns = get_cpu_idle_time(cpu);
+    now = NOW();
+
+    delta = (now - pxpt->prev_state_wall) - 
+            (total_idle_ns - pxpt->prev_idle_wall);
+
+    if ( likely(delta >= 0) )
+        pxpt->u.pt[state].residency += delta;
+
+    pxpt->prev_state_wall = now;
+    pxpt->prev_idle_wall = total_idle_ns;
+}
+
 void cpufreq_statistic_update(unsigned int cpu, uint8_t from, uint8_t to)
 {
-    uint64_t now;
     struct pm_px *pxpt = cpufreq_statistic_data[cpu];
     struct processor_pminfo *pmpt = processor_pminfo[cpu];
-    uint64_t total_idle_ns;
-    uint64_t tmp_idle_ns;
+    spinlock_t *cpufreq_statistic_lock = 
+               &per_cpu(cpufreq_statistic_lock, cpu);
 
-    if ( !pxpt || !pmpt )
-        return;
+    spin_lock(cpufreq_statistic_lock);
 
-    now = NOW();
-    total_idle_ns = get_cpu_idle_time(cpu);
-    tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
+    if ( !pxpt || !pmpt ) {
+        spin_unlock(cpufreq_statistic_lock);
+        return;
+    }
 
     pxpt->u.last = from;
     pxpt->u.cur = to;
     pxpt->u.pt[to].count++;
-    pxpt->u.pt[from].residency += now - pxpt->prev_state_wall;
-    pxpt->u.pt[from].residency -= tmp_idle_ns;
+
+    cpufreq_residency_update(cpu, from);
 
     (*(pxpt->u.trans_pt + from * pmpt->perf.state_count + to))++;
 
-    pxpt->prev_state_wall = now;
-    pxpt->prev_idle_wall = total_idle_ns;
+    spin_unlock(cpufreq_statistic_lock);
 }
 
 int cpufreq_statistic_init(unsigned int cpuid)
@@ -72,28 +91,41 @@ int cpufreq_statistic_init(unsigned int cpuid)
     uint32_t i, count;
     struct pm_px *pxpt = cpufreq_statistic_data[cpuid];
     const struct processor_pminfo *pmpt = processor_pminfo[cpuid];
-
-    count = pmpt->perf.state_count;
+    spinlock_t *cpufreq_statistic_lock = 
+                          &per_cpu(cpufreq_statistic_lock, cpuid);
 
     if ( !pmpt )
         return -EINVAL;
 
-    if ( !pxpt )
-    {
-        pxpt = xmalloc(struct pm_px);
-        if ( !pxpt )
-            return -ENOMEM;
-        memset(pxpt, 0, sizeof(*pxpt));
-        cpufreq_statistic_data[cpuid] = pxpt;
+    spin_lock(cpufreq_statistic_lock);
+
+    if ( pxpt ) {
+        spin_unlock(cpufreq_statistic_lock);
+        return 0;
+    }
+
+    count = pmpt->perf.state_count;
+
+    pxpt = xmalloc(struct pm_px);
+    if ( !pxpt ) {
+        spin_unlock(cpufreq_statistic_lock);
+        return -ENOMEM;
     }
+    memset(pxpt, 0, sizeof(*pxpt));
+    cpufreq_statistic_data[cpuid] = pxpt;
 
     pxpt->u.trans_pt = xmalloc_array(uint64_t, count * count);
-    if (!pxpt->u.trans_pt)
+    if (!pxpt->u.trans_pt) {
+        xfree(pxpt);
+        spin_unlock(cpufreq_statistic_lock);
         return -ENOMEM;
+    }
 
     pxpt->u.pt = xmalloc_array(struct pm_px_val, count);
     if (!pxpt->u.pt) {
         xfree(pxpt->u.trans_pt);
+        xfree(pxpt);
+        spin_unlock(cpufreq_statistic_lock);
         return -ENOMEM;
     }
 
@@ -109,18 +141,30 @@ int cpufreq_statistic_init(unsigned int cpuid)
     pxpt->prev_state_wall = NOW();
     pxpt->prev_idle_wall = get_cpu_idle_time(cpuid);
 
+    spin_unlock(cpufreq_statistic_lock);
+
     return 0;
 }
 
 void cpufreq_statistic_exit(unsigned int cpuid)
 {
     struct pm_px *pxpt = cpufreq_statistic_data[cpuid];
+    spinlock_t *cpufreq_statistic_lock = 
+               &per_cpu(cpufreq_statistic_lock, cpuid);
+
+    spin_lock(cpufreq_statistic_lock);
 
-    if (!pxpt)
+    if (!pxpt) {
+        spin_unlock(cpufreq_statistic_lock);
         return;
+    }
+
     xfree(pxpt->u.trans_pt);
     xfree(pxpt->u.pt);
-    memset(pxpt, 0, sizeof(struct pm_px));
+    xfree(pxpt);
+    cpufreq_statistic_data[cpuid] = NULL;
+
+    spin_unlock(cpufreq_statistic_lock);
 }
 
 void cpufreq_statistic_reset(unsigned int cpuid)
@@ -128,9 +172,15 @@ void cpufreq_statistic_reset(unsigned int cpuid)
     uint32_t i, j, count;
     struct pm_px *pxpt = cpufreq_statistic_data[cpuid];
     const struct processor_pminfo *pmpt = processor_pminfo[cpuid];
+    spinlock_t *cpufreq_statistic_lock = 
+               &per_cpu(cpufreq_statistic_lock, cpuid);
+
+    spin_lock(cpufreq_statistic_lock);
 
-    if ( !pxpt || !pmpt )
+    if ( !pmpt || !pxpt || !pxpt->u.pt || !pxpt->u.trans_pt ) {
+        spin_unlock(cpufreq_statistic_lock);
         return;
+    }
 
     count = pmpt->perf.state_count;
 
@@ -144,6 +194,8 @@ void cpufreq_statistic_reset(unsigned int cpuid)
 
     pxpt->prev_state_wall = NOW();
     pxpt->prev_idle_wall = get_cpu_idle_time(cpuid);
+
+    spin_unlock(cpufreq_statistic_lock);
 }
 
 
@@ -305,17 +357,23 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
     return retval;
 }
 
-int __cpufreq_driver_getavg(struct cpufreq_policy *policy)
+int cpufreq_driver_getavg(unsigned int cpu, unsigned int flag)
 {
-    int ret = 0;
+    struct cpufreq_policy *policy;
+    int freq_avg;
 
-    if (!policy)
-        return -EINVAL;
+    policy = cpufreq_cpu_policy[cpu];
+    if (!cpu_online(cpu) || !policy)
+        return 0;
 
-    if (cpu_online(policy->cpu) && cpufreq_driver->getavg)
-        ret = cpufreq_driver->getavg(policy->cpu);
+    if (cpufreq_driver->getavg)
+    {
+        freq_avg = cpufreq_driver->getavg(cpu, flag);
+        if (freq_avg > 0)
+            return freq_avg;
+    }
 
-    return ret;
+    return policy->cur;
 }
 
 
@@ -356,10 +414,15 @@ int __cpufreq_set_policy(struct cpufreq_policy *data,
         /* start new governor */
         data->governor = policy->governor;
         if (__cpufreq_governor(data, CPUFREQ_GOV_START)) {
+            printk(KERN_WARNING "Fail change to %s governor\n",
+                                 data->governor->name);
+
             /* new governor failed, so re-start old one */
             if (old_gov) {
                 data->governor = old_gov;
                 __cpufreq_governor(data, CPUFREQ_GOV_START);
+                printk(KERN_WARNING "Still stay at %s governor\n",
+                                     data->governor->name);
             }
             return -EINVAL;
         }
index fc5e80771196fe488c117641848e46ddca2bce08..a950881793d60f8ec495202f732ffa173876b552 100644 (file)
@@ -1,4 +1,5 @@
 subdir-$(x86) += vtd
+subdir-$(ia64) += vtd
 subdir-$(x86) += amd
 
 obj-y += iommu.o
index 0a2081fe39446b0c086514054f70df1548d91362..496faae2c3c9e64207de4d2bc1ed991ba68cd076 100644 (file)
@@ -37,9 +37,6 @@ struct ivrs_mappings *ivrs_mappings;
 struct list_head amd_iommu_head;
 struct table_struct device_table;
 
-extern void *int_remap_table;
-extern spinlock_t int_remap_table_lock;
-
 static int __init map_iommu_mmio_region(struct amd_iommu *iommu)
 {
     unsigned long mfn;
@@ -152,13 +149,33 @@ static void __init set_iommu_translation_control(struct amd_iommu *iommu,
 {
     u32 entry;
 
-    entry = readl(iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
-    set_field_in_reg_u32(iommu->ht_tunnel_support ? IOMMU_CONTROL_ENABLED :
-                         IOMMU_CONTROL_ENABLED, entry,
+    entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET);
+
+    if ( enable )
+    {
+        set_field_in_reg_u32(iommu->ht_tunnel_support ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
                          IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_MASK,
                          IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_SHIFT, &entry);
+        set_field_in_reg_u32(iommu->isochronous ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_CONTROL_ISOCHRONOUS_MASK,
+                         IOMMU_CONTROL_ISOCHRONOUS_SHIFT, &entry);
+        set_field_in_reg_u32(iommu->coherent ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_CONTROL_COHERENT_MASK,
+                         IOMMU_CONTROL_COHERENT_SHIFT, &entry);
+        set_field_in_reg_u32(iommu->res_pass_pw ? IOMMU_CONTROL_ENABLED :
+                         IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_MASK,
+                         IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_SHIFT, &entry);
+        /* do not set PassPW bit */
+        set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, entry,
+                         IOMMU_CONTROL_PASS_POSTED_WRITE_MASK,
+                         IOMMU_CONTROL_PASS_POSTED_WRITE_SHIFT, &entry);
+    }
     set_field_in_reg_u32(enable ? IOMMU_CONTROL_ENABLED :
-                         IOMMU_CONTROL_ENABLED, entry,
+                         IOMMU_CONTROL_DISABLED, entry,
                          IOMMU_CONTROL_TRANSLATION_ENABLE_MASK,
                          IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT, &entry);
     writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
@@ -171,10 +188,14 @@ static void __init set_iommu_command_buffer_control(struct amd_iommu *iommu,
 
     entry = readl(iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
     set_field_in_reg_u32(enable ? IOMMU_CONTROL_ENABLED :
-                         IOMMU_CONTROL_ENABLED, entry,
+                         IOMMU_CONTROL_DISABLED, entry,
                          IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_MASK,
                          IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT, &entry);
     writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
+
+    /*reset head and tail pointer */
+    writel(0x0, iommu->mmio_base + IOMMU_CMD_BUFFER_HEAD_OFFSET);
+    writel(0x0, iommu->mmio_base + IOMMU_CMD_BUFFER_TAIL_OFFSET);
 }
 
 static void __init register_iommu_exclusion_range(struct amd_iommu *iommu)
@@ -235,11 +256,14 @@ static void __init set_iommu_event_log_control(struct amd_iommu *iommu,
                          IOMMU_CONTROL_EVENT_LOG_INT_SHIFT, &entry);
     writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
 
-    set_field_in_reg_u32(enable ? IOMMU_CONTROL_ENABLED :
-                         IOMMU_CONTROL_DISABLED, entry,
+    set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, entry,
                          IOMMU_CONTROL_COMP_WAIT_INT_MASK,
                          IOMMU_CONTROL_COMP_WAIT_INT_SHIFT, &entry);
     writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET);
+
+    /*reset head and tail pointer */
+    writel(0x0, iommu->mmio_base + IOMMU_EVENT_LOG_HEAD_OFFSET);
+    writel(0x0, iommu->mmio_base + IOMMU_EVENT_LOG_TAIL_OFFSET);
 }
 
 static int amd_iommu_read_event_log(struct amd_iommu *iommu, u32 event[])
@@ -391,20 +415,19 @@ static void parse_event_log_entry(u32 entry[])
     u32 code;
     u64 *addr;
     char * event_str[] = {"ILLEGAL_DEV_TABLE_ENTRY",
-                                         "IO_PAGE_FALT",
-                                         "DEV_TABLE_HW_ERROR",
-                                         "PAGE_TABLE_HW_ERROR",
-                                         "ILLEGAL_COMMAND_ERROR",
-                                         "COMMAND_HW_ERROR",
-                                         "IOTLB_INV_TIMEOUT",
-                                         "INVALID_DEV_REQUEST"};
-
-    code = get_field_from_reg_u32(entry[1],
-                                           IOMMU_EVENT_CODE_MASK,
-                                           IOMMU_EVENT_CODE_SHIFT);
-
-    if ( (code > IOMMU_EVENT_INVALID_DEV_REQUEST)
-        || (code < IOMMU_EVENT_ILLEGAL_DEV_TABLE_ENTRY) )
+                          "IO_PAGE_FALT",
+                          "DEV_TABLE_HW_ERROR",
+                          "PAGE_TABLE_HW_ERROR",
+                          "ILLEGAL_COMMAND_ERROR",
+                          "COMMAND_HW_ERROR",
+                          "IOTLB_INV_TIMEOUT",
+                          "INVALID_DEV_REQUEST"};
+
+    code = get_field_from_reg_u32(entry[1], IOMMU_EVENT_CODE_MASK,
+                                            IOMMU_EVENT_CODE_SHIFT);
+
+    if ( (code > IOMMU_EVENT_INVALID_DEV_REQUEST) ||
+        (code < IOMMU_EVENT_ILLEGAL_DEV_TABLE_ENTRY) )
     {
         amd_iov_error("Invalid event log entry!\n");
         return;
@@ -428,13 +451,20 @@ static void parse_event_log_entry(u32 entry[])
 static void amd_iommu_page_fault(int vector, void *dev_id,
                              struct cpu_user_regs *regs)
 {
-    u32  event[4];
+    u32 event[4];
+    u32 entry;
     unsigned long flags;
     int ret = 0;
     struct amd_iommu *iommu = dev_id;
 
     spin_lock_irqsave(&iommu->lock, flags);
     ret = amd_iommu_read_event_log(iommu, event);
+    /* reset interrupt status bit */
+    entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET);
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+                         IOMMU_STATUS_EVENT_LOG_INT_MASK,
+                         IOMMU_STATUS_EVENT_LOG_INT_SHIFT, &entry);
+    writel(entry, iommu->mmio_base+IOMMU_STATUS_MMIO_OFFSET);
     spin_unlock_irqrestore(&iommu->lock, flags);
 
     if ( ret != 0 )
@@ -446,27 +476,29 @@ static int set_iommu_interrupt_handler(struct amd_iommu *iommu)
 {
     int vector, ret;
 
-    vector = assign_irq_vector(AUTO_ASSIGN);
-    vector_to_iommu[vector] = iommu;
-
-    /* make irq == vector */
-    irq_vector[vector] = vector;
-    vector_irq[vector] = vector;
-
-    if ( !vector )
+    vector = assign_irq_vector(AUTO_ASSIGN_IRQ);
+    if ( vector <= 0 )
     {
-        amd_iov_error("no vectors\n");
+        gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
         return 0;
     }
 
     irq_desc[vector].handler = &iommu_msi_type;
-    ret = request_irq(vector, amd_iommu_page_fault, 0, "amd_iommu", iommu);
+    vector_to_iommu[vector] = iommu;
+    ret = request_irq_vector(vector, amd_iommu_page_fault, 0,
+                             "amd_iommu", iommu);
     if ( ret )
     {
+        irq_desc[vector].handler = &no_irq_type;
+        vector_to_iommu[vector] = NULL;
+        free_irq_vector(vector);
         amd_iov_error("can't request irq\n");
         return 0;
     }
 
+    /* Make sure that vector is never re-used. */
+    vector_irq[vector] = NEVER_ASSIGN_IRQ;
+    iommu->vector = vector;
     return vector;
 }
 
@@ -510,10 +542,11 @@ void __init enable_iommu(struct amd_iommu *iommu)
 static void __init deallocate_iommu_table_struct(
     struct table_struct *table)
 {
+    int order = 0;
     if ( table->buffer )
     {
-        free_xenheap_pages(table->buffer,
-            get_order_from_bytes(table->alloc_size));
+        order = get_order_from_bytes(table->alloc_size);
+        __free_amd_iommu_tables(table->buffer, order);
         table->buffer = NULL;
     }
 }
@@ -527,16 +560,19 @@ static void __init deallocate_iommu_tables(struct amd_iommu *iommu)
 static int __init allocate_iommu_table_struct(struct table_struct *table,
                                               const char *name)
 {
-    table->buffer = (void *) alloc_xenheap_pages(
-        get_order_from_bytes(table->alloc_size));
-
-    if ( !table->buffer )
+    int order = 0;
+    if ( table->buffer == NULL )
     {
-        amd_iov_error("Error allocating %s\n", name);
-        return -ENOMEM;
-    }
+        order = get_order_from_bytes(table->alloc_size);
+        table->buffer = __alloc_amd_iommu_tables(order);
 
-    memset(table->buffer, 0, table->alloc_size);
+        if ( table->buffer == NULL )
+        {
+            amd_iov_error("Error allocating %s\n", name);
+            return -ENOMEM;
+        }
+        memset(table->buffer, 0, PAGE_SIZE * (1UL << order));
+    }
     return 0;
 }
 
index e6ade3a10b3f352ef48cdf40112ce84b2fded6c6..c3a9dc81b8d47aa6a35cce1a5e0c313dff543207 100644 (file)
@@ -22,7 +22,8 @@
 #include <asm/amd-iommu.h>
 #include <asm/hvm/svm/amd-iommu-proto.h>
 
-DEFINE_SPINLOCK(int_remap_table_lock);
+#define INTREMAP_TABLE_ORDER    1
+static DEFINE_SPINLOCK(int_remap_table_lock);
 void *int_remap_table = NULL;
 
 static u8 *get_intremap_entry(u8 vector, u8 dm)
@@ -109,18 +110,13 @@ static void update_intremap_entry_from_ioapic(
 
 int __init amd_iommu_setup_intremap_table(void)
 {
-    unsigned long flags;
-
-    spin_lock_irqsave(&int_remap_table_lock, flags);
     if ( int_remap_table == NULL )
-        int_remap_table = (void *)alloc_xenheap_pages(1);
-    if ( !int_remap_table )
     {
-        spin_unlock_irqrestore(&int_remap_table_lock, flags);
-        return -ENOMEM;
+        int_remap_table = __alloc_amd_iommu_tables(INTREMAP_TABLE_ORDER);
+        if ( int_remap_table == NULL )
+            return -ENOMEM;
+        memset(int_remap_table, 0, PAGE_SIZE * (1UL << INTREMAP_TABLE_ORDER));
     }
-    memset((u8*)int_remap_table, 0, PAGE_SIZE*2);
-    spin_unlock_irqrestore(&int_remap_table_lock, flags);
 
     return 0;
 }
@@ -206,15 +202,11 @@ void amd_iommu_msi_msg_update_ire(
 
 int __init deallocate_intremap_table(void)
 {
-    unsigned long flags;
-
-    spin_lock_irqsave(&int_remap_table_lock, flags);
     if ( int_remap_table )
     {
-        free_xenheap_pages(int_remap_table, 1);
+        __free_amd_iommu_tables(int_remap_table, INTREMAP_TABLE_ORDER);
         int_remap_table = NULL;
     }
-    spin_unlock_irqrestore(&int_remap_table_lock, flags);
 
     return 0;
 }
index a41fe608908cb3c0a997ff58c25a3fba519f7807..352c52a182f2d5b19dfea13328cf72552c52dab6 100644 (file)
@@ -159,21 +159,39 @@ void flush_command_buffer(struct amd_iommu *iommu)
     }
 }
 
-static void clear_page_table_entry_present(u32 *pte)
+static void clear_iommu_l1e_present(u64 l2e, unsigned long gfn)
 {
-    set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, pte[0],
-                         IOMMU_PTE_PRESENT_MASK,
-                         IOMMU_PTE_PRESENT_SHIFT, &pte[0]);
+    u32 *l1e;
+    int offset;
+    void *l1_table;
+
+    l1_table = map_domain_page(l2e >> PAGE_SHIFT);
+
+    offset = gfn & (~PTE_PER_TABLE_MASK);
+    l1e = (u32*)(l1_table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE));
+
+    /* clear l1 entry */
+    l1e[0] = l1e[1] = 0;
+
+    unmap_domain_page(l1_table);
 }
 
-static void set_page_table_entry_present(u32 *pte, u64 page_addr,
-                                         int iw, int ir)
+static void set_iommu_l1e_present(u64 l2e, unsigned long gfn,
+                                 u64 maddr, int iw, int ir)
 {
     u64 addr_lo, addr_hi;
     u32 entry;
+    void *l1_table;
+    int offset;
+    u32 *l1e;
+
+    l1_table = map_domain_page(l2e >> PAGE_SHIFT);
 
-    addr_lo = page_addr & DMA_32BIT_MASK;
-    addr_hi = page_addr >> 32;
+    offset = gfn & (~PTE_PER_TABLE_MASK);
+    l1e = (u32*)((u8*)l1_table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE));
+
+    addr_lo = maddr & DMA_32BIT_MASK;
+    addr_hi = maddr >> 32;
 
     set_field_in_reg_u32((u32)addr_hi, 0,
                          IOMMU_PTE_ADDR_HIGH_MASK,
@@ -186,7 +204,7 @@ static void set_page_table_entry_present(u32 *pte, u64 page_addr,
                          IOMMU_CONTROL_DISABLED, entry,
                          IOMMU_PTE_IO_READ_PERMISSION_MASK,
                          IOMMU_PTE_IO_READ_PERMISSION_SHIFT, &entry);
-    pte[1] = entry;
+    l1e[1] = entry;
 
     set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
                          IOMMU_PTE_ADDR_LOW_MASK,
@@ -197,9 +215,10 @@ static void set_page_table_entry_present(u32 *pte, u64 page_addr,
     set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
                          IOMMU_PTE_PRESENT_MASK,
                          IOMMU_PTE_PRESENT_SHIFT, &entry);
-    pte[0] = entry;
-}
+    l1e[0] = entry;
 
+    unmap_domain_page(l1_table);
+}
 
 static void amd_iommu_set_page_directory_entry(u32 *pde, 
                                                u64 next_ptr, u8 next_level)
@@ -327,7 +346,7 @@ void amd_iommu_set_dev_table_entry(u32 *dte, u64 root_ptr, u64 intremap_ptr,
     dte[0] = entry;
 }
 
-void *amd_iommu_get_vptr_from_page_table_entry(u32 *entry)
+u64 amd_iommu_get_next_table_from_pte(u32 *entry)
 {
     u64 addr_lo, addr_hi, ptr;
 
@@ -342,7 +361,7 @@ void *amd_iommu_get_vptr_from_page_table_entry(u32 *entry)
         IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT);
 
     ptr = (addr_hi << 32) | (addr_lo << PAGE_SHIFT);
-    return ptr ? maddr_to_virt((unsigned long)ptr) : NULL;
+    return ptr;
 }
 
 static int amd_iommu_is_pte_present(u32 *entry)
@@ -381,119 +400,112 @@ int amd_iommu_is_dte_page_translation_valid(u32 *entry)
                                    IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT));
 }
 
-static void *get_pte_from_page_tables(void *table, int level,
-                                      unsigned long io_pfn)
+static u64 iommu_l2e_from_pfn(struct page_info *table, int level,
+                              unsigned long io_pfn)
 {
     unsigned long offset;
     void *pde = NULL;
+    void *table_vaddr;
+    u64 next_table_maddr = 0;
 
-    BUG_ON(table == NULL);
+    BUG_ON( table == NULL || level == 0 );
 
-    while ( level > 0 )
+    while ( level > 1 )
     {
         offset = io_pfn >> ((PTE_PER_TABLE_SHIFT *
                              (level - IOMMU_PAGING_MODE_LEVEL_1)));
         offset &= ~PTE_PER_TABLE_MASK;
-        pde = table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE);
 
-        if ( level == 1 )
-            break;
-        if ( !pde )
-            return NULL;
+        table_vaddr = map_domain_page(page_to_mfn(table));
+        pde = table_vaddr + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE);
+        next_table_maddr = amd_iommu_get_next_table_from_pte(pde);
+
         if ( !amd_iommu_is_pte_present(pde) )
         {
-            void *next_table = alloc_xenheap_page();
-            if ( next_table == NULL )
-                return NULL;
-            memset(next_table, 0, PAGE_SIZE);
-            if ( *(u64 *)pde == 0 )
+            if ( next_table_maddr == 0 )
             {
-                unsigned long next_ptr = (u64)virt_to_maddr(next_table);
+                table = alloc_amd_iommu_pgtable();
+                if ( table == NULL )
+                    return 0;
+                next_table_maddr = page_to_maddr(table);
                 amd_iommu_set_page_directory_entry(
-                    (u32 *)pde, next_ptr, level - 1);
-            }
-            else
-            {
-                free_xenheap_page(next_table);
+                    (u32 *)pde, next_table_maddr, level - 1);
             }
+            else /* should never reach here */
+                return 0;
         }
-        table = amd_iommu_get_vptr_from_page_table_entry(pde);
+
+        unmap_domain_page(table_vaddr);
+        table = maddr_to_page(next_table_maddr);
         level--;
     }
 
-    return pde;
+    return next_table_maddr;
 }
 
 int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn)
 {
-    void *pte;
-    unsigned long flags;
-    u64 maddr;
+    u64 iommu_l2e;
     struct hvm_iommu *hd = domain_hvm_iommu(d);
     int iw = IOMMU_IO_WRITE_ENABLED;
     int ir = IOMMU_IO_READ_ENABLED;
 
     BUG_ON( !hd->root_table );
 
-    spin_lock_irqsave(&hd->mapping_lock, flags);
+    spin_lock(&hd->mapping_lock);
 
     if ( is_hvm_domain(d) && !hd->p2m_synchronized )
         goto out;
 
-    maddr = (u64)mfn << PAGE_SHIFT;
-    pte = get_pte_from_page_tables(hd->root_table, hd->paging_mode, gfn);
-    if ( pte == NULL )
+    iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn);
+    if ( iommu_l2e == 0 )
     {
+        spin_unlock(&hd->mapping_lock);
         amd_iov_error("Invalid IO pagetable entry gfn = %lx\n", gfn);
-        spin_unlock_irqrestore(&hd->mapping_lock, flags);
         return -EFAULT;
     }
+    set_iommu_l1e_present(iommu_l2e, gfn, (u64)mfn << PAGE_SHIFT, iw, ir);
 
-    set_page_table_entry_present((u32 *)pte, maddr, iw, ir);
 out:
-    spin_unlock_irqrestore(&hd->mapping_lock, flags);
+    spin_unlock(&hd->mapping_lock);
     return 0;
 }
 
 int amd_iommu_unmap_page(struct domain *d, unsigned long gfn)
 {
-    void *pte;
+    u64 iommu_l2e;
     unsigned long flags;
-    u64 io_addr = gfn;
-    int requestor_id;
     struct amd_iommu *iommu;
     struct hvm_iommu *hd = domain_hvm_iommu(d);
 
     BUG_ON( !hd->root_table );
 
-    spin_lock_irqsave(&hd->mapping_lock, flags);
+    spin_lock(&hd->mapping_lock);
 
     if ( is_hvm_domain(d) && !hd->p2m_synchronized )
     {
-        spin_unlock_irqrestore(&hd->mapping_lock, flags);
+        spin_unlock(&hd->mapping_lock);
         return 0;
     }
 
-    requestor_id = hd->domain_id;
-    io_addr = (u64)gfn << PAGE_SHIFT;
+    iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn);
 
-    pte = get_pte_from_page_tables(hd->root_table, hd->paging_mode, gfn);
-    if ( pte == NULL )
+    if ( iommu_l2e == 0 )
     {
+        spin_unlock(&hd->mapping_lock);
         amd_iov_error("Invalid IO pagetable entry gfn = %lx\n", gfn);
-        spin_unlock_irqrestore(&hd->mapping_lock, flags);
         return -EFAULT;
     }
 
     /* mark PTE as 'page not present' */
-    clear_page_table_entry_present((u32 *)pte);
-    spin_unlock_irqrestore(&hd->mapping_lock, flags);
+    clear_iommu_l1e_present(iommu_l2e, gfn);
+    spin_unlock(&hd->mapping_lock);
 
     /* send INVALIDATE_IOMMU_PAGES command */
     for_each_amd_iommu ( iommu )
     {
         spin_lock_irqsave(&iommu->lock, flags);
-        invalidate_iommu_page(iommu, io_addr, requestor_id);
+        invalidate_iommu_page(iommu, (u64)gfn << PAGE_SHIFT, hd->domain_id);
         flush_command_buffer(iommu);
         spin_unlock_irqrestore(&iommu->lock, flags);
     }
@@ -506,38 +518,39 @@ int amd_iommu_reserve_domain_unity_map(
     unsigned long phys_addr,
     unsigned long size, int iw, int ir)
 {
-    unsigned long flags, npages, i;
-    void *pte;
+    u64 iommu_l2e;
+    unsigned long npages, i;
     struct hvm_iommu *hd = domain_hvm_iommu(domain);
 
     npages = region_to_pages(phys_addr, size);
 
-    spin_lock_irqsave(&hd->mapping_lock, flags);
+    spin_lock(&hd->mapping_lock);
     for ( i = 0; i < npages; ++i )
     {
-        pte = get_pte_from_page_tables(
+        iommu_l2e = iommu_l2e_from_pfn(
             hd->root_table, hd->paging_mode, phys_addr >> PAGE_SHIFT);
-        if ( pte == NULL )
+
+        if ( iommu_l2e == 0 )
         {
-            amd_iov_error(
-            "Invalid IO pagetable entry phys_addr = %lx\n", phys_addr);
-            spin_unlock_irqrestore(&hd->mapping_lock, flags);
+            spin_unlock(&hd->mapping_lock);
+            amd_iov_error("Invalid IO pagetable entry phys_addr = %lx\n",
+                          phys_addr);
             return -EFAULT;
         }
-        set_page_table_entry_present((u32 *)pte,
-                                     phys_addr, iw, ir);
+
+        set_iommu_l1e_present(iommu_l2e,
+            (phys_addr >> PAGE_SHIFT), phys_addr, iw, ir);
+
         phys_addr += PAGE_SIZE;
     }
-    spin_unlock_irqrestore(&hd->mapping_lock, flags);
+    spin_unlock(&hd->mapping_lock);
     return 0;
 }
 
 int amd_iommu_sync_p2m(struct domain *d)
 {
-    unsigned long mfn, gfn, flags;
-    void *pte;
-    u64 maddr;
-    struct list_head *entry;
+    unsigned long mfn, gfn;
+    u64 iommu_l2e;
     struct page_info *page;
     struct hvm_iommu *hd;
     int iw = IOMMU_IO_WRITE_ENABLED;
@@ -548,35 +561,83 @@ int amd_iommu_sync_p2m(struct domain *d)
 
     hd = domain_hvm_iommu(d);
 
-    spin_lock_irqsave(&hd->mapping_lock, flags);
+    spin_lock(&hd->mapping_lock);
 
     if ( hd->p2m_synchronized )
         goto out;
 
-    for ( entry = d->page_list.next; entry != &d->page_list;
-            entry = entry->next )
+    spin_lock(&d->page_alloc_lock);
+
+    page_list_for_each ( page, &d->page_list )
     {
-        page = list_entry(entry, struct page_info, list);
         mfn = page_to_mfn(page);
         gfn = get_gpfn_from_mfn(mfn);
 
         if ( gfn == INVALID_M2P_ENTRY )
             continue;
 
-        maddr = (u64)mfn << PAGE_SHIFT;
-        pte = get_pte_from_page_tables(hd->root_table, hd->paging_mode, gfn);
-        if ( pte == NULL )
+        iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn);
+
+        if ( iommu_l2e == 0 )
         {
+            spin_unlock(&d->page_alloc_lock);
+            spin_unlock(&hd->mapping_lock);
             amd_iov_error("Invalid IO pagetable entry gfn = %lx\n", gfn);
-            spin_unlock_irqrestore(&hd->mapping_lock, flags);
             return -EFAULT;
         }
-        set_page_table_entry_present((u32 *)pte, maddr, iw, ir);
+
+        set_iommu_l1e_present(iommu_l2e, gfn, (u64)mfn << PAGE_SHIFT, iw, ir);
     }
 
+    spin_unlock(&d->page_alloc_lock);
+
     hd->p2m_synchronized = 1;
 
 out:
-    spin_unlock_irqrestore(&hd->mapping_lock, flags);
+    spin_unlock(&hd->mapping_lock);
     return 0;
 }
+
+void invalidate_all_iommu_pages(struct domain *d)
+{
+    u32 cmd[4], entry;
+    unsigned long flags;
+    struct amd_iommu *iommu;
+    int domain_id = d->domain_id;
+    u64 addr_lo = 0x7FFFFFFFFFFFF000ULL & DMA_32BIT_MASK;
+    u64 addr_hi = 0x7FFFFFFFFFFFF000ULL >> 32;
+
+    set_field_in_reg_u32(domain_id, 0,
+                         IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_MASK,
+                         IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_SHIFT, &entry);
+    set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_IOMMU_PAGES, entry,
+                         IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT,
+                         &entry);
+    cmd[1] = entry;
+
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0,
+                         IOMMU_INV_IOMMU_PAGES_S_FLAG_MASK,
+                         IOMMU_INV_IOMMU_PAGES_S_FLAG_SHIFT, &entry);
+    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+                         IOMMU_INV_IOMMU_PAGES_PDE_FLAG_MASK,
+                         IOMMU_INV_IOMMU_PAGES_PDE_FLAG_SHIFT, &entry);
+    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, entry,
+                         IOMMU_INV_IOMMU_PAGES_ADDR_LOW_MASK,
+                         IOMMU_INV_IOMMU_PAGES_ADDR_LOW_SHIFT, &entry);
+    cmd[2] = entry;
+
+    set_field_in_reg_u32((u32)addr_hi, 0,
+                         IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_MASK,
+                         IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_SHIFT, &entry);
+    cmd[3] = entry;
+
+    cmd[0] = 0;
+
+    for_each_amd_iommu ( iommu )
+    {
+        spin_lock_irqsave(&iommu->lock, flags);
+        send_iommu_command(iommu, cmd);
+        flush_command_buffer(iommu);
+        spin_unlock_irqrestore(&iommu->lock, flags);
+    }
+}
index 8e982455cb0a24069a0927da5071aad5b4836ce6..e23f5e8e2f2a836cbcc447f1e425c528655f2b08 100644 (file)
 #include <xen/pci_regs.h>
 #include <asm/amd-iommu.h>
 #include <asm/hvm/svm/amd-iommu-proto.h>
-#include <asm/mm.h>
 
 extern unsigned short ivrs_bdf_entries;
 extern struct ivrs_mappings *ivrs_mappings;
 extern void *int_remap_table;
 
-static void deallocate_domain_page_tables(struct hvm_iommu *hd)
-{
-    if ( hd->root_table )
-        free_xenheap_page(hd->root_table);
-}
-
-static void deallocate_domain_resources(struct hvm_iommu *hd)
-{
-    deallocate_domain_page_tables(hd);
-}
-
 int __init amd_iommu_init(void)
 {
     struct amd_iommu *iommu;
@@ -79,8 +67,6 @@ static void amd_iommu_setup_domain_device(
     struct domain *domain, struct amd_iommu *iommu, int bdf)
 {
     void *dte;
-    u64 root_ptr;
-    u64 intremap_ptr;
     unsigned long flags;
     int req_id;
     u8 sys_mgt, dev_ex;
@@ -88,22 +74,21 @@ static void amd_iommu_setup_domain_device(
 
     BUG_ON( !hd->root_table || !hd->paging_mode || !int_remap_table );
 
-    root_ptr = (u64)virt_to_maddr(hd->root_table);
     /* get device-table entry */
     req_id = ivrs_mappings[bdf].dte_requestor_id;
-    dte = iommu->dev_table.buffer +
-        (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
+    dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
 
-    intremap_ptr = (u64)virt_to_maddr(int_remap_table);
+    spin_lock_irqsave(&iommu->lock, flags);
 
     if ( !amd_iommu_is_dte_page_translation_valid((u32 *)dte) )
     {
-        spin_lock_irqsave(&iommu->lock, flags); 
-
         /* bind DTE to domain page-tables */
         sys_mgt = ivrs_mappings[req_id].dte_sys_mgt_enable;
         dev_ex = ivrs_mappings[req_id].dte_allow_exclusion;
-        amd_iommu_set_dev_table_entry((u32 *)dte, root_ptr, intremap_ptr,
+
+        amd_iommu_set_dev_table_entry((u32 *)dte,
+                                      page_to_maddr(hd->root_table),
+                                      virt_to_maddr(int_remap_table),
                                       hd->domain_id, sys_mgt, dev_ex,
                                       hd->paging_mode);
 
@@ -111,11 +96,15 @@ static void amd_iommu_setup_domain_device(
         invalidate_interrupt_table(iommu, req_id);
         flush_command_buffer(iommu);
         amd_iov_info("Enable DTE:0x%x, "
-                "root_ptr:%"PRIx64", domain_id:%d, paging_mode:%d\n",
-                req_id, root_ptr, hd->domain_id, hd->paging_mode);
-
-        spin_unlock_irqrestore(&iommu->lock, flags);
+                "root_table:%"PRIx64", interrupt_table:%"PRIx64", "
+                "domain_id:%d, paging_mode:%d\n",
+                req_id, (u64)page_to_maddr(hd->root_table),
+                (u64)virt_to_maddr(int_remap_table), hd->domain_id,
+                hd->paging_mode);
     }
+
+    spin_unlock_irqrestore(&iommu->lock, flags);
+
 }
 
 static void amd_iommu_setup_dom0_devices(struct domain *d)
@@ -126,7 +115,7 @@ static void amd_iommu_setup_dom0_devices(struct domain *d)
     u32 l;
     int bdf;
 
-    write_lock(&pcidevs_lock);
+    spin_lock(&pcidevs_lock);
     for ( bus = 0; bus < 256; bus++ )
     {
         for ( dev = 0; dev < 32; dev++ )
@@ -153,7 +142,7 @@ static void amd_iommu_setup_dom0_devices(struct domain *d)
             }
         }
     }
-    write_unlock(&pcidevs_lock);
+    spin_unlock(&pcidevs_lock);
 }
 
 int amd_iov_detect(void)
@@ -183,23 +172,18 @@ int amd_iov_detect(void)
 static int allocate_domain_resources(struct hvm_iommu *hd)
 {
     /* allocate root table */
-    unsigned long flags;
-
-    spin_lock_irqsave(&hd->mapping_lock, flags);
+    spin_lock(&hd->mapping_lock);
     if ( !hd->root_table )
     {
-        hd->root_table = (void *)alloc_xenheap_page();
+        hd->root_table = alloc_amd_iommu_pgtable();
         if ( !hd->root_table )
-            goto error_out;
-        memset((u8*)hd->root_table, 0, PAGE_SIZE);
+        {
+            spin_unlock(&hd->mapping_lock);
+            return -ENOMEM;
+        }
     }
-    spin_unlock_irqrestore(&hd->mapping_lock, flags);
-
+    spin_unlock(&hd->mapping_lock);
     return 0;
-
- error_out:
-    spin_unlock_irqrestore(&hd->mapping_lock, flags);
-    return -ENOMEM;
 }
 
 static int get_paging_mode(unsigned long entries)
@@ -228,7 +212,8 @@ static int amd_iommu_domain_init(struct domain *domain)
     /* allocate page directroy */
     if ( allocate_domain_resources(hd) != 0 )
     {
-        deallocate_domain_resources(hd);
+        if ( hd->root_table )
+            free_domheap_page(hd->root_table);
         return -ENOMEM;
     }
 
@@ -258,12 +243,11 @@ static void amd_iommu_disable_domain_device(
     int req_id;
 
     req_id = ivrs_mappings[bdf].dte_requestor_id;
-    dte = iommu->dev_table.buffer +
-        (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
+    dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
 
+    spin_lock_irqsave(&iommu->lock, flags); 
     if ( amd_iommu_is_dte_page_translation_valid((u32 *)dte) )
     {
-        spin_lock_irqsave(&iommu->lock, flags); 
         memset (dte, 0, IOMMU_DEV_TABLE_ENTRY_SIZE);
         invalidate_dev_table_entry(iommu, req_id);
         flush_command_buffer(iommu);
@@ -271,8 +255,8 @@ static void amd_iommu_disable_domain_device(
                 " domain_id:%d, paging_mode:%d\n",
                 req_id,  domain_hvm_iommu(domain)->domain_id,
                 domain_hvm_iommu(domain)->paging_mode);
-        spin_unlock_irqrestore(&iommu->lock, flags);
     }
+    spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
 static int reassign_device( struct domain *source, struct domain *target,
@@ -282,29 +266,27 @@ static int reassign_device( struct domain *source, struct domain *target,
     struct amd_iommu *iommu;
     int bdf;
 
-    pdev = pci_lock_domain_pdev(source, bus, devfn);
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev_by_domain(source, bus, devfn);
     if ( !pdev )
-       return -ENODEV;
+        return -ENODEV;
 
     bdf = (bus << 8) | devfn;
     /* supported device? */
     iommu = (bdf < ivrs_bdf_entries) ?
-       find_iommu_for_device(bus, pdev->devfn) : NULL;
+    find_iommu_for_device(bus, pdev->devfn) : NULL;
 
     if ( !iommu )
     {
-       spin_unlock(&pdev->lock);
-       amd_iov_error("Fail to find iommu."
-                     " %x:%x.%x cannot be assigned to domain %d\n", 
-                     bus, PCI_SLOT(devfn), PCI_FUNC(devfn), target->domain_id);
-       return -ENODEV;
+        amd_iov_error("Fail to find iommu."
+            " %x:%x.%x cannot be assigned to domain %d\n", 
+            bus, PCI_SLOT(devfn), PCI_FUNC(devfn), target->domain_id);
+        return -ENODEV;
     }
 
     amd_iommu_disable_domain_device(source, iommu, bdf);
 
-    write_lock(&pcidevs_lock);
     list_move(&pdev->domain_list, &target->arch.pdev_list);
-    write_unlock(&pcidevs_lock);
     pdev->domain = target;
 
     amd_iommu_setup_domain_device(target, iommu, bdf);
@@ -312,7 +294,6 @@ static int reassign_device( struct domain *source, struct domain *target,
                  bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
                  source->domain_id, target->domain_id);
 
-    spin_unlock(&pdev->lock);
     return 0;
 }
 
@@ -336,59 +317,50 @@ static int amd_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
     return reassign_device(dom0, d, bus, devfn);
 }
 
-static void deallocate_next_page_table(void *table, unsigned long index,
-                                       int level)
+static void deallocate_next_page_table(struct page_info* pg, int level)
 {
-    unsigned long next_index;
-    void *next_table, *pde;
-    int next_level;
+    void *table_vaddr, *pde;
+    u64 next_table_maddr;
+    int index;
 
-    pde = table + (index * IOMMU_PAGE_TABLE_ENTRY_SIZE);
-    next_table = amd_iommu_get_vptr_from_page_table_entry((u32 *)pde);
+    table_vaddr = map_domain_page(page_to_mfn(pg));
 
-    if ( next_table )
+    if ( level > 1 )
     {
-        next_level = level - 1;
-        if ( next_level > 1 )
+        for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ )
         {
-            next_index = 0;
-            do
+            pde = table_vaddr + (index * IOMMU_PAGE_TABLE_ENTRY_SIZE);
+            next_table_maddr = amd_iommu_get_next_table_from_pte(pde);
+            if ( next_table_maddr != 0 )
             {
-                deallocate_next_page_table(next_table,
-                                           next_index, next_level);
-                next_index++;
-            } while (next_index < PTE_PER_TABLE_SIZE);
+                deallocate_next_page_table(
+                    maddr_to_page(next_table_maddr), level - 1);
+            }
         }
-
-        free_xenheap_page(next_table);
     }
+
+    unmap_domain_page(table_vaddr);
+    free_amd_iommu_pgtable(pg);
 }
 
 static void deallocate_iommu_page_tables(struct domain *d)
 {
-    unsigned long index;
     struct hvm_iommu *hd  = domain_hvm_iommu(d);
 
-    if ( hd ->root_table )
+    spin_lock(&hd->mapping_lock);
+    if ( hd->root_table )
     {
-        index = 0;
-
-        do
-        {
-            deallocate_next_page_table(hd->root_table,
-                                       index, hd->paging_mode);
-            index++;
-        } while ( index < PTE_PER_TABLE_SIZE );
-
-        free_xenheap_page(hd ->root_table);
+        deallocate_next_page_table(hd->root_table, hd->paging_mode);
+        hd->root_table = NULL;
     }
-
-    hd ->root_table = NULL;
+    spin_unlock(&hd->mapping_lock);
 }
 
+
 static void amd_iommu_domain_destroy(struct domain *d)
 {
     deallocate_iommu_page_tables(d);
+    invalidate_all_iommu_pages(d);
 }
 
 static int amd_iommu_return_device(
index d5a337ab084ff1bc68ce3d62c67c0677e9e379be..bdb0d4606abce5c8d08ea2bbb4bbb808533a5d3d 100644 (file)
 
 #include <xen/event.h>
 #include <xen/iommu.h>
+#include <asm/hvm/irq.h>
+#include <asm/hvm/iommu.h>
+#include <xen/hvm/irq.h>
+
+static int pt_irq_need_timer(uint32_t flags)
+{
+    return !(flags & (HVM_IRQ_DPCI_GUEST_MSI | HVM_IRQ_DPCI_TRANSLATE));
+}
 
 static void pt_irq_time_out(void *data)
 {
@@ -28,7 +36,11 @@ static void pt_irq_time_out(void *data)
     int vector;
     struct hvm_irq_dpci *dpci = NULL;
     struct dev_intx_gsi_link *digl;
+    struct hvm_girq_dpci_mapping *girq;
     uint32_t device, intx;
+    DECLARE_BITMAP(machine_gsi_map, NR_IRQS);
+
+    bitmap_zero(machine_gsi_map, NR_IRQS);
 
     spin_lock(&irq_map->dom->event_lock);
 
@@ -37,17 +49,35 @@ static void pt_irq_time_out(void *data)
     list_for_each_entry ( digl, &irq_map->digl_list, list )
     {
         guest_gsi = digl->gsi;
-        machine_gsi = dpci->girq[guest_gsi].machine_gsi;
+        list_for_each_entry ( girq, &dpci->girq[guest_gsi], list )
+        {
+            machine_gsi = girq->machine_gsi;
+            set_bit(machine_gsi, machine_gsi_map);
+        }
         device = digl->device;
         intx = digl->intx;
         hvm_pci_intx_deassert(irq_map->dom, device, intx);
     }
 
-    clear_bit(machine_gsi, dpci->dirq_mask);
-    vector = domain_irq_to_vector(irq_map->dom, machine_gsi);
-    dpci->mirq[machine_gsi].pending = 0;
+    for ( machine_gsi = find_first_bit(machine_gsi_map, NR_IRQS);
+          machine_gsi < NR_IRQS;
+          machine_gsi = find_next_bit(machine_gsi_map, NR_IRQS,
+                                      machine_gsi + 1) )
+    {
+        clear_bit(machine_gsi, dpci->dirq_mask);
+        vector = domain_irq_to_vector(irq_map->dom, machine_gsi);
+        dpci->mirq[machine_gsi].pending = 0;
+    }
+
     spin_unlock(&irq_map->dom->event_lock);
-    pirq_guest_eoi(irq_map->dom, machine_gsi);
+
+    for ( machine_gsi = find_first_bit(machine_gsi_map, NR_IRQS);
+          machine_gsi < NR_IRQS;
+          machine_gsi = find_next_bit(machine_gsi_map, NR_IRQS,
+                                      machine_gsi + 1) )
+    {
+        pirq_guest_eoi(irq_map->dom, machine_gsi);
+    }
 }
 
 int pt_irq_create_bind_vtd(
@@ -57,9 +87,10 @@ int pt_irq_create_bind_vtd(
     uint32_t machine_gsi, guest_gsi;
     uint32_t device, intx, link;
     struct dev_intx_gsi_link *digl;
-    int pirq = pt_irq_bind->machine_irq;
+    struct hvm_girq_dpci_mapping *girq;
+    int rc, pirq = pt_irq_bind->machine_irq;
 
-    if ( pirq < 0 || pirq >= NR_PIRQS )
+    if ( pirq < 0 || pirq >= NR_IRQS )
         return -EINVAL;
 
     spin_lock(&d->event_lock);
@@ -75,14 +106,17 @@ int pt_irq_create_bind_vtd(
         }
         memset(hvm_irq_dpci, 0, sizeof(*hvm_irq_dpci));
         for ( int i = 0; i < NR_IRQS; i++ )
+        {
             INIT_LIST_HEAD(&hvm_irq_dpci->mirq[i].digl_list);
-    }
+            INIT_LIST_HEAD(&hvm_irq_dpci->girq[i]);
+        }
 
-    if ( domain_set_irq_dpci(d, hvm_irq_dpci) == 0 )
-    {
-        xfree(hvm_irq_dpci);
-        spin_unlock(&d->event_lock);
-        return -EINVAL;
+        if ( domain_set_irq_dpci(d, hvm_irq_dpci) == 0 )
+        {
+            spin_unlock(&d->event_lock);
+            xfree(hvm_irq_dpci);
+            return -EINVAL;
+        }
     }
 
     if ( pt_irq_bind->irq_type == PT_IRQ_TYPE_MSI )
@@ -90,12 +124,29 @@ int pt_irq_create_bind_vtd(
 
         if ( !test_and_set_bit(pirq, hvm_irq_dpci->mapping))
         {
-            set_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags);
+            hvm_irq_dpci->mirq[pirq].flags = HVM_IRQ_DPCI_MACH_MSI |
+                                             HVM_IRQ_DPCI_GUEST_MSI;
             hvm_irq_dpci->mirq[pirq].gmsi.gvec = pt_irq_bind->u.msi.gvec;
             hvm_irq_dpci->mirq[pirq].gmsi.gflags = pt_irq_bind->u.msi.gflags;
             hvm_irq_dpci->msi_gvec_pirq[pt_irq_bind->u.msi.gvec] = pirq;
             /* bind after hvm_irq_dpci is setup to avoid race with irq handler*/
-            pirq_guest_bind(d->vcpu[0], pirq, 0);
+            rc = pirq_guest_bind(d->vcpu[0], pirq, 0);
+            if ( rc == 0 && pt_irq_bind->u.msi.gtable )
+            {
+                rc = msixtbl_pt_register(d, pirq, pt_irq_bind->u.msi.gtable);
+                if ( unlikely(rc) )
+                    pirq_guest_unbind(d, pirq);
+            }
+            if ( unlikely(rc) )
+            {
+                hvm_irq_dpci->msi_gvec_pirq[pt_irq_bind->u.msi.gvec] = 0;
+                hvm_irq_dpci->mirq[pirq].gmsi.gflags = 0;
+                hvm_irq_dpci->mirq[pirq].gmsi.gvec = 0;
+                hvm_irq_dpci->mirq[pirq].flags = 0;
+                clear_bit(pirq, hvm_irq_dpci->mapping);
+                spin_unlock(&d->event_lock);
+                return rc;
+            }
         }
         else if (hvm_irq_dpci->mirq[pirq].gmsi.gvec != pt_irq_bind->u.msi.gvec
                 ||hvm_irq_dpci->msi_gvec_pirq[pt_irq_bind->u.msi.gvec] != pirq)
@@ -121,6 +172,14 @@ int pt_irq_create_bind_vtd(
             return -ENOMEM;
         }
 
+        girq = xmalloc(struct hvm_girq_dpci_mapping);
+        if ( !girq )
+        {
+            xfree(digl);
+            spin_unlock(&d->event_lock);
+            return -ENOMEM;
+        }
+
         digl->device = device;
         digl->intx = intx;
         digl->gsi = guest_gsi;
@@ -128,21 +187,52 @@ int pt_irq_create_bind_vtd(
         list_add_tail(&digl->list,
                       &hvm_irq_dpci->mirq[machine_gsi].digl_list);
 
-        hvm_irq_dpci->girq[guest_gsi].valid = 1;
-        hvm_irq_dpci->girq[guest_gsi].device = device;
-        hvm_irq_dpci->girq[guest_gsi].intx = intx;
-        hvm_irq_dpci->girq[guest_gsi].machine_gsi = machine_gsi;
+        girq->device = device;
+        girq->intx = intx;
+        girq->machine_gsi = machine_gsi;
+        list_add_tail(&girq->list, &hvm_irq_dpci->girq[guest_gsi]);
 
         /* Bind the same mirq once in the same domain */
         if ( !test_and_set_bit(machine_gsi, hvm_irq_dpci->mapping))
         {
+            unsigned int vector = domain_irq_to_vector(d, machine_gsi);
+            unsigned int share;
+
             hvm_irq_dpci->mirq[machine_gsi].dom = d;
+            if ( pt_irq_bind->irq_type == PT_IRQ_TYPE_MSI_TRANSLATE )
+            {
+                hvm_irq_dpci->mirq[machine_gsi].flags = HVM_IRQ_DPCI_MACH_MSI |
+                                                        HVM_IRQ_DPCI_GUEST_PCI |
+                                                        HVM_IRQ_DPCI_TRANSLATE;
+                share = 0;
+            }
+            else    /* PT_IRQ_TYPE_PCI */
+            {
+                hvm_irq_dpci->mirq[machine_gsi].flags = HVM_IRQ_DPCI_MACH_PCI |
+                                                        HVM_IRQ_DPCI_GUEST_PCI;
+                share = BIND_PIRQ__WILL_SHARE;
+            }
 
             /* Init timer before binding */
-            init_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, machine_gsi)],
-                       pt_irq_time_out, &hvm_irq_dpci->mirq[machine_gsi], 0);
+            if ( pt_irq_need_timer(hvm_irq_dpci->mirq[machine_gsi].flags) )
+                init_timer(&hvm_irq_dpci->hvm_timer[vector],
+                           pt_irq_time_out, &hvm_irq_dpci->mirq[machine_gsi], 0);
             /* Deal with gsi for legacy devices */
-            pirq_guest_bind(d->vcpu[0], machine_gsi, BIND_PIRQ__WILL_SHARE);
+            rc = pirq_guest_bind(d->vcpu[0], machine_gsi, share);
+            if ( unlikely(rc) )
+            {
+                if ( pt_irq_need_timer(hvm_irq_dpci->mirq[machine_gsi].flags) )
+                    kill_timer(&hvm_irq_dpci->hvm_timer[vector]);
+                hvm_irq_dpci->mirq[machine_gsi].dom = NULL;
+                clear_bit(machine_gsi, hvm_irq_dpci->mapping);
+                list_del(&girq->list);
+                xfree(girq);
+                list_del(&digl->list);
+                hvm_irq_dpci->link_cnt[link]--;
+                spin_unlock(&d->event_lock);
+                xfree(digl);
+                return rc;
+            }
         }
 
         gdprintk(XENLOG_INFO VTDPREFIX,
@@ -161,6 +251,7 @@ int pt_irq_destroy_bind_vtd(
     uint32_t device, intx, link;
     struct list_head *digl_list, *tmp;
     struct dev_intx_gsi_link *digl;
+    struct hvm_girq_dpci_mapping *girq;
 
     machine_gsi = pt_irq_bind->machine_irq;
     device = pt_irq_bind->u.pci.device;
@@ -183,8 +274,16 @@ int pt_irq_destroy_bind_vtd(
     }
 
     hvm_irq_dpci->link_cnt[link]--;
-    memset(&hvm_irq_dpci->girq[guest_gsi], 0,
-           sizeof(struct hvm_girq_dpci_mapping));
+
+    list_for_each_entry ( girq, &hvm_irq_dpci->girq[guest_gsi], list )
+    {
+        if ( girq->machine_gsi == machine_gsi )
+        {
+                list_del(&girq->list);
+                xfree(girq);
+                break;
+        }
+    }
 
     /* clear the mirq info */
     if ( test_bit(machine_gsi, hvm_irq_dpci->mapping))
@@ -207,7 +306,9 @@ int pt_irq_destroy_bind_vtd(
         if ( list_empty(&hvm_irq_dpci->mirq[machine_gsi].digl_list) )
         {
             pirq_guest_unbind(d, machine_gsi);
-            kill_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, machine_gsi)]);
+            msixtbl_pt_unregister(d, machine_gsi);
+            if ( pt_irq_need_timer(hvm_irq_dpci->mirq[machine_gsi].flags) )
+                kill_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, machine_gsi)]);
             hvm_irq_dpci->mirq[machine_gsi].dom   = NULL;
             hvm_irq_dpci->mirq[machine_gsi].flags = 0;
             clear_bit(machine_gsi, hvm_irq_dpci->mapping);
@@ -237,7 +338,7 @@ int hvm_do_IRQ_dpci(struct domain *d, unsigned int mirq)
      * PIC) and we need to detect that.
      */
     set_bit(mirq, dpci->dirq_mask);
-    if ( !test_bit(_HVM_IRQ_DPCI_MSI, &dpci->mirq[mirq].flags) )
+    if ( pt_irq_need_timer(dpci->mirq[mirq].flags) )
         set_timer(&dpci->hvm_timer[domain_irq_to_vector(d, mirq)],
                   NOW() + PT_IRQ_TIME_OUT);
     vcpu_kick(d->vcpu[0]);
@@ -245,46 +346,145 @@ int hvm_do_IRQ_dpci(struct domain *d, unsigned int mirq)
     return 1;
 }
 
-void hvm_dpci_msi_eoi(struct domain *d, int vector)
+#ifdef SUPPORT_MSI_REMAPPING
+/* called with d->event_lock held */
+static void __msi_pirq_eoi(struct domain *d, int pirq)
 {
     struct hvm_irq_dpci *hvm_irq_dpci = d->arch.hvm_domain.irq.dpci;
     irq_desc_t *desc;
+
+    if ( ( pirq >= 0 ) && ( pirq < NR_IRQS ) &&
+         test_bit(pirq, hvm_irq_dpci->mapping) &&
+         ( hvm_irq_dpci->mirq[pirq].flags & HVM_IRQ_DPCI_MACH_MSI) )
+    {
+         BUG_ON(!local_irq_is_enabled());
+         desc = domain_spin_lock_irq_desc(d, pirq, NULL);
+         if ( !desc )
+            return;
+
+         desc->status &= ~IRQ_INPROGRESS;
+         spin_unlock_irq(&desc->lock);
+
+         pirq_guest_eoi(d, pirq);
+    }
+}
+
+void hvm_dpci_msi_eoi(struct domain *d, int vector)
+{
+    struct hvm_irq_dpci *hvm_irq_dpci = d->arch.hvm_domain.irq.dpci;
     int pirq;
 
     if ( !iommu_enabled || (hvm_irq_dpci == NULL) )
        return;
 
     spin_lock(&d->event_lock);
+
     pirq = hvm_irq_dpci->msi_gvec_pirq[vector];
+    __msi_pirq_eoi(d, pirq);
 
-    if ( ( pirq >= 0 ) && (pirq < NR_PIRQS) &&
-          test_bit(pirq, hvm_irq_dpci->mapping) &&
-         (test_bit(_HVM_IRQ_DPCI_MSI, &hvm_irq_dpci->mirq[pirq].flags)))
-     {
-         BUG_ON(!local_irq_is_enabled());
-         desc = domain_spin_lock_irq_desc(d, pirq, NULL);
-         if (!desc)
-         {
+    spin_unlock(&d->event_lock);
+}
+
+extern int vmsi_deliver(struct domain *d, int pirq);
+static int hvm_pci_msi_assert(struct domain *d, int pirq)
+{
+    return vmsi_deliver(d, pirq);
+}
+#endif
+
+void hvm_dirq_assist(struct vcpu *v)
+{
+    unsigned int irq;
+    uint32_t device, intx;
+    struct domain *d = v->domain;
+    struct hvm_irq_dpci *hvm_irq_dpci = d->arch.hvm_domain.irq.dpci;
+    struct dev_intx_gsi_link *digl;
+
+    if ( !iommu_enabled || (v->vcpu_id != 0) || (hvm_irq_dpci == NULL) )
+        return;
+
+    for ( irq = find_first_bit(hvm_irq_dpci->dirq_mask, NR_IRQS);
+          irq < NR_IRQS;
+          irq = find_next_bit(hvm_irq_dpci->dirq_mask, NR_IRQS, irq + 1) )
+    {
+        if ( !test_and_clear_bit(irq, &hvm_irq_dpci->dirq_mask) )
+            continue;
+
+        spin_lock(&d->event_lock);
+#ifdef SUPPORT_MSI_REMAPPING
+        if ( hvm_irq_dpci->mirq[irq].flags & HVM_IRQ_DPCI_GUEST_MSI )
+        {
+            hvm_pci_msi_assert(d, irq);
             spin_unlock(&d->event_lock);
-            return;
-         }
+            continue;
+        }
+#endif
+        if ( pt_irq_need_timer(hvm_irq_dpci->mirq[irq].flags) )
+            stop_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, irq)]);
 
-         desc->status &= ~IRQ_INPROGRESS;
-         spin_unlock_irq(&desc->lock);
+        list_for_each_entry ( digl, &hvm_irq_dpci->mirq[irq].digl_list, list )
+        {
+            device = digl->device;
+            intx = digl->intx;
+            hvm_pci_intx_assert(d, device, intx);
+            hvm_irq_dpci->mirq[irq].pending++;
 
-         pirq_guest_eoi(d, pirq);
-     }
+#ifdef SUPPORT_MSI_REMAPPING
+            if ( hvm_irq_dpci->mirq[irq].flags & HVM_IRQ_DPCI_TRANSLATE )
+            {
+                /* for translated MSI to INTx interrupt, eoi as early as possible */
+                __msi_pirq_eoi(d, irq);
+            }
+#endif
+        }
 
-    spin_unlock(&d->event_lock);
+        /*
+         * Set a timer to see if the guest can finish the interrupt or not. For
+         * example, the guest OS may unmask the PIC during boot, before the
+         * guest driver is loaded. hvm_pci_intx_assert() may succeed, but the
+         * guest will never deal with the irq, then the physical interrupt line
+         * will never be deasserted.
+         */
+        if ( pt_irq_need_timer(hvm_irq_dpci->mirq[irq].flags) )
+            set_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, irq)],
+                      NOW() + PT_IRQ_TIME_OUT);
+        spin_unlock(&d->event_lock);
+    }
+}
+
+static void __hvm_dpci_eoi(struct domain *d,
+                           struct hvm_irq_dpci *hvm_irq_dpci,
+                           struct hvm_girq_dpci_mapping *girq,
+                           union vioapic_redir_entry *ent)
+{
+    uint32_t device, intx, machine_gsi;
+
+    device = girq->device;
+    intx = girq->intx;
+    hvm_pci_intx_deassert(d, device, intx);
+
+    machine_gsi = girq->machine_gsi;
+
+    /*
+     * No need to get vector lock for timer
+     * since interrupt is still not EOIed
+     */
+    if ( --hvm_irq_dpci->mirq[machine_gsi].pending ||
+         ( ent && ent->fields.mask ) ||
+         ! pt_irq_need_timer(hvm_irq_dpci->mirq[machine_gsi].flags) )
+        return;
+
+    stop_timer(&hvm_irq_dpci->hvm_timer[domain_irq_to_vector(d, machine_gsi)]);
+    pirq_guest_eoi(d, machine_gsi);
 }
 
 void hvm_dpci_eoi(struct domain *d, unsigned int guest_gsi,
                   union vioapic_redir_entry *ent)
 {
-    struct hvm_irq_dpci *hvm_irq_dpci = NULL;
-    uint32_t device, intx, machine_gsi;
+    struct hvm_irq_dpci *hvm_irq_dpci;
+    struct hvm_girq_dpci_mapping *girq;
 
-    if ( !iommu_enabled)
+    if ( !iommu_enabled )
         return;
 
     if ( guest_gsi < NR_ISAIRQS )
@@ -296,31 +496,12 @@ void hvm_dpci_eoi(struct domain *d, unsigned int guest_gsi,
     spin_lock(&d->event_lock);
     hvm_irq_dpci = domain_get_irq_dpci(d);
 
-    if((hvm_irq_dpci == NULL) ||
-         (guest_gsi >= NR_ISAIRQS &&
-          !hvm_irq_dpci->girq[guest_gsi].valid) )
-    {
-        spin_unlock(&d->event_lock);
-        return;
-    }
+    if ( !hvm_irq_dpci )
+        goto unlock;
 
-    device = hvm_irq_dpci->girq[guest_gsi].device;
-    intx = hvm_irq_dpci->girq[guest_gsi].intx;
-    hvm_pci_intx_deassert(d, device, intx);
+    list_for_each_entry ( girq, &hvm_irq_dpci->girq[guest_gsi], list )
+        __hvm_dpci_eoi(d, hvm_irq_dpci, girq, ent);
 
-    machine_gsi = hvm_irq_dpci->girq[guest_gsi].machine_gsi;
-    if ( --hvm_irq_dpci->mirq[machine_gsi].pending == 0 )
-    {
-        if ( (ent == NULL) || !ent->fields.mask )
-        {
-            /*
-             * No need to get vector lock for timer
-             * since interrupt is still not EOIed
-             */
-            stop_timer(&hvm_irq_dpci->hvm_timer[
-                domain_irq_to_vector(d, machine_gsi)]);
-            pirq_guest_eoi(d, machine_gsi);
-        }
-    }
+unlock:
     spin_unlock(&d->event_lock);
 }
index cc0ec145927f0fcb8de1f1a881d122641d96fffb..daab8c9b20163fa0b2b42d9d3e580e520f218f0d 100644 (file)
@@ -19,8 +19,6 @@
 #include <xen/paging.h>
 #include <xen/guest_access.h>
 
-extern struct iommu_ops intel_iommu_ops;
-extern struct iommu_ops amd_iommu_ops;
 static void parse_iommu_param(char *s);
 static int iommu_populate_page_table(struct domain *d);
 int intel_vtd_setup(void);
@@ -34,18 +32,28 @@ int amd_iov_detect(void);
  *   pv                         Enable IOMMU for PV domains
  *   no-pv                      Disable IOMMU for PV domains (default)
  *   force|required             Don't boot unless IOMMU is enabled
- *   passthrough                Bypass VT-d translation for Dom0
+ *   passthrough                Enable VT-d DMA passthrough (no DMA
+ *                              translation for Dom0)
+ *   no-snoop                   Disable VT-d Snoop Control
+ *   no-qinval                  Disable VT-d Queued Invalidation
+ *   no-intremap                Disable VT-d Interrupt Remapping
  */
 custom_param("iommu", parse_iommu_param);
 int iommu_enabled = 0;
 int iommu_pv_enabled = 0;
 int force_iommu = 0;
 int iommu_passthrough = 0;
+int iommu_snoop = 0;
+int iommu_qinval = 0;
+int iommu_intremap = 0;
 
 static void __init parse_iommu_param(char *s)
 {
     char *ss;
     iommu_enabled = 1;
+    iommu_snoop = 1;
+    iommu_qinval = 1;
+    iommu_intremap = 1;
 
     do {
         ss = strchr(s, ',');
@@ -63,6 +71,12 @@ static void __init parse_iommu_param(char *s)
             force_iommu = 1;
         else if ( !strcmp(s, "passthrough") )
             iommu_passthrough = 1;
+        else if ( !strcmp(s, "no-snoop") )
+            iommu_snoop = 0;
+        else if ( !strcmp(s, "no-qinval") )
+            iommu_qinval = 0;
+        else if ( !strcmp(s, "no-intremap") )
+            iommu_intremap = 0;
 
         s = ss + 1;
     } while ( ss );
@@ -85,9 +99,12 @@ int iommu_domain_init(struct domain *domain)
 int iommu_add_device(struct pci_dev *pdev)
 {
     struct hvm_iommu *hd;
+
     if ( !pdev->domain )
         return -EINVAL;
 
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
     hd = domain_hvm_iommu(pdev->domain);
     if ( !iommu_enabled || !hd->platform_ops )
         return 0;
@@ -111,20 +128,24 @@ int iommu_remove_device(struct pci_dev *pdev)
 int assign_device(struct domain *d, u8 bus, u8 devfn)
 {
     struct hvm_iommu *hd = domain_hvm_iommu(d);
-    int rc;
+    int rc = 0;
 
     if ( !iommu_enabled || !hd->platform_ops )
         return 0;
 
+    spin_lock(&pcidevs_lock);
     if ( (rc = hd->platform_ops->assign_device(d, bus, devfn)) )
-        return rc;
+        goto done;
 
     if ( has_arch_pdevs(d) && !is_hvm_domain(d) && !need_iommu(d) )
     {
         d->need_iommu = 1;
-        return iommu_populate_page_table(d);
+        rc = iommu_populate_page_table(d);
+        goto done;
     }
-    return 0;
+done:    
+    spin_unlock(&pcidevs_lock);
+    return rc;
 }
 
 static int iommu_populate_page_table(struct domain *d)
@@ -135,7 +156,7 @@ static int iommu_populate_page_table(struct domain *d)
 
     spin_lock(&d->page_alloc_lock);
 
-    list_for_each_entry ( page, &d->page_list, list )
+    page_list_for_each ( page, &d->page_list )
     {
         if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page )
         {
@@ -206,12 +227,26 @@ int iommu_unmap_page(struct domain *d, unsigned long gfn)
     return hd->platform_ops->unmap_page(d, gfn);
 }
 
-void deassign_device(struct domain *d, u8 bus, u8 devfn)
+/* caller should hold the pcidevs_lock */
+int deassign_device(struct domain *d, u8 bus, u8 devfn)
 {
     struct hvm_iommu *hd = domain_hvm_iommu(d);
+    struct pci_dev *pdev = NULL;
 
     if ( !iommu_enabled || !hd->platform_ops )
-        return;
+        return -EINVAL;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev(bus, devfn);
+    if (!pdev)
+        return -ENODEV;
+
+    if (pdev->domain != d)
+    {
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                "IOMMU: deassign a device not owned\n");
+        return -EINVAL;
+    }
 
     hd->platform_ops->reassign_device(d, dom0, bus, devfn);
 
@@ -220,6 +255,8 @@ void deassign_device(struct domain *d, u8 bus, u8 devfn)
         d->need_iommu = 0;
         hd->platform_ops->teardown(d);
     }
+
+    return 0;
 }
 
 static int iommu_setup(void)
@@ -262,7 +299,7 @@ int iommu_get_device_group(struct domain *d, u8 bus, u8 devfn,
 
     group_id = ops->get_device_group_id(bus, devfn);
 
-    read_lock(&pcidevs_lock);
+    spin_lock(&pcidevs_lock);
     for_each_pdev( d, pdev )
     {
         if ( (pdev->bus == bus) && (pdev->devfn == devfn) )
@@ -276,13 +313,13 @@ int iommu_get_device_group(struct domain *d, u8 bus, u8 devfn,
             bdf |= (pdev->devfn & 0xff) << 8;
             if ( unlikely(copy_to_guest_offset(buf, i, &bdf, 1)) )
             {
-                read_unlock(&pcidevs_lock);
+                spin_unlock(&pcidevs_lock);
                 return -1;
             }
             i++;
         }
     }
-    read_unlock(&pcidevs_lock);
+    spin_unlock(&pcidevs_lock);
 
     return i;
 }
index da39b475f2542c1d9cc205a36f44b22f57f6b081..d85a4675a03d6550da23eb745d24025f6d98d498 100644 (file)
 #include <xen/list.h>
 #include <xen/prefetch.h>
 #include <xen/iommu.h>
+#include <asm/hvm/iommu.h>
+#include <asm/hvm/irq.h>
 #include <xen/delay.h>
 #include <xen/keyhandler.h>
 
 
 LIST_HEAD(alldevs_list);
-rwlock_t pcidevs_lock = RW_LOCK_UNLOCKED;
+spinlock_t pcidevs_lock = SPIN_LOCK_UNLOCKED;
 
 struct pci_dev *alloc_pdev(u8 bus, u8 devfn)
 {
@@ -39,13 +41,14 @@ struct pci_dev *alloc_pdev(u8 bus, u8 devfn)
     pdev = xmalloc(struct pci_dev);
     if ( !pdev )
         return NULL;
+    memset(pdev, 0, sizeof(struct pci_dev));
 
     *((u8*) &pdev->bus) = bus;
     *((u8*) &pdev->devfn) = devfn;
     pdev->domain = NULL;
-    spin_lock_init(&pdev->lock);
     INIT_LIST_HEAD(&pdev->msi_list);
     list_add(&pdev->alldevs_list, &alldevs_list);
+    spin_lock_init(&pdev->msix_table_lock);
 
     return pdev;
 }
@@ -56,42 +59,35 @@ void free_pdev(struct pci_dev *pdev)
     xfree(pdev);
 }
 
-struct pci_dev *pci_lock_pdev(int bus, int devfn)
+struct pci_dev *pci_get_pdev(int bus, int devfn)
 {
-    struct pci_dev *pdev;
+    struct pci_dev *pdev = NULL;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
 
-    read_lock(&pcidevs_lock);
     list_for_each_entry ( pdev, &alldevs_list, alldevs_list )
         if ( (pdev->bus == bus || bus == -1) &&
              (pdev->devfn == devfn || devfn == -1) )
-    {
-        spin_lock(&pdev->lock);
-        read_unlock(&pcidevs_lock);
-        return pdev;
-    }
-    read_unlock(&pcidevs_lock);
+        {
+            return pdev;
+        }
 
     return NULL;
 }
 
-struct pci_dev *pci_lock_domain_pdev(struct domain *d, int bus, int devfn)
+struct pci_dev *pci_get_pdev_by_domain(struct domain *d, int bus, int devfn)
 {
-    struct pci_dev *pdev;
+    struct pci_dev *pdev = NULL;
 
-    read_lock(&pcidevs_lock);
-    list_for_each_entry ( pdev, &d->arch.pdev_list, domain_list )
-    {
-        spin_lock(&pdev->lock);
-        if ( (pdev->bus == bus || bus == -1) &&
-             (pdev->devfn == devfn || devfn == -1) &&
-             (pdev->domain == d) )
-        {
-            read_unlock(&pcidevs_lock);
-            return pdev;
-        }
-        spin_unlock(&pdev->lock);
-    }
-    read_unlock(&pcidevs_lock);
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
+    list_for_each_entry ( pdev, &alldevs_list, alldevs_list )
+         if ( (pdev->bus == bus || bus == -1) &&
+              (pdev->devfn == devfn || devfn == -1) &&
+              (pdev->domain == d) )
+         {
+             return pdev;
+         }
 
     return NULL;
 }
@@ -101,30 +97,26 @@ int pci_add_device(u8 bus, u8 devfn)
     struct pci_dev *pdev;
     int ret = -ENOMEM;
 
-    write_lock(&pcidevs_lock);
+    spin_lock(&pcidevs_lock);
     pdev = alloc_pdev(bus, devfn);
     if ( !pdev )
         goto out;
 
     ret = 0;
-    spin_lock(&pdev->lock);
     if ( !pdev->domain )
     {
         pdev->domain = dom0;
         ret = iommu_add_device(pdev);
         if ( ret )
-        {
-            spin_unlock(&pdev->lock);
             goto out;
-        }
+
         list_add(&pdev->domain_list, &dom0->arch.pdev_list);
     }
-    spin_unlock(&pdev->lock);
-    printk(XENLOG_DEBUG "PCI add device %02x:%02x.%x\n", bus,
-           PCI_SLOT(devfn), PCI_FUNC(devfn));
 
 out:
-    write_unlock(&pcidevs_lock);
+    spin_unlock(&pcidevs_lock);
+    printk(XENLOG_DEBUG "PCI add device %02x:%02x.%x\n", bus,
+           PCI_SLOT(devfn), PCI_FUNC(devfn));
     return ret;
 }
 
@@ -133,11 +125,10 @@ int pci_remove_device(u8 bus, u8 devfn)
     struct pci_dev *pdev;
     int ret = -ENODEV;;
 
-    write_lock(&pcidevs_lock);
+    spin_lock(&pcidevs_lock);
     list_for_each_entry ( pdev, &alldevs_list, alldevs_list )
         if ( pdev->bus == bus && pdev->devfn == devfn )
         {
-            spin_lock(&pdev->lock);
             ret = iommu_remove_device(pdev);
             if ( pdev->domain )
                 list_del(&pdev->domain_list);
@@ -148,7 +139,48 @@ int pci_remove_device(u8 bus, u8 devfn)
             break;
         }
 
-    write_unlock(&pcidevs_lock);
+    spin_unlock(&pcidevs_lock);
+    return ret;
+}
+
+int pci_add_device_ext(u8 bus, u8 devfn, struct pci_dev_info *info)
+{
+    int ret;
+    char *pdev_type;
+    struct pci_dev *pdev;
+
+    if (info->is_extfn)
+        pdev_type = "Extended Function";
+    else if (info->is_virtfn)
+        pdev_type = "Virtual Function";
+    else
+       return -EINVAL;;
+
+
+    ret = -ENOMEM;
+    spin_lock(&pcidevs_lock);
+    pdev = alloc_pdev(bus, devfn);
+    if ( !pdev )
+        goto out;
+
+    pdev->info = *info;
+
+    ret = 0;
+    if ( !pdev->domain )
+    {
+        pdev->domain = dom0;
+        ret = iommu_add_device(pdev);
+        if ( ret )
+            goto out;
+
+        list_add(&pdev->domain_list, &dom0->arch.pdev_list);
+    }
+
+out:
+    spin_unlock(&pcidevs_lock);
+    printk(XENLOG_DEBUG "PCI add %s %02x:%02x.%x\n", pdev_type,
+           bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
     return ret;
 }
 
@@ -169,9 +201,9 @@ static void pci_clean_dpci_irqs(struct domain *d)
     hvm_irq_dpci = domain_get_irq_dpci(d);
     if ( hvm_irq_dpci != NULL )
     {
-        for ( i = find_first_bit(hvm_irq_dpci->mapping, NR_PIRQS);
-              i < NR_PIRQS;
-              i = find_next_bit(hvm_irq_dpci->mapping, NR_PIRQS, i + 1) )
+        for ( i = find_first_bit(hvm_irq_dpci->mapping, NR_IRQS);
+              i < NR_IRQS;
+              i = find_next_bit(hvm_irq_dpci->mapping, NR_IRQS, i + 1) )
         {
             pirq_guest_unbind(d, i);
             kill_timer(&hvm_irq_dpci->hvm_timer[irq_to_vector(i)]);
@@ -197,37 +229,37 @@ void pci_release_devices(struct domain *d)
     struct pci_dev *pdev;
     u8 bus, devfn;
 
+    spin_lock(&pcidevs_lock);
     pci_clean_dpci_irqs(d);
-    while ( (pdev = pci_lock_domain_pdev(d, -1, -1)) )
+    while ( (pdev = pci_get_pdev_by_domain(d, -1, -1)) )
     {
         pci_cleanup_msi(pdev);
         bus = pdev->bus; devfn = pdev->devfn;
-        spin_unlock(&pdev->lock);
         deassign_device(d, bus, devfn);
     }
+    spin_unlock(&pcidevs_lock);
 }
 
+#ifdef SUPPORT_MSI_REMAPPING
 static void dump_pci_devices(unsigned char ch)
 {
     struct pci_dev *pdev;
     struct msi_desc *msi;
 
     printk("==== PCI devices ====\n");
-    read_lock(&pcidevs_lock);
+    spin_lock(&pcidevs_lock);
 
     list_for_each_entry ( pdev, &alldevs_list, alldevs_list )
     {
-        spin_lock(&pdev->lock);
         printk("%02x:%02x.%x - dom %-3d - MSIs < ",
                pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn),
                pdev->domain ? pdev->domain->domain_id : -1);
         list_for_each_entry ( msi, &pdev->msi_list, list )
                printk("%d ", msi->vector);
         printk(">\n");
-        spin_unlock(&pdev->lock);
     }
 
-    read_unlock(&pcidevs_lock);
+    spin_unlock(&pcidevs_lock);
 }
 
 static int __init setup_dump_pcidevs(void)
@@ -236,7 +268,7 @@ static int __init setup_dump_pcidevs(void)
     return 0;
 }
 __initcall(setup_dump_pcidevs);
-
+#endif
 
 
 /*
index 06ee624ae781f94c3d4ebf2e2c90d9b90643025b..0e6f1639df035f81365138218c79dc4cbe806731 100644 (file)
@@ -1,4 +1,5 @@
 subdir-$(x86) += x86
+subdir-$(ia64) += ia64
 
 obj-y += iommu.o
 obj-y += dmar.o
index 43107b3ae345a8af6a0554facf6d24d43de814d3..f5d7b786ff831859bddca5f3ef7ae810ba02a265 100644 (file)
@@ -21,6 +21,7 @@
 
 #include <xen/init.h>
 #include <xen/bitmap.h>
+#include <xen/errno.h>
 #include <xen/kernel.h>
 #include <xen/acpi.h>
 #include <xen/mm.h>
@@ -29,6 +30,7 @@
 #include <xen/pci_regs.h>
 #include <asm/string.h>
 #include "dmar.h"
+#include "iommu.h"
 
 int vtd_enabled = 1;
 
@@ -150,12 +152,24 @@ static int __init acpi_register_atsr_unit(struct acpi_atsr_unit *atsr)
     return 0;
 }
 
-struct acpi_drhd_unit * acpi_find_matched_drhd_unit(u8 bus, u8 devfn)
+struct acpi_drhd_unit * acpi_find_matched_drhd_unit(struct pci_dev *pdev)
 {
+    u8 bus, devfn;
     struct acpi_drhd_unit *drhd;
     struct acpi_drhd_unit *found = NULL, *include_all = NULL;
     int i;
 
+    if (pdev->info.is_extfn) {
+        bus = pdev->bus;
+        devfn = 0;
+    } else if (pdev->info.is_virtfn) {
+        bus = pdev->info.physfn.bus;
+        devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
+    } else {
+        bus = pdev->bus;
+        devfn = pdev->devfn;
+    }
+
     list_for_each_entry ( drhd, &acpi_drhd_units, list )
     {
         for (i = 0; i < drhd->scope.devices_cnt; i++)
@@ -172,6 +186,28 @@ struct acpi_drhd_unit * acpi_find_matched_drhd_unit(u8 bus, u8 devfn)
     return found ? found : include_all;
 }
 
+struct acpi_atsr_unit * acpi_find_matched_atsr_unit(u8 bus, u8 devfn)
+{
+    struct acpi_atsr_unit *atsr;
+    struct acpi_atsr_unit *found = NULL, *include_all = NULL;
+    int i;
+
+    list_for_each_entry ( atsr, &acpi_atsr_units, list )
+    {
+        for (i = 0; i < atsr->scope.devices_cnt; i++)
+            if ( atsr->scope.devices[i] == PCI_BDF2(bus, devfn) )
+                return atsr;
+
+        if ( test_bit(bus, atsr->scope.buses) )
+            found = atsr;
+
+        if ( atsr->all_ports )
+            include_all = atsr;
+    }
+
+    return found ? found : include_all;
+}
+
 /*
  * Count number of devices in device scope.  Do not include PCI sub
  * hierarchies.
@@ -242,7 +278,6 @@ static int __init acpi_parse_dev_scope(void *start, void *end,
         switch ( acpi_scope->dev_type )
         {
         case ACPI_DEV_P2PBRIDGE:
-        {
             sec_bus = pci_conf_read8(
                 bus, path->dev, path->fn, PCI_SECONDARY_BUS);
             sub_bus = pci_conf_read8(
@@ -253,7 +288,6 @@ static int __init acpi_parse_dev_scope(void *start, void *end,
 
             dmar_scope_add_buses(scope, sec_bus, sub_bus);
             break;
-        }
 
         case ACPI_DEV_MSI_HPET:
             dprintk(XENLOG_INFO VTDPREFIX, "found MSI HPET: bdf = %x:%x.%x\n",
@@ -268,7 +302,6 @@ static int __init acpi_parse_dev_scope(void *start, void *end,
             break;
 
         case ACPI_DEV_IOAPIC:
-        {
             dprintk(XENLOG_INFO VTDPREFIX, "found IOAPIC: bdf = %x:%x.%x\n",
                     bus, path->dev, path->fn);
 
@@ -288,7 +321,6 @@ static int __init acpi_parse_dev_scope(void *start, void *end,
             scope->devices[didx++] = PCI_BDF(bus, path->dev, path->fn);
             break;
         }
-        }
 
         start += acpi_scope->length;
    }
@@ -351,10 +383,25 @@ acpi_parse_one_rmrr(struct acpi_dmar_entry_header *header)
 
     if ( rmrr->base_address >= rmrr->end_address )
     {
-        dprintk(XENLOG_ERR VTDPREFIX, "RMRR is incorrect.\n");
+        dprintk(XENLOG_ERR VTDPREFIX,
+                "RMRR error: base_addr %"PRIx64" end_address %"PRIx64"\n",
+                rmrr->base_address, rmrr->end_address);
         return -EFAULT;
     }
 
+#ifdef CONFIG_X86
+    /* This check is here simply to detect when RMRR values are not properly represented in the 
+       system memory map and inform the user */
+    if ( (!page_is_ram_type(paddr_to_pfn(rmrr->base_address), RAM_TYPE_RESERVED))||
+         (!page_is_ram_type(paddr_to_pfn(rmrr->end_address) - 1, RAM_TYPE_RESERVED)) )
+    {
+        dprintk(XENLOG_WARNING VTDPREFIX,
+                "RMRR address range not in reserved memory base = %"PRIx64" end = %"PRIx64"; " \
+                "iommu_inclusive_mapping=1 parameter may be needed.\n",
+                rmrr->base_address, rmrr->end_address);
+    }
+#endif
+
     rmrru = xmalloc(struct acpi_rmrr_unit);
     if ( !rmrru )
         return -ENOMEM;
@@ -485,6 +532,15 @@ static int __init acpi_parse_dmar(struct acpi_table_header *table)
     return ret;
 }
 
+#ifdef CONFIG_X86
+#include <asm/tboot.h>
+/* ACPI tables may not be DMA protected by tboot, so use DMAR copy */
+/* SINIT saved in SinitMleData in TXT heap (which is DMA protected) */
+#define parse_dmar_table(h) tboot_parse_dmar_table(h)
+#else
+#define parse_dmar_table(h) acpi_table_parse(ACPI_SIG_DMAR, h)
+#endif
+
 int acpi_dmar_init(void)
 {
     int rc;
@@ -496,7 +552,7 @@ int acpi_dmar_init(void)
     if ( !iommu_enabled )
         goto fail;
 
-    rc = acpi_table_parse(ACPI_SIG_DMAR, acpi_parse_dmar);
+    rc = parse_dmar_table(acpi_parse_dmar);
     if ( rc )
         goto fail;
 
@@ -504,7 +560,7 @@ int acpi_dmar_init(void)
     if ( list_empty(&acpi_drhd_units) )
         goto fail;
 
-    printk("Intel VT-d has been enabled\n");
+    printk("Intel VT-d DMAR tables have been parsed.\n");
 
     return 0;
 
index bcbb88bf342221ba84f8f4f00d32e642d58ed728..3664b19d2107908bf952f1cde5d3c6c4c7bb04ec 100644 (file)
@@ -79,7 +79,8 @@ struct acpi_atsr_unit {
         for (idx = 0; (bdf = rmrr->scope.devices[idx]) && \
                  idx < rmrr->scope.devices_cnt; idx++)
 
-struct acpi_drhd_unit * acpi_find_matched_drhd_unit(u8 bus, u8 devfn);
+struct acpi_drhd_unit * acpi_find_matched_drhd_unit(struct pci_dev *pdev);
+struct acpi_atsr_unit * acpi_find_matched_atsr_unit(u8 bus, u8 devfn);
 void dmar_scope_add_buses(struct dmar_scope *scope, u16 sec, u16 sub);
 void dmar_scope_remove_buses(struct dmar_scope *scope, u16 sec, u16 sub);
 
index 19091216e9ab85788c8c0b70aa93bc87fb4fcb9c..52b531f3b0e35121de572a7169734005caf43223 100644 (file)
@@ -30,8 +30,10 @@ void print_iommu_regs(struct acpi_drhd_unit *drhd);
 void print_vtd_entries(struct iommu *iommu, int bus, int devfn, u64 gmfn);
 void dump_iommu_info(unsigned char key);
 
-int qinval_setup(struct iommu *iommu);
-int intremap_setup(struct iommu *iommu);
+int enable_qinval(struct iommu *iommu);
+void disable_qinval(struct iommu *iommu);
+int enable_intremap(struct iommu *iommu);
+void disable_intremap(struct iommu *iommu);
 int queue_invalidate_context(struct iommu *iommu,
     u16 did, u16 source_id, u8 function_mask, u8 granu);
 int queue_invalidate_iotlb(struct iommu *iommu,
diff --git a/xen/drivers/passthrough/vtd/ia64/Makefile b/xen/drivers/passthrough/vtd/ia64/Makefile
new file mode 100644 (file)
index 0000000..85243e3
--- /dev/null
@@ -0,0 +1 @@
+obj-y += vtd.o
diff --git a/xen/drivers/passthrough/vtd/ia64/vtd.c b/xen/drivers/passthrough/vtd/ia64/vtd.c
new file mode 100644 (file)
index 0000000..91d6338
--- /dev/null
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) Allen Kay <allen.m.kay@intel.com>
+ * Copyright (C) Weidong Han <weidong.han@intel.com>
+ */
+
+#include <xen/sched.h>
+#include <xen/domain_page.h>
+#include <xen/iommu.h>
+#include <xen/numa.h>
+#include <asm/xensystem.h>
+#include <asm/sal.h>
+#include "../iommu.h"
+#include "../dmar.h"
+#include "../vtd.h"
+
+
+int vector_irq[NR_VECTORS] __read_mostly = {
+    [0 ... NR_VECTORS - 1] = FREE_TO_ASSIGN_IRQ
+};
+/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
+u8 irq_vector[NR_IRQS] __read_mostly;
+
+void *map_vtd_domain_page(u64 maddr)
+{
+    return (void *)((u64)map_domain_page(maddr >> PAGE_SHIFT) |
+            (maddr & (PAGE_SIZE - PAGE_SIZE_4K)));
+}
+
+void unmap_vtd_domain_page(void *va)
+{
+    unmap_domain_page(va);
+}
+
+/* Allocate page table, return its machine address */
+u64 alloc_pgtable_maddr(struct domain *d, unsigned long npages)
+{
+    struct page_info *pg;
+    u64 *vaddr;
+
+    pg = alloc_domheap_pages(NULL, get_order_from_pages(npages),
+                             d ? MEMF_node(domain_to_node(d)) : 0);
+    vaddr = map_domain_page(page_to_mfn(pg));
+    if ( !vaddr )
+        return 0;
+    memset(vaddr, 0, PAGE_SIZE * npages);
+
+    iommu_flush_cache_page(vaddr, npages);
+    unmap_domain_page(vaddr);
+
+    return page_to_maddr(pg);
+}
+
+void free_pgtable_maddr(u64 maddr)
+{
+    if ( maddr != 0 )
+        free_domheap_page(maddr_to_page(maddr));
+}
+
+unsigned int get_cache_line_size(void)
+{
+    return L1_CACHE_BYTES;
+}
+
+void cacheline_flush(char * addr)
+{
+    ia64_fc(addr);
+    ia64_sync_i();
+    ia64_srlz_i();
+}
+
+void flush_all_cache()
+{
+    ia64_sal_cache_flush(3);
+}
+
+void * map_to_nocache_virt(int nr_iommus, u64 maddr)
+{
+  return (void *) ( maddr + __IA64_UNCACHED_OFFSET);
+}
+
+struct hvm_irq_dpci *domain_get_irq_dpci(struct domain *domain)
+{
+    if ( !domain )
+        return NULL;
+
+    return domain->arch.hvm_domain.irq.dpci;
+}
+
+int domain_set_irq_dpci(struct domain *domain, struct hvm_irq_dpci *dpci)
+{
+    if ( !domain || !dpci )
+        return 0;
+
+    domain->arch.hvm_domain.irq.dpci = dpci;
+    return 1;
+}
+
+void hvm_dpci_isairq_eoi(struct domain *d, unsigned int isairq)
+{
+    /* dummy */
+}
+
+static int do_dom0_iommu_mapping(unsigned long start, unsigned long end,
+                               void *arg)
+{
+    unsigned long tmp, pfn, j, page_addr = start;
+    struct domain *d = (struct domain *)arg;
+
+    extern int xen_in_range(paddr_t start, paddr_t end);
+    /* Set up 1:1 page table for dom0 for all Ram except Xen bits.*/
+
+    while (page_addr < end)
+    {
+       if (xen_in_range(page_addr, page_addr + PAGE_SIZE))
+            continue;
+
+        pfn = page_addr >> PAGE_SHIFT;
+        tmp = 1 << (PAGE_SHIFT - PAGE_SHIFT_4K);
+        for ( j = 0; j < tmp; j++ )
+            iommu_map_page(d, (pfn*tmp+j), (pfn*tmp+j));
+
+       page_addr += PAGE_SIZE;
+    }
+    return 0;
+}
+
+void iommu_set_dom0_mapping(struct domain *d)
+{
+       if (dom0)
+           BUG_ON(d != dom0);
+       efi_memmap_walk(do_dom0_iommu_mapping, d);
+}
index e2ed43db17e048ea069d61128f46d74ccd9800cc..eb3e69ef3b9fbb521e10287e067cfb7d67575ab3 100644 (file)
@@ -21,6 +21,7 @@
 #include <xen/irq.h>
 #include <xen/sched.h>
 #include <xen/iommu.h>
+#include <asm/hvm/iommu.h>
 #include <xen/time.h>
 #include <xen/pci.h>
 #include <xen/pci_regs.h>
 #include "vtd.h"
 #include "extern.h"
 
+#ifndef dest_SMI
+#define dest_SMI -1
+#endif
+
+/* The max number of IOAPIC (or IOSAPIC) pin. The typical values can be 24 or
+ * 48 on x86 and Itanium platforms. Here we use a biger number 256. This
+ * should be big enough. Actually now IREMAP_ENTRY_NR is also 256.
+ */
+#define MAX_IOAPIC_PIN_NUM  256
+
+static int ioapic_pin_to_intremap_index[MAX_IOAPIC_PIN_NUM] =
+    { [0 ... MAX_IOAPIC_PIN_NUM-1] = -1 };
+
 u16 apicid_to_bdf(int apic_id)
 {
     struct acpi_drhd_unit *drhd = ioapic_to_drhd(apic_id);
@@ -89,7 +103,7 @@ static int remap_entry_to_ioapic_rte(
 }
 
 static int ioapic_rte_to_remap_entry(struct iommu *iommu,
-    int apic_id, struct IO_xAPIC_route_entry *old_rte,
+    int apic_id, unsigned int ioapic_pin, struct IO_xAPIC_route_entry *old_rte,
     unsigned int rte_upper, unsigned int value)
 {
     struct iremap_entry *iremap_entry = NULL, *iremap_entries;
@@ -103,13 +117,14 @@ static int ioapic_rte_to_remap_entry(struct iommu *iommu,
     remap_rte = (struct IO_APIC_route_remap_entry *) old_rte;
     spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);
 
-    if ( remap_rte->format == 0 )
+    if ( ioapic_pin_to_intremap_index[ioapic_pin] < 0 )
     {
         ir_ctrl->iremap_index++;
         index = ir_ctrl->iremap_index;
+        ioapic_pin_to_intremap_index[ioapic_pin] = index;
     }
     else
-        index = (remap_rte->index_15 << 15) | remap_rte->index_0_14;
+        index = ioapic_pin_to_intremap_index[ioapic_pin];
 
     if ( index > IREMAP_ENTRY_NR - 1 )
     {
@@ -128,7 +143,13 @@ static int ioapic_rte_to_remap_entry(struct iommu *iommu,
     memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry));
 
     if ( rte_upper )
+    {
+#if defined(__i386__) || defined(__x86_64__)
         new_ire.lo.dst = (value >> 24) << 8;
+#else /* __ia64__ */
+        new_ire.lo.dst = value >> 16;
+#endif
+    }
     else
     {
         *(((u32 *)&new_rte) + 0) = value;
@@ -179,7 +200,7 @@ unsigned int io_apic_read_remap_rte(
     struct IO_xAPIC_route_entry old_rte = { 0 };
     struct IO_APIC_route_remap_entry *remap_rte;
     int rte_upper = (reg & 1) ? 1 : 0;
-    struct iommu *iommu = ioapic_to_iommu(mp_ioapics[apic].mpc_apicid);
+    struct iommu *iommu = ioapic_to_iommu(IO_APIC_ID(apic));
     struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
 
     if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 ||
@@ -200,7 +221,7 @@ unsigned int io_apic_read_remap_rte(
 
     remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte;
 
-    if ( remap_rte->format == 0 )
+    if ( (remap_rte->format == 0) || (old_rte.delivery_mode == dest_SMI) )
     {
         *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg;
         return *(IO_APIC_BASE(apic)+4);
@@ -221,10 +242,11 @@ unsigned int io_apic_read_remap_rte(
 void io_apic_write_remap_rte(
     unsigned int apic, unsigned int reg, unsigned int value)
 {
+    unsigned int ioapic_pin = (reg - 0x10) / 2;
     struct IO_xAPIC_route_entry old_rte = { 0 };
     struct IO_APIC_route_remap_entry *remap_rte;
     unsigned int rte_upper = (reg & 1) ? 1 : 0;
-    struct iommu *iommu = ioapic_to_iommu(mp_ioapics[apic].mpc_apicid);
+    struct iommu *iommu = ioapic_to_iommu(IO_APIC_ID(apic));
     struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
     int saved_mask;
 
@@ -246,6 +268,31 @@ void io_apic_write_remap_rte(
 
     remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte;
 
+    if ( old_rte.delivery_mode == dest_SMI )
+    {
+        /* Some BIOS does not zero out reserve fields in IOAPIC
+         * RTE's.  clear_IO_APIC() zeroes out all RTE's except for RTE
+         * with MSI delivery type.  This is a problem when the host
+         * OS converts SMI delivery type to some other type but leaving
+         * the reserved field uninitialized.  This can cause interrupt
+         * remapping table out of bound error if "format" field is 1
+         * and the "index" field has a value that that is larger than 
+         * the maximum index of interrupt remapping table.
+         */
+        if ( remap_rte->format == 1 )
+        {
+            remap_rte->format = 0;
+            *IO_APIC_BASE(apic) = reg;
+            *(IO_APIC_BASE(apic)+4) = *(((u32 *)&old_rte)+0);
+            *IO_APIC_BASE(apic) = reg + 1;
+            *(IO_APIC_BASE(apic)+4) = *(((u32 *)&old_rte)+1);
+        }
+
+        *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg;
+        *(IO_APIC_BASE(apic)+4) = value;
+        return;
+    }
+
     /* mask the interrupt while we change the intremap table */
     saved_mask = remap_rte->mask;
     remap_rte->mask = 1;
@@ -253,7 +300,8 @@ void io_apic_write_remap_rte(
     *(IO_APIC_BASE(apic)+4) = *(((int *)&old_rte)+0);
     remap_rte->mask = saved_mask;
 
-    if ( ioapic_rte_to_remap_entry(iommu, mp_ioapics[apic].mpc_apicid,
+    ASSERT(ioapic_pin < MAX_IOAPIC_PIN_NUM);
+    if ( ioapic_rte_to_remap_entry(iommu, IO_APIC_ID(apic), ioapic_pin,
                                    &old_rte, rte_upper, value) )
     {
         *IO_APIC_BASE(apic) = rte_upper ? (reg + 1) : reg;
@@ -414,7 +462,7 @@ void msi_msg_read_remap_rte(
     struct iommu *iommu = NULL;
     struct ir_ctrl *ir_ctrl;
 
-    drhd = acpi_find_matched_drhd_unit(pdev->bus, pdev->devfn);
+    drhd = acpi_find_matched_drhd_unit(pdev);
     iommu = drhd->iommu;
 
     ir_ctrl = iommu_ir_ctrl(iommu);
@@ -432,7 +480,7 @@ void msi_msg_write_remap_rte(
     struct iommu *iommu = NULL;
     struct ir_ctrl *ir_ctrl;
 
-    drhd = acpi_find_matched_drhd_unit(pdev->bus, pdev->devfn);
+    drhd = acpi_find_matched_drhd_unit(pdev);
     iommu = drhd->iommu;
 
     ir_ctrl = iommu_ir_ctrl(iommu);
@@ -455,23 +503,22 @@ void msi_msg_write_remap_rte(
 }
 #endif
 
-int intremap_setup(struct iommu *iommu)
+int enable_intremap(struct iommu *iommu)
 {
     struct ir_ctrl *ir_ctrl;
     s_time_t start_time;
 
-    if ( !ecap_intr_remap(iommu->ecap) )
-        return -ENODEV;
+    ASSERT(ecap_intr_remap(iommu->ecap) && iommu_intremap);
 
     ir_ctrl = iommu_ir_ctrl(iommu);
     if ( ir_ctrl->iremap_maddr == 0 )
     {
-        ir_ctrl->iremap_maddr = alloc_pgtable_maddr();
+        ir_ctrl->iremap_maddr = alloc_pgtable_maddr(NULL, 1);
         if ( ir_ctrl->iremap_maddr == 0 )
         {
             dprintk(XENLOG_WARNING VTDPREFIX,
                     "Cannot allocate memory for ir_ctrl->iremap_maddr\n");
-            return -ENODEV;
+            return -ENOMEM;
         }
         ir_ctrl->iremap_index = -1;
     }
@@ -479,10 +526,10 @@ int intremap_setup(struct iommu *iommu)
 #if defined(ENABLED_EXTENDED_INTERRUPT_SUPPORT)
     /* set extended interrupt mode bit */
     ir_ctrl->iremap_maddr |=
-            ecap_ext_intr(iommu->ecap) ? (1 << IRTA_REG_EIMI_SHIFT) : 0;
+            ecap_ext_intr(iommu->ecap) ? (1 << IRTA_REG_EIME_SHIFT) : 0;
 #endif
-    /* size field = 256 entries per 4K page = 8 - 1 */
-    ir_ctrl->iremap_maddr |= 7;
+    /* set size of the interrupt remapping table */
+    ir_ctrl->iremap_maddr |= IRTA_REG_TABLE_SIZE;
     dmar_writeq(iommu->reg, DMAR_IRTA_REG, ir_ctrl->iremap_maddr);
 
     /* set SIRTP */
@@ -494,11 +541,7 @@ int intremap_setup(struct iommu *iommu)
     while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_SIRTPS) )
     {
         if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
-        {
-            dprintk(XENLOG_ERR VTDPREFIX,
-                    "Cannot set SIRTP field for interrupt remapping\n");
-            return -ENODEV;
-        }
+            panic("Cannot set SIRTP field for interrupt remapping\n");
         cpu_relax();
     }
 
@@ -510,11 +553,7 @@ int intremap_setup(struct iommu *iommu)
     while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_CFIS) )
     {
         if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
-        {
-            dprintk(XENLOG_ERR VTDPREFIX,
-                    "Cannot set CFI field for interrupt remapping\n");
-            return -ENODEV;
-        }
+            panic("Cannot set CFI field for interrupt remapping\n");
         cpu_relax();
     }
 
@@ -525,12 +564,8 @@ int intremap_setup(struct iommu *iommu)
     start_time = NOW();
     while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_IRES) )
     {
-        if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) ) 
-        {
-            dprintk(XENLOG_ERR VTDPREFIX,
-                    "Cannot set IRE field for interrupt remapping\n");
-            return -ENODEV;
-        }
+        if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
+            panic("Cannot set IRE field for interrupt remapping\n");
         cpu_relax();
     }
 
@@ -539,3 +574,21 @@ int intremap_setup(struct iommu *iommu)
 
     return 0;
 }
+
+void disable_intremap(struct iommu *iommu)
+{
+    s_time_t start_time;
+
+    ASSERT(ecap_intr_remap(iommu->ecap) && iommu_intremap);
+
+    iommu->gcmd &= ~(DMA_GCMD_SIRTP | DMA_GCMD_CFI | DMA_GCMD_IRE);
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+
+    start_time = NOW();
+    while ( dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_IRES )
+    {
+        if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
+            panic("Cannot clear IRE field for interrupt remapping\n");
+        cpu_relax();
+    }
+}
index 2b36b7d06877afef705f8e89d395dec0d1b5fe5b..6e4d4a1afad6be64f4070f0284bbbea8641ee5d0 100644 (file)
 #include <xen/xmalloc.h>
 #include <xen/domain_page.h>
 #include <xen/iommu.h>
+#include <asm/hvm/iommu.h>
 #include <xen/numa.h>
 #include <xen/time.h>
 #include <xen/pci.h>
 #include <xen/pci_regs.h>
 #include <xen/keyhandler.h>
+#include <asm/msi.h>
 #include "iommu.h"
 #include "dmar.h"
 #include "extern.h"
@@ -39,6 +41,7 @@
 static spinlock_t domid_bitmap_lock;    /* protect domain id bitmap */
 static int domid_bitmap_size;           /* domain id bitmap size in bits */
 static unsigned long *domid_bitmap;     /* iommu domain id bitmap */
+static bool_t rwbf_quirk;
 
 static void setup_dom0_devices(struct domain *d);
 static void setup_dom0_rmrr(struct domain *d);
@@ -48,15 +51,14 @@ static void setup_dom0_rmrr(struct domain *d);
 static void context_set_domain_id(struct context_entry *context,
                                   struct domain *d)
 {
-    unsigned long flags;
     domid_t iommu_domid = domain_iommu_domid(d);
 
     if ( iommu_domid == 0 )
     {
-        spin_lock_irqsave(&domid_bitmap_lock, flags);
+        spin_lock(&domid_bitmap_lock);
         iommu_domid = find_first_zero_bit(domid_bitmap, domid_bitmap_size);
         set_bit(iommu_domid, domid_bitmap);
-        spin_unlock_irqrestore(&domid_bitmap_lock, flags);
+        spin_unlock(&domid_bitmap_lock);
         d->arch.hvm_domain.hvm_iommu.iommu_domid = iommu_domid;
     }
 
@@ -129,9 +131,9 @@ void iommu_flush_cache_entry(void *addr)
     __iommu_flush_cache(addr, 8);
 }
 
-void iommu_flush_cache_page(void *addr)
+void iommu_flush_cache_page(void *addr, unsigned long npages)
 {
-    __iommu_flush_cache(addr, PAGE_SIZE_4K);
+    __iommu_flush_cache(addr, PAGE_SIZE_4K * npages);
 }
 
 int nr_iommus;
@@ -139,19 +141,17 @@ int nr_iommus;
 static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
 {
     struct root_entry *root, *root_entries;
-    unsigned long flags;
     u64 maddr;
 
-    spin_lock_irqsave(&iommu->lock, flags);
+    ASSERT(spin_is_locked(&iommu->lock));
     root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
     root = &root_entries[bus];
     if ( !root_present(*root) )
     {
-        maddr = alloc_pgtable_maddr();
+        maddr = alloc_pgtable_maddr(NULL, 1);
         if ( maddr == 0 )
         {
             unmap_vtd_domain_page(root_entries);
-            spin_unlock_irqrestore(&iommu->lock, flags);
             return 0;
         }
         set_root_value(*root, maddr);
@@ -160,36 +160,9 @@ static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
     }
     maddr = (u64) get_context_addr(*root);
     unmap_vtd_domain_page(root_entries);
-    spin_unlock_irqrestore(&iommu->lock, flags);
     return maddr;
 }
 
-static int device_context_mapped(struct iommu *iommu, u8 bus, u8 devfn)
-{
-    struct root_entry *root, *root_entries;
-    struct context_entry *context;
-    u64 context_maddr;
-    int ret;
-    unsigned long flags;
-
-    spin_lock_irqsave(&iommu->lock, flags);
-    root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
-    root = &root_entries[bus];
-    if ( !root_present(*root) )
-    {
-        ret = 0;
-        goto out;
-    }
-    context_maddr = get_context_addr(*root);
-    context = (struct context_entry *)map_vtd_domain_page(context_maddr);
-    ret = context_present(context[devfn]);
-    unmap_vtd_domain_page(context);
- out:
-    unmap_vtd_domain_page(root_entries);
-    spin_unlock_irqrestore(&iommu->lock, flags);
-    return ret;
-}
-
 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
 {
     struct hvm_iommu *hd = domain_hvm_iommu(domain);
@@ -197,14 +170,13 @@ static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
     struct dma_pte *parent, *pte = NULL;
     int level = agaw_to_level(hd->agaw);
     int offset;
-    unsigned long flags;
     u64 pte_maddr = 0, maddr;
     u64 *vaddr = NULL;
 
     addr &= (((u64)1) << addr_width) - 1;
-    spin_lock_irqsave(&hd->mapping_lock, flags);
+    ASSERT(spin_is_locked(&hd->mapping_lock));
     if ( hd->pgd_maddr == 0 )
-        if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr()) == 0) )
+        if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(domain, 1)) == 0) )
             goto out;
 
     parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr);
@@ -217,11 +189,11 @@ static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
         {
             if ( !alloc )
                 break;
-            maddr = alloc_pgtable_maddr();
+            maddr = alloc_pgtable_maddr(domain, 1);
+            if ( !maddr )
+                break;
             dma_set_pte_addr(*pte, maddr);
             vaddr = map_vtd_domain_page(maddr);
-            if ( !vaddr )
-                break;
 
             /*
              * high level table always sets r/w, last level
@@ -234,8 +206,6 @@ static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
         else
         {
             vaddr = map_vtd_domain_page(pte->val);
-            if ( !vaddr )
-                break;
         }
 
         if ( level == 2 )
@@ -253,7 +223,6 @@ static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
 
     unmap_vtd_domain_page(parent);
  out:
-    spin_unlock_irqrestore(&hd->mapping_lock, flags);
     return pte_maddr;
 }
 
@@ -263,7 +232,7 @@ static void iommu_flush_write_buffer(struct iommu *iommu)
     unsigned long flag;
     s_time_t start_time;
 
-    if ( !cap_rwbf(iommu->cap) )
+    if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
         return;
     val = iommu->gcmd | DMA_GCMD_WBF;
 
@@ -447,10 +416,6 @@ static int flush_iotlb_reg(void *_iommu, u16 did,
     if ( DMA_TLB_IAIG(val) == 0 )
         dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
 
-    if ( DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type) )
-        dprintk(XENLOG_INFO VTDPREFIX,
-                "IOMMU: tlb flush request %x, actual %x\n",
-               (u32)DMA_TLB_IIRG(type), (u32)DMA_TLB_IAIG(val));
     /* flush iotlb entry will implicitly flush write buffer */
     return 0;
 }
@@ -541,22 +506,30 @@ static void dma_pte_clear_one(struct domain *domain, u64 addr)
     struct dma_pte *page = NULL, *pte = NULL;
     u64 pg_maddr;
 
+    spin_lock(&hd->mapping_lock);
     /* get last level pte */
     pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
     if ( pg_maddr == 0 )
+    {
+        spin_unlock(&hd->mapping_lock);
         return;
+    }
+
     page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
     pte = page + address_level_offset(addr, 1);
 
     if ( !dma_pte_present(*pte) )
     {
+        spin_unlock(&hd->mapping_lock);
         unmap_vtd_domain_page(page);
         return;
     }
 
     dma_clear_pte(*pte); 
+    spin_unlock(&hd->mapping_lock);
     iommu_flush_cache_entry(pte);
 
+    /* No need pcidevs_lock here since do that on assign/deassign device*/
     for_each_drhd_unit ( drhd )
     {
         iommu = drhd->iommu;
@@ -569,26 +542,6 @@ static void dma_pte_clear_one(struct domain *domain, u64 addr)
     unmap_vtd_domain_page(page);
 }
 
-/* clear last level pte, a tlb flush should be followed */
-static void dma_pte_clear_range(struct domain *domain, u64 start, u64 end)
-{
-    struct hvm_iommu *hd = domain_hvm_iommu(domain);
-    int addr_width = agaw_to_width(hd->agaw);
-
-    start &= (((u64)1) << addr_width) - 1;
-    end &= (((u64)1) << addr_width) - 1;
-    /* in case it's partial page */
-    start = PAGE_ALIGN_4K(start);
-    end &= PAGE_MASK_4K;
-
-    /* we don't need lock here, nobody else touches the iova range */
-    while ( start < end )
-    {
-        dma_pte_clear_one(domain, start);
-        start += PAGE_SIZE_4K;
-    }
-}
-
 static void iommu_free_pagetable(u64 pt_maddr, int level)
 {
     int i;
@@ -623,16 +576,18 @@ static int iommu_set_root_entry(struct iommu *iommu)
     unsigned long flags;
     s_time_t start_time;
 
-    spin_lock_irqsave(&iommu->register_lock, flags);
+    spin_lock(&iommu->lock);
 
     if ( iommu->root_maddr == 0 )
-        iommu->root_maddr = alloc_pgtable_maddr();
+        iommu->root_maddr = alloc_pgtable_maddr(NULL, 1);
     if ( iommu->root_maddr == 0 )
     {
-        spin_unlock_irqrestore(&iommu->register_lock, flags);
+        spin_unlock(&iommu->lock);
         return -ENOMEM;
     }
 
+    spin_unlock(&iommu->lock);
+    spin_lock_irqsave(&iommu->register_lock, flags);
     dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
     cmd = iommu->gcmd | DMA_GCMD_SRTP;
     dmar_writel(iommu->reg, DMAR_GCMD_REG, cmd);
@@ -655,7 +610,7 @@ static int iommu_set_root_entry(struct iommu *iommu)
     return 0;
 }
 
-static int iommu_enable_translation(struct iommu *iommu)
+static void iommu_enable_translation(struct iommu *iommu)
 {
     u32 sts;
     unsigned long flags;
@@ -682,10 +637,9 @@ static int iommu_enable_translation(struct iommu *iommu)
     /* Disable PMRs when VT-d engine takes effect per spec definition */
     disable_pmr(iommu);
     spin_unlock_irqrestore(&iommu->register_lock, flags);
-    return 0;
 }
 
-int iommu_disable_translation(struct iommu *iommu)
+static void iommu_disable_translation(struct iommu *iommu)
 {
     u32 sts;
     unsigned long flags;
@@ -708,7 +662,6 @@ int iommu_disable_translation(struct iommu *iommu)
         cpu_relax();
     }
     spin_unlock_irqrestore(&iommu->register_lock, flags);
-    return 0;
 }
 
 static struct iommu *vector_to_iommu[NR_VECTORS];
@@ -736,22 +689,22 @@ static void iommu_fault_status(u32 fault_status)
     if ( fault_status & DMA_FSTS_PFO )
         dprintk(XENLOG_ERR VTDPREFIX,
             "iommu_fault_status: Fault Overflow\n");
-    else if ( fault_status & DMA_FSTS_PPF )
+    if ( fault_status & DMA_FSTS_PPF )
         dprintk(XENLOG_ERR VTDPREFIX,
             "iommu_fault_status: Primary Pending Fault\n");
-    else if ( fault_status & DMA_FSTS_AFO )
+    if ( fault_status & DMA_FSTS_AFO )
         dprintk(XENLOG_ERR VTDPREFIX,
             "iommu_fault_status: Advanced Fault Overflow\n");
-    else if ( fault_status & DMA_FSTS_APF )
+    if ( fault_status & DMA_FSTS_APF )
         dprintk(XENLOG_ERR VTDPREFIX,
             "iommu_fault_status: Advanced Pending Fault\n");
-    else if ( fault_status & DMA_FSTS_IQE )
+    if ( fault_status & DMA_FSTS_IQE )
         dprintk(XENLOG_ERR VTDPREFIX,
             "iommu_fault_status: Invalidation Queue Error\n");
-    else if ( fault_status & DMA_FSTS_ICE )
+    if ( fault_status & DMA_FSTS_ICE )
         dprintk(XENLOG_ERR VTDPREFIX,
             "iommu_fault_status: Invalidation Completion Error\n");
-    else if ( fault_status & DMA_FSTS_ITE )
+    if ( fault_status & DMA_FSTS_ITE )
         dprintk(XENLOG_ERR VTDPREFIX,
             "iommu_fault_status: Invalidation Time-out Error\n");
 }
@@ -768,18 +721,17 @@ static void iommu_page_fault(int vector, void *dev_id,
     dprintk(XENLOG_WARNING VTDPREFIX,
             "iommu_page_fault: iommu->reg = %p\n", iommu->reg);
 
-    spin_lock_irqsave(&iommu->register_lock, flags);
     fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
-    spin_unlock_irqrestore(&iommu->register_lock, flags);
 
     iommu_fault_status(fault_status);
 
     /* FIXME: ignore advanced fault log */
     if ( !(fault_status & DMA_FSTS_PPF) )
-        return;
+        goto clear_overflow;
+
     fault_index = dma_fsts_fault_record_index(fault_status);
     reg = cap_fault_reg_offset(iommu->cap);
-    for ( ; ; )
+    while (1)
     {
         u8 fault_reason;
         u16 source_id;
@@ -819,8 +771,9 @@ static void iommu_page_fault(int vector, void *dev_id,
         if ( fault_index > cap_num_fault_regs(iommu->cap) )
             fault_index = 0;
     }
-
+clear_overflow:
     /* clear primary fault overflow */
+    fault_status = readl(iommu->reg + DMAR_FSTS_REG);
     if ( fault_status & DMA_FSTS_PFO )
     {
         spin_lock_irqsave(&iommu->register_lock, flags);
@@ -911,27 +864,32 @@ static struct hw_interrupt_type dma_msi_type = {
     .set_affinity = dma_msi_set_affinity,
 };
 
-int iommu_set_interrupt(struct iommu *iommu)
+static int iommu_set_interrupt(struct iommu *iommu)
 {
     int vector, ret;
 
-    vector = assign_irq_vector(AUTO_ASSIGN);
-    vector_to_iommu[vector] = iommu;
-
-    /* VT-d fault is a MSI, make irq == vector */
-    irq_vector[vector] = vector;
-    vector_irq[vector] = vector;
-
-    if ( !vector )
+    vector = assign_irq_vector(AUTO_ASSIGN_IRQ);
+    if ( vector <= 0 )
     {
         gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no vectors\n");
         return -EINVAL;
     }
 
     irq_desc[vector].handler = &dma_msi_type;
-    ret = request_irq(vector, iommu_page_fault, 0, "dmar", iommu);
+    vector_to_iommu[vector] = iommu;
+    ret = request_irq_vector(vector, iommu_page_fault, 0, "dmar", iommu);
     if ( ret )
+    {
+        irq_desc[vector].handler = &no_irq_type;
+        vector_to_iommu[vector] = NULL;
+        free_irq_vector(vector);
         gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
+        return ret;
+    }
+
+    /* Make sure that vector is never re-used. */
+    vector_irq[vector] = NEVER_ASSIGN_IRQ;
+
     return vector;
 }
 
@@ -1007,7 +965,7 @@ static void iommu_free(struct acpi_drhd_unit *drhd)
         iounmap(iommu->reg);
 
     free_intel_iommu(iommu->intel);
-    free_irq(iommu->vector);
+    release_irq_vector(iommu->vector);
     xfree(iommu);
 
     drhd->iommu = NULL;
@@ -1024,7 +982,6 @@ static int intel_iommu_domain_init(struct domain *d)
 {
     struct hvm_iommu *hd = domain_hvm_iommu(d);
     struct iommu *iommu = NULL;
-    u64 i;
     struct acpi_drhd_unit *drhd;
 
     drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
@@ -1034,21 +991,8 @@ static int intel_iommu_domain_init(struct domain *d)
 
     if ( d->domain_id == 0 )
     {
-        extern int xen_in_range(paddr_t start, paddr_t end);
-        extern int tboot_in_range(paddr_t start, paddr_t end);
-
-        /* 
-         * Set up 1:1 page table for dom0 except the critical segments
-         * like Xen and tboot.
-         */
-        for ( i = 0; i < max_page; i++ )
-        {
-            if ( xen_in_range(i << PAGE_SHIFT_4K, (i + 1) << PAGE_SHIFT_4K) ||
-                 tboot_in_range(i << PAGE_SHIFT_4K, (i + 1) << PAGE_SHIFT_4K) )
-                continue;
-
-            iommu_map_page(d, i, i);
-        }
+        /* Set up 1:1 page table for dom0 */
+        iommu_set_dom0_mapping(d);
 
         setup_dom0_devices(d);
         setup_dom0_rmrr(d);
@@ -1058,8 +1002,7 @@ static int intel_iommu_domain_init(struct domain *d)
         for_each_drhd_unit ( drhd )
         {
             iommu = drhd->iommu;
-            if ( iommu_enable_translation(iommu) )
-                return -EIO;
+            iommu_enable_translation(iommu);
         }
     }
 
@@ -1073,29 +1016,39 @@ static int domain_context_mapping_one(
 {
     struct hvm_iommu *hd = domain_hvm_iommu(domain);
     struct context_entry *context, *context_entries;
-    unsigned long flags;
     u64 maddr, pgd_maddr;
+    struct pci_dev *pdev = NULL;
     int agaw;
 
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    spin_lock(&iommu->lock);
     maddr = bus_to_context_maddr(iommu, bus);
     context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
     context = &context_entries[devfn];
 
     if ( context_present(*context) )
     {
+        int res = 0;
+
+        pdev = pci_get_pdev(bus, devfn);
+        if (!pdev)
+            res = -ENODEV;
+        else if (pdev->domain != domain)
+            res = -EINVAL;
         unmap_vtd_domain_page(context_entries);
-        return 0;
+        spin_unlock(&iommu->lock);
+        return res;
     }
 
-    spin_lock_irqsave(&iommu->lock, flags);
-    if ( iommu_passthrough &&
-         ecap_pass_thru(iommu->ecap) && (domain->domain_id == 0) )
+    if ( iommu_passthrough && (domain->domain_id == 0) )
     {
         context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
         agaw = level_to_agaw(iommu->nr_pt_levels);
     }
     else
     {
+        spin_lock(&hd->mapping_lock);
+
         /* Ensure we have pagetables allocated down to leaf PTE. */
         if ( hd->pgd_maddr == 0 )
         {
@@ -1103,8 +1056,9 @@ static int domain_context_mapping_one(
             if ( hd->pgd_maddr == 0 )
             {
             nomem:
+                spin_unlock(&hd->mapping_lock);
+                spin_unlock(&iommu->lock);
                 unmap_vtd_domain_page(context_entries);
-                spin_unlock_irqrestore(&iommu->lock, flags);
                 return -ENOMEM;
             }
         }
@@ -1124,6 +1078,7 @@ static int domain_context_mapping_one(
 
         context_set_address_root(*context, pgd_maddr);
         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
+        spin_unlock(&hd->mapping_lock);
     }
 
     /*
@@ -1135,17 +1090,18 @@ static int domain_context_mapping_one(
     context_set_fault_enable(*context);
     context_set_present(*context);
     iommu_flush_cache_entry(context);
-
-    unmap_vtd_domain_page(context_entries);
+    spin_unlock(&iommu->lock);
 
     /* Context entry was previously non-present (with domid 0). */
-    iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
-                               DMA_CCMD_MASK_NOBIT, 1);
-    if ( iommu_flush_iotlb_dsi(iommu, 0, 1) )
+    if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn,
+                                    DMA_CCMD_MASK_NOBIT, 1) )
         iommu_flush_write_buffer(iommu);
+    else
+        iommu_flush_iotlb_dsi(iommu, 0, 1);
 
     set_bit(iommu->index, &hd->iommu_bitmap);
-    spin_unlock_irqrestore(&iommu->lock, flags);
+
+    unmap_vtd_domain_page(context_entries);
 
     return 0;
 }
@@ -1155,8 +1111,8 @@ static int domain_context_mapping_one(
 
 enum {
     DEV_TYPE_PCIe_ENDPOINT,
-    DEV_TYPE_PCIe_BRIDGE,
-    DEV_TYPE_PCI_BRIDGE,
+    DEV_TYPE_PCIe_BRIDGE,    // PCIe root port, switch
+    DEV_TYPE_PCI_BRIDGE,     // PCIe-to-PCI/PCIx bridge, PCI-to-PCI bridge
     DEV_TYPE_PCI,
 };
 
@@ -1170,7 +1126,8 @@ int pdev_type(u8 bus, u8 devfn)
     class_device = pci_conf_read16(bus, d, f, PCI_CLASS_DEVICE);
     if ( class_device == PCI_CLASS_BRIDGE_PCI )
     {
-        pos = pci_find_next_cap(bus, devfn, PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP);
+        pos = pci_find_next_cap(bus, devfn,
+                                PCI_CAPABILITY_LIST, PCI_CAP_ID_EXP);
         if ( !pos )
             return DEV_TYPE_PCI_BRIDGE;
         creg = pci_conf_read16(bus, d, f, pos + PCI_EXP_FLAGS);
@@ -1189,17 +1146,15 @@ int pdev_type(u8 bus, u8 devfn)
 }
 
 #define MAX_BUSES 256
+static DEFINE_SPINLOCK(bus2bridge_lock);
 static struct { u8 map, bus, devfn; } bus2bridge[MAX_BUSES];
 
-static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
+static int _find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
 {
     int cnt = 0;
     *secbus = *bus;
 
-    if ( *bus == 0 )
-        /* assume integrated PCI devices in RC have valid requester-id */
-        return 1;
-
+    ASSERT(spin_is_locked(&bus2bridge_lock));
     if ( !bus2bridge[*bus].map )
         return 0;
 
@@ -1215,38 +1170,58 @@ static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
     return 1;
 }
 
+static int find_pcie_endpoint(u8 *bus, u8 *devfn, u8 *secbus)
+{
+    int ret = 0;
+
+    if ( *bus == 0 )
+        /* assume integrated PCI devices in RC have valid requester-id */
+        return 1;
+
+    spin_lock(&bus2bridge_lock);
+    ret = _find_pcie_endpoint(bus, devfn, secbus);
+    spin_unlock(&bus2bridge_lock);
+
+    return ret;
+}
+
 static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
 {
     struct acpi_drhd_unit *drhd;
     int ret = 0;
-    u16 sec_bus, sub_bus, ob, odf;
+    u16 sec_bus, sub_bus;
     u32 type;
-    u8 secbus;
+    u8 secbus, secdevfn;
+    struct pci_dev *pdev = pci_get_pdev(bus, devfn);
+
+    BUG_ON(!pdev);
 
-    drhd = acpi_find_matched_drhd_unit(bus, devfn);
+    drhd = acpi_find_matched_drhd_unit(pdev);
     if ( !drhd )
         return -ENODEV;
 
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
     type = pdev_type(bus, devfn);
     switch ( type )
     {
     case DEV_TYPE_PCIe_BRIDGE:
+        break;
+
     case DEV_TYPE_PCI_BRIDGE:
         sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
                                  PCI_SECONDARY_BUS);
         sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
                                  PCI_SUBORDINATE_BUS);
-        /*dmar_scope_add_buses(&drhd->scope, sec_bus, sub_bus);*/
-
-        if ( type == DEV_TYPE_PCIe_BRIDGE )
-            break;
 
+        spin_lock(&bus2bridge_lock);
         for ( sub_bus &= 0xff; sec_bus <= sub_bus; sec_bus++ )
         {
             bus2bridge[sec_bus].map = 1;
             bus2bridge[sec_bus].bus =  bus;
             bus2bridge[sec_bus].devfn =  devfn;
         }
+        spin_unlock(&bus2bridge_lock);
         break;
 
     case DEV_TYPE_PCIe_ENDPOINT:
@@ -1258,26 +1233,28 @@ static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
 
     case DEV_TYPE_PCI:
         gdprintk(XENLOG_INFO VTDPREFIX,
-                 "domain_context_mapping:PCI:  bdf = %x:%x.%x\n",
+                 "domain_context_mapping:PCI: bdf = %x:%x.%x\n",
                  bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
 
-        ob = bus; odf = devfn;
-        if ( !find_pcie_endpoint(&bus, &devfn, &secbus) )
+        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
+        if ( ret )
+           break;
+
+        secbus = bus;
+        secdevfn = devfn;
+        /* dependent devices mapping */
+        while ( bus2bridge[bus].map )
         {
-            gdprintk(XENLOG_WARNING VTDPREFIX,
-                     "domain_context_mapping:invalid\n");
-            break;
+            secbus = bus;
+            secdevfn = devfn;
+            devfn = bus2bridge[bus].devfn;
+            bus = bus2bridge[bus].bus;
+            ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
+            if ( ret )
+                return ret;
         }
 
-        if ( ob != bus || odf != devfn )
-            gdprintk(XENLOG_INFO VTDPREFIX,
-                     "domain_context_mapping:map:  "
-                     "bdf = %x:%x.%x -> %x:%x.%x\n",
-                     ob, PCI_SLOT(odf), PCI_FUNC(odf),
-                     bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
-
-        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn);
-        if ( secbus != bus )
+        if ( (secbus != bus) && (secdevfn != 0) )
             /*
              * The source-id for transactions on non-PCIe buses seem
              * to originate from devfn=0 on the secondary bus behind
@@ -1285,7 +1262,7 @@ static int domain_context_mapping(struct domain *domain, u8 bus, u8 devfn)
              * these scanarios is not particularly well documented
              * anywhere.
              */
-            domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
+            ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0);
         break;
 
     default:
@@ -1305,27 +1282,35 @@ static int domain_context_unmap_one(
     u8 bus, u8 devfn)
 {
     struct context_entry *context, *context_entries;
-    unsigned long flags;
     u64 maddr;
 
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    spin_lock(&iommu->lock);
+
     maddr = bus_to_context_maddr(iommu, bus);
     context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
     context = &context_entries[devfn];
 
     if ( !context_present(*context) )
     {
+        spin_unlock(&iommu->lock);
         unmap_vtd_domain_page(context_entries);
         return 0;
     }
 
-    spin_lock_irqsave(&iommu->lock, flags);
     context_clear_present(*context);
     context_clear_entry(*context);
     iommu_flush_cache_entry(context);
-    iommu_flush_context_domain(iommu, domain_iommu_domid(domain), 0);
-    iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0);
+
+    if ( iommu_flush_context_device(iommu, domain_iommu_domid(domain),
+                                    (((u16)bus) << 8) | devfn,
+                                    DMA_CCMD_MASK_NOBIT, 0) )
+        iommu_flush_write_buffer(iommu);
+    else
+        iommu_flush_iotlb_dsi(iommu, domain_iommu_domid(domain), 0);
+
+    spin_unlock(&iommu->lock);
     unmap_vtd_domain_page(context_entries);
-    spin_unlock_irqrestore(&iommu->lock, flags);
 
     return 0;
 }
@@ -1333,12 +1318,14 @@ static int domain_context_unmap_one(
 static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
 {
     struct acpi_drhd_unit *drhd;
-    u16 sec_bus, sub_bus;
     int ret = 0;
     u32 type;
-    u8 secbus;
+    u8 secbus, secdevfn;
+    struct pci_dev *pdev = pci_get_pdev(bus, devfn);
+
+    BUG_ON(!pdev);
 
-    drhd = acpi_find_matched_drhd_unit(bus, devfn);
+    drhd = acpi_find_matched_drhd_unit(pdev);
     if ( !drhd )
         return -ENODEV;
 
@@ -1347,24 +1334,39 @@ static int domain_context_unmap(struct domain *domain, u8 bus, u8 devfn)
     {
     case DEV_TYPE_PCIe_BRIDGE:
     case DEV_TYPE_PCI_BRIDGE:
-        sec_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
-                                 PCI_SECONDARY_BUS);
-        sub_bus = pci_conf_read8(bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
-                                 PCI_SUBORDINATE_BUS);
-        /*dmar_scope_remove_buses(&drhd->scope, sec_bus, sub_bus);*/
-        if ( DEV_TYPE_PCI_BRIDGE )
-            ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
         break;
 
     case DEV_TYPE_PCIe_ENDPOINT:
+        gdprintk(XENLOG_INFO VTDPREFIX,
+                 "domain_context_unmap:PCIe: bdf = %x:%x.%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
         ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
         break;
 
     case DEV_TYPE_PCI:
-        if ( find_pcie_endpoint(&bus, &devfn, &secbus) )
+        gdprintk(XENLOG_INFO VTDPREFIX,
+                 "domain_context_unmap:PCI: bdf = %x:%x.%x\n",
+                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+        ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
+        if ( ret )
+            break;
+
+        secbus = bus;
+        secdevfn = devfn;
+        /* dependent devices unmapping */
+        while ( bus2bridge[bus].map )
+        {
+            secbus = bus;
+            secdevfn = devfn;
+            devfn = bus2bridge[bus].devfn;
+            bus = bus2bridge[bus].bus;
             ret = domain_context_unmap_one(domain, drhd->iommu, bus, devfn);
-        if ( bus != secbus )
-            domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
+            if ( ret )
+                return ret;
+        }
+
+        if ( (secbus != bus) && (secdevfn != 0) )
+            ret = domain_context_unmap_one(domain, drhd->iommu, secbus, 0);
         break;
 
     default:
@@ -1389,10 +1391,13 @@ static int reassign_device_ownership(
     struct iommu *pdev_iommu;
     int ret, found = 0;
 
-    if ( !(pdev = pci_lock_domain_pdev(source, bus, devfn)) )
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev_by_domain(source, bus, devfn);
+
+    if (!pdev)
         return -ENODEV;
 
-    drhd = acpi_find_matched_drhd_unit(bus, devfn);
+    drhd = acpi_find_matched_drhd_unit(pdev);
     pdev_iommu = drhd->iommu;
     domain_context_unmap(source, bus, devfn);
 
@@ -1400,24 +1405,18 @@ static int reassign_device_ownership(
     if ( ret )
         return ret;
 
-    write_lock(&pcidevs_lock);
     list_move(&pdev->domain_list, &target->arch.pdev_list);
-    write_unlock(&pcidevs_lock);
     pdev->domain = target;
 
-    spin_unlock(&pdev->lock);
-
-    read_lock(&pcidevs_lock);
     for_each_pdev ( source, pdev )
     {
-        drhd = acpi_find_matched_drhd_unit(pdev->bus, pdev->devfn);
+        drhd = acpi_find_matched_drhd_unit(pdev);
         if ( drhd->iommu == pdev_iommu )
         {
             found = 1;
             break;
         }
     }
-    read_unlock(&pcidevs_lock);
 
     if ( !found )
         clear_bit(pdev_iommu->index, &source_hd->iommu_bitmap);
@@ -1432,20 +1431,12 @@ void iommu_domain_teardown(struct domain *d)
     if ( list_empty(&acpi_drhd_units) )
         return;
 
+    spin_lock(&hd->mapping_lock);
     iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
     hd->pgd_maddr = 0;
-    iommu_domid_release(d);
-}
-
-static int domain_context_mapped(u8 bus, u8 devfn)
-{
-    struct acpi_drhd_unit *drhd;
-
-    for_each_drhd_unit ( drhd )
-        if ( device_context_mapped(drhd->iommu, bus, devfn) )
-            return 1;
+    spin_unlock(&hd->mapping_lock);
 
-    return 0;
+    iommu_domid_release(d);
 }
 
 int intel_iommu_map_page(
@@ -1462,21 +1453,35 @@ int intel_iommu_map_page(
     iommu = drhd->iommu;
 
     /* do nothing if dom0 and iommu supports pass thru */
-    if ( iommu_passthrough &&
-         ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
+    if ( iommu_passthrough && (d->domain_id == 0) )
         return 0;
 
+    spin_lock(&hd->mapping_lock);
+
     pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
     if ( pg_maddr == 0 )
+    {
+        spin_unlock(&hd->mapping_lock);
         return -ENOMEM;
+    }
     page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
     pte = page + (gfn & LEVEL_MASK);
     pte_present = dma_pte_present(*pte);
     dma_set_pte_addr(*pte, (paddr_t)mfn << PAGE_SHIFT_4K);
     dma_set_pte_prot(*pte, DMA_PTE_READ | DMA_PTE_WRITE);
+
+    /* Set the SNP on leaf page table if Snoop Control available */
+    if ( iommu_snoop )
+        dma_set_pte_snp(*pte);
+
     iommu_flush_cache_entry(pte);
+    spin_unlock(&hd->mapping_lock);
     unmap_vtd_domain_page(page);
 
+    /*
+     * No need pcideves_lock here because we have flush
+     * when assign/deassign device
+     */
     for_each_drhd_unit ( drhd )
     {
         iommu = drhd->iommu;
@@ -1502,8 +1507,7 @@ int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
     iommu = drhd->iommu;
 
     /* do nothing if dom0 and iommu supports pass thru */
-    if ( iommu_passthrough &&
-         ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
+    if ( iommu_passthrough && (d->domain_id == 0) )
         return 0;
 
     dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
@@ -1511,78 +1515,29 @@ int intel_iommu_unmap_page(struct domain *d, unsigned long gfn)
     return 0;
 }
 
-int iommu_page_mapping(struct domain *domain, paddr_t iova,
-                       paddr_t hpa, size_t size, int prot)
-{
-    struct hvm_iommu *hd = domain_hvm_iommu(domain);
-    struct acpi_drhd_unit *drhd;
-    struct iommu *iommu;
-    u64 start_pfn, end_pfn;
-    struct dma_pte *page = NULL, *pte = NULL;
-    int index;
-    u64 pg_maddr;
-
-    if ( (prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0 )
-        return -EINVAL;
-
-    iova = (iova >> PAGE_SHIFT_4K) << PAGE_SHIFT_4K;
-    start_pfn = hpa >> PAGE_SHIFT_4K;
-    end_pfn = (PAGE_ALIGN_4K(hpa + size)) >> PAGE_SHIFT_4K;
-    index = 0;
-    while ( start_pfn < end_pfn )
-    {
-        pg_maddr = addr_to_dma_page_maddr(domain, iova + PAGE_SIZE_4K*index, 1);
-        if ( pg_maddr == 0 )
-            return -ENOMEM;
-        page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
-        pte = page + (start_pfn & LEVEL_MASK);
-        dma_set_pte_addr(*pte, (paddr_t)start_pfn << PAGE_SHIFT_4K);
-        dma_set_pte_prot(*pte, prot);
-        iommu_flush_cache_entry(pte);
-        unmap_vtd_domain_page(page);
-        start_pfn++;
-        index++;
-    }
-
-    if ( index > 0 )
-    {
-        for_each_drhd_unit ( drhd )
-        {
-            iommu = drhd->iommu;
-            if ( test_bit(iommu->index, &hd->iommu_bitmap) )
-                if ( iommu_flush_iotlb_psi(iommu, domain_iommu_domid(domain),
-                                           iova, index, 1))
-                    iommu_flush_write_buffer(iommu);
-        }
-    }
-
-    return 0;
-}
-
-int iommu_page_unmapping(struct domain *domain, paddr_t addr, size_t size)
-{
-    dma_pte_clear_range(domain, addr, addr + size);
-
-    return 0;
-}
-
 static int iommu_prepare_rmrr_dev(struct domain *d,
                                   struct acpi_rmrr_unit *rmrr,
                                   u8 bus, u8 devfn)
 {
-    u64 size;
-    int ret;
-
-    /* page table init */
-    size = rmrr->end_address - rmrr->base_address + 1;
-    ret = iommu_page_mapping(d, rmrr->base_address,
-                             rmrr->base_address, size,
-                             DMA_PTE_READ|DMA_PTE_WRITE);
-    if ( ret )
-        return ret;
+    int ret = 0;
+    u64 base, end;
+    unsigned long base_pfn, end_pfn;
+
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    ASSERT(rmrr->base_address < rmrr->end_address);
+    
+    base = rmrr->base_address & PAGE_MASK_4K;
+    base_pfn = base >> PAGE_SHIFT_4K;
+    end = PAGE_ALIGN_4K(rmrr->end_address);
+    end_pfn = end >> PAGE_SHIFT_4K;
+
+    while ( base_pfn < end_pfn )
+    {
+        intel_iommu_map_page(d, base_pfn, base_pfn);
+        base_pfn++;
+    }
 
-    if ( domain_context_mapped(bus, devfn) == 0 )
-        ret = domain_context_mapping(d, bus, devfn);
+    ret = domain_context_mapping(d, bus, devfn);
 
     return ret;
 }
@@ -1593,6 +1548,8 @@ static int intel_iommu_add_device(struct pci_dev *pdev)
     u16 bdf;
     int ret, i;
 
+    ASSERT(spin_is_locked(&pcidevs_lock));
+
     if ( !pdev->domain )
         return -EINVAL;
 
@@ -1654,7 +1611,7 @@ static void setup_dom0_devices(struct domain *d)
 
     hd = domain_hvm_iommu(d);
 
-    write_lock(&pcidevs_lock);
+    spin_lock(&pcidevs_lock);
     for ( bus = 0; bus < 256; bus++ )
     {
         for ( dev = 0; dev < 32; dev++ )
@@ -1674,7 +1631,7 @@ static void setup_dom0_devices(struct domain *d)
             }
         }
     }
-    write_unlock(&pcidevs_lock);
+    spin_unlock(&pcidevs_lock);
 }
 
 void clear_fault_bits(struct iommu *iommu)
@@ -1710,6 +1667,11 @@ static int init_vtd_hw(void)
         }
 
         vector = iommu_set_interrupt(iommu);
+        if ( vector < 0 )
+        {
+            gdprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
+            return vector;
+        }
         dma_msi_data_init(iommu, vector);
         dma_msi_addr_init(iommu, cpu_physical_id(first_cpu(cpu_online_map)));
         iommu->vector = vector;
@@ -1722,20 +1684,32 @@ static int init_vtd_hw(void)
         flush->iotlb = flush_iotlb_reg;
     }
 
-    for_each_drhd_unit ( drhd )
+    if ( iommu_qinval )
     {
-        iommu = drhd->iommu;
-        if ( qinval_setup(iommu) != 0 )
-            dprintk(XENLOG_INFO VTDPREFIX,
-                    "Queued Invalidation hardware not found\n");
+        for_each_drhd_unit ( drhd )
+        {
+            iommu = drhd->iommu;
+            if ( enable_qinval(iommu) != 0 )
+            {
+                dprintk(XENLOG_INFO VTDPREFIX,
+                        "Failed to enable Queued Invalidation!\n");
+                break;
+            }
+        }
     }
 
-    for_each_drhd_unit ( drhd )
+    if ( iommu_intremap )
     {
-        iommu = drhd->iommu;
-        if ( intremap_setup(iommu) != 0 )
-            dprintk(XENLOG_INFO VTDPREFIX,
-                    "Interrupt Remapping hardware not found\n");
+        for_each_drhd_unit ( drhd )
+        {
+            iommu = drhd->iommu;
+            if ( enable_intremap(iommu) != 0 )
+            {
+                dprintk(XENLOG_INFO VTDPREFIX,
+                        "Failed to enable Interrupt Remapping!\n");
+                break;
+            }
+        }
     }
 
     return 0;
@@ -1747,6 +1721,7 @@ static void setup_dom0_rmrr(struct domain *d)
     u16 bdf;
     int ret, i;
 
+    spin_lock(&pcidevs_lock);
     for_each_rmrr_device ( rmrr, bdf, i )
     {
         ret = iommu_prepare_rmrr_dev(d, rmrr, PCI_BUS(bdf), PCI_DEVFN2(bdf));
@@ -1754,6 +1729,20 @@ static void setup_dom0_rmrr(struct domain *d)
             gdprintk(XENLOG_ERR VTDPREFIX,
                      "IOMMU: mapping reserved region failed\n");
     }
+    spin_unlock(&pcidevs_lock);
+}
+
+static void platform_quirks(void)
+{
+    u32 id;
+
+    /* Mobile 4 Series Chipset neglects to set RWBF capability. */
+    id = pci_conf_read32(0, 0, 0, 0);
+    if ( id == 0x2a408086 )
+    {
+        dprintk(XENLOG_INFO VTDPREFIX, "DMAR: Forcing write-buffer flush\n");
+        rwbf_quirk = 1;
+    }
 }
 
 int intel_vtd_setup(void)
@@ -1764,13 +1753,49 @@ int intel_vtd_setup(void)
     if ( !vtd_enabled )
         return -ENODEV;
 
+    platform_quirks();
+
     spin_lock_init(&domid_bitmap_lock);
     clflush_size = get_cache_line_size();
 
+    /* We enable the following features only if they are supported by all VT-d
+     * engines: Snoop Control, DMA passthrough, Queued Invalidation and
+     * Interrupt Remapping.
+     */
     for_each_drhd_unit ( drhd )
+    {
         if ( iommu_alloc(drhd) != 0 )
             goto error;
 
+        iommu = drhd->iommu;
+
+        if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
+            iommu_snoop = 0;
+
+        if ( iommu_passthrough && !ecap_pass_thru(iommu->ecap) )
+            iommu_passthrough = 0;
+
+        if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
+            iommu_qinval = 0;
+
+        if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
+            iommu_intremap = 0;
+    }
+
+    if ( !iommu_qinval && iommu_intremap )
+    {
+        iommu_intremap = 0;
+        gdprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
+            "since Queued Invalidation isn't supported or enabled.\n");
+    }
+
+#define P(p,s) printk("Intel VT-d %s %ssupported.\n", s, (p)? "" : "not ")
+    P(iommu_snoop, "Snoop Control");
+    P(iommu_passthrough, "DMA Passthrough");
+    P(iommu_qinval, "Queued Invalidation");
+    P(iommu_intremap, "Interrupt Remapping");
+#undef P
+
     /* Allocate IO page directory page for the domain. */
     drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
     iommu = drhd->iommu;
@@ -1795,6 +1820,10 @@ int intel_vtd_setup(void)
     for_each_drhd_unit ( drhd )
         iommu_free(drhd);
     vtd_enabled = 0;
+    iommu_snoop = 0;
+    iommu_passthrough = 0;
+    iommu_qinval = 0;
+    iommu_intremap = 0;
     return -ENOMEM;
 }
 
@@ -1806,27 +1835,43 @@ int device_assigned(u8 bus, u8 devfn)
 {
     struct pci_dev *pdev;
 
-    if ( (pdev = pci_lock_domain_pdev(dom0, bus, devfn)) )
+    spin_lock(&pcidevs_lock);
+    pdev = pci_get_pdev_by_domain(dom0, bus, devfn);
+    if (!pdev)
     {
-        spin_unlock(&pdev->lock);
-        return 0;
+        spin_unlock(&pcidevs_lock);
+        return -1;
     }
 
-    return 1;
+    spin_unlock(&pcidevs_lock);
+    return 0;
 }
 
 int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
 {
     struct acpi_rmrr_unit *rmrr;
     int ret = 0, i;
+    struct pci_dev *pdev;
     u16 bdf;
 
     if ( list_empty(&acpi_drhd_units) )
         return -ENODEV;
 
+    ASSERT(spin_is_locked(&pcidevs_lock));
+    pdev = pci_get_pdev(bus, devfn);
+    if (!pdev)
+        return -ENODEV;
+
+    if (pdev->domain != dom0)
+    {
+        gdprintk(XENLOG_ERR VTDPREFIX,
+                "IOMMU: assign a assigned device\n");
+       return -EBUSY;
+    }
+
     ret = reassign_device_ownership(dom0, d, bus, devfn);
     if ( ret )
-        return ret;
+        goto done;
 
     /* Setup rmrr identity mapping */
     for_each_rmrr_device( rmrr, bdf, i )
@@ -1837,16 +1882,20 @@ int intel_iommu_assign_device(struct domain *d, u8 bus, u8 devfn)
              * ignore USB RMRR temporarily.
              */
             if ( is_usb_device(bus, devfn) )
-                return 0;
+            {
+                ret = 0;
+                goto done;
+            }
 
             ret = iommu_prepare_rmrr_dev(d, rmrr, bus, devfn);
             if ( ret )
                 gdprintk(XENLOG_ERR VTDPREFIX,
                          "IOMMU: mapping reserved region failed\n");
-            return ret;
+            goto done; 
         }
     }
 
+done:
     return ret;
 }
 
@@ -1860,14 +1909,14 @@ static int intel_iommu_group_id(u8 bus, u8 devfn)
 }
 
 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
-int iommu_suspend(void)
+void iommu_suspend(void)
 {
     struct acpi_drhd_unit *drhd;
     struct iommu *iommu;
     u32    i;
 
     if ( !vtd_enabled )
-        return 0;
+        return;
 
     iommu_flush_all();
 
@@ -1884,21 +1933,31 @@ int iommu_suspend(void)
             (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
         iommu_state[i][DMAR_FEUADDR_REG] =
             (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
-    }
 
-    return 0;
+        iommu_disable_translation(iommu);
+
+        if ( iommu_intremap )
+            disable_intremap(iommu);
+
+        if ( iommu_qinval )
+            disable_qinval(iommu);
+    }
 }
 
-int iommu_resume(void)
+void iommu_resume(void)
 {
     struct acpi_drhd_unit *drhd;
     struct iommu *iommu;
     u32 i;
 
     if ( !vtd_enabled )
-        return 0;
+        return;
 
-    iommu_flush_all();
+    /* It is not clear whether the IOMMU specification requires this flush
+     * operation. However, the BIOS also executes during S3 resume and may
+     * touch the IOMMU again, so perform the flush operation for safety.
+     */
+    flush_all_cache();
 
     if ( init_vtd_hw() != 0  && force_iommu )
          panic("IOMMU setup failed, crash Xen for security purpose!\n");
@@ -1917,11 +1976,8 @@ int iommu_resume(void)
         dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
                     (u32) iommu_state[i][DMAR_FEUADDR_REG]);
 
-        if ( iommu_enable_translation(iommu) )
-            return -EIO;
+        iommu_enable_translation(iommu);
     }
-
-    return 0;
 }
 
 struct iommu_ops intel_iommu_ops = {
index dd17a6b642c14a09f1d879f6545bed4d0faf5cb0..fd8b9e124df5d760292973ab117436ec4adba5ed 100644 (file)
 #define ecap_ext_intr(e)         ((e >> 4) & 0x1)
 #define ecap_cache_hints(e)      ((e >> 5) & 0x1)
 #define ecap_pass_thru(e)        ((e >> 6) & 0x1)
+#define ecap_snp_ctl(e)          ((e >> 7) & 0x1)
 
 /* IOTLB_REG */
 #define DMA_TLB_FLUSH_GRANU_OFFSET  60
@@ -260,10 +261,12 @@ struct dma_pte {
 };
 #define DMA_PTE_READ (1)
 #define DMA_PTE_WRITE (2)
+#define DMA_PTE_SNP  (1 << 11)
 #define dma_clear_pte(p)    do {(p).val = 0;} while(0)
 #define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while(0)
 #define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while(0)
 #define dma_set_pte_superpage(p) do {(p).val |= (1 << 7);} while(0)
+#define dma_set_pte_snp(p)  do {(p).val |= DMA_PTE_SNP;} while(0)
 #define dma_set_pte_prot(p, prot) \
             do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
 #define dma_pte_addr(p) ((p).val & PAGE_MASK_4K)
@@ -309,6 +312,10 @@ struct iremap_entry {
 /* queue invalidation entry */
 struct qinval_entry {
     union {
+        struct {
+            u64 lo;
+            u64 hi;
+        }val;
         struct {
             struct {
                 u64 type    : 4,
@@ -390,7 +397,9 @@ struct poll_info {
     u32 udata;
 };
 
-#define QINVAL_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct qinval_entry))
+#define NUM_QINVAL_PAGES 1
+#define IQA_REG_QS       0    // derived from NUM_QINVAL_PAGES per VT-d spec.
+#define QINVAL_ENTRY_NR (PAGE_SIZE_4K*NUM_QINVAL_PAGES/sizeof(struct qinval_entry))
 #define qinval_present(v) ((v).lo & 1)
 #define qinval_fault_disable(v) (((v).lo >> 1) & 1)
 
index bc015f2c468fd6f737a992188fcffe337706488f..e82337045f2f6d6e06c022815d94d6207651c474 100644 (file)
@@ -34,13 +34,13 @@ static void print_qi_regs(struct iommu *iommu)
     u64 val;
 
     val = dmar_readq(iommu->reg, DMAR_IQA_REG);
-    printk("DMAR_IAQ_REG = %"PRIx64"\n", val);
+    printk("DMAR_IQA_REG = %"PRIx64"\n", val);
 
     val = dmar_readq(iommu->reg, DMAR_IQH_REG);
-    printk("DMAR_IAH_REG = %"PRIx64"\n", val);
+    printk("DMAR_IQH_REG = %"PRIx64"\n", val);
 
     val = dmar_readq(iommu->reg, DMAR_IQT_REG);
-    printk("DMAR_IAT_REG = %"PRIx64"\n", val);
+    printk("DMAR_IQT_REG = %"PRIx64"\n", val);
 }
 
 static int qinval_next_index(struct iommu *iommu)
@@ -252,14 +252,15 @@ static int gen_dev_iotlb_inv_dsc(struct iommu *iommu, int index,
     qinval_entry->q.dev_iotlb_inv_dsc.lo.res_3 = 0;
 
     qinval_entry->q.dev_iotlb_inv_dsc.hi.size = size;
-    qinval_entry->q.dev_iotlb_inv_dsc.hi.addr = addr;
+    qinval_entry->q.dev_iotlb_inv_dsc.hi.res_1 = 0;
+    qinval_entry->q.dev_iotlb_inv_dsc.hi.addr = addr >> PAGE_SHIFT_4K;
 
     unmap_vtd_domain_page(qinval_entries);
     spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags);
     return 0;
 }
 
-int queue_invalidate_device_iotlb(struct iommu *iommu,
+int qinval_device_iotlb(struct iommu *iommu,
     u32 max_invs_pend, u16 sid, u16 size, u64 addr)
 {
     int ret = -1;
@@ -316,7 +317,6 @@ int queue_invalidate_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx)
     return ret;
 }
 
-u64 iec_cap;
 int __iommu_flush_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx)
 {
     int ret;
@@ -327,7 +327,7 @@ int __iommu_flush_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx)
      * reading vt-d architecture register will ensure
      * draining happens in implementation independent way.
      */
-    iec_cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
+    (void)dmar_readq(iommu->reg, DMAR_CAP_REG);
     return ret;
 }
 
@@ -412,7 +412,7 @@ static int flush_iotlb_qi(
     return ret;
 }
 
-int qinval_setup(struct iommu *iommu)
+int enable_qinval(struct iommu *iommu)
 {
     s_time_t start_time;
     struct qi_ctrl *qi_ctrl;
@@ -421,14 +421,17 @@ int qinval_setup(struct iommu *iommu)
     qi_ctrl = iommu_qi_ctrl(iommu);
     flush = iommu_get_flush(iommu);
 
-    if ( !ecap_queued_inval(iommu->ecap) )
-        return -ENODEV;
+    ASSERT(ecap_queued_inval(iommu->ecap) && iommu_qinval);
 
     if ( qi_ctrl->qinval_maddr == 0 )
     {
-        qi_ctrl->qinval_maddr = alloc_pgtable_maddr();
+        qi_ctrl->qinval_maddr = alloc_pgtable_maddr(NULL, NUM_QINVAL_PAGES);
         if ( qi_ctrl->qinval_maddr == 0 )
-            panic("Cannot allocate memory for qi_ctrl->qinval_maddr\n");
+        {
+            dprintk(XENLOG_WARNING VTDPREFIX,
+                    "Cannot allocate memory for qi_ctrl->qinval_maddr\n");
+            return -ENOMEM;
+        }
         flush->context = flush_context_qi;
         flush->iotlb = flush_iotlb_qi;
     }
@@ -440,8 +443,11 @@ int qinval_setup(struct iommu *iommu)
      * registers are automatically reset to 0 with write
      * to IQA register.
      */
+    qi_ctrl->qinval_maddr |= IQA_REG_QS;
     dmar_writeq(iommu->reg, DMAR_IQA_REG, qi_ctrl->qinval_maddr);
 
+    dmar_writeq(iommu->reg, DMAR_IQT_REG, 0);
+
     /* enable queued invalidation hardware */
     iommu->gcmd |= DMA_GCMD_QIE;
     dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
@@ -457,3 +463,22 @@ int qinval_setup(struct iommu *iommu)
 
     return 0;
 }
+
+void disable_qinval(struct iommu *iommu)
+{
+    s_time_t start_time;
+
+    ASSERT(ecap_queued_inval(iommu->ecap) && iommu_qinval);
+
+    iommu->gcmd &= ~DMA_GCMD_QIE;
+    dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd);
+
+    /* Make sure hardware complete it */
+    start_time = NOW();
+    while ( dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_QIES )
+    {
+        if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
+            panic("Cannot clear QIE field for queue invalidation\n");
+        cpu_relax();
+    }
+}
index 4404a1f1c1a372072fb4f66b970f56740c22c86c..c720030c0a1921f6d1e22257321fe55ed0ea6e42 100644 (file)
@@ -204,6 +204,7 @@ void print_vtd_entries(struct iommu *iommu, int bus, int devfn, u64 gmfn)
 
 void dump_iommu_info(unsigned char key)
 {
+#if defined(__i386__) || defined(__x86_64__)
     struct acpi_drhd_unit *drhd;
     struct iommu *iommu;
     int i;
@@ -305,6 +306,10 @@ void dump_iommu_info(unsigned char key)
             }
         }
     }
+#else
+    printk("%s: not implemnted on IA64 for now.\n", __func__);
+    /* ia64: TODO */
+#endif
 }
 
 /*
index 84cd2e5f8a4765cd93a5128deb261e0ff55d85af..d119117068983f1cae57bbbde38441981c5ad203 100644 (file)
@@ -101,12 +101,12 @@ unsigned int get_cache_line_size(void);
 void cacheline_flush(char *);
 void flush_all_cache(void);
 void *map_to_nocache_virt(int nr_iommus, u64 maddr);
-u64 alloc_pgtable_maddr(void);
+u64 alloc_pgtable_maddr(struct domain *d, unsigned long npages);
 void free_pgtable_maddr(u64 maddr);
 void *map_vtd_domain_page(u64 maddr);
 void unmap_vtd_domain_page(void *va);
 
 void iommu_flush_cache_entry(void *addr);
-void iommu_flush_cache_page(void *addr);
+void iommu_flush_cache_page(void *addr, unsigned long npages);
 
 #endif // _VTD_H_
index 2c931eb64c632047e0e404292673cffd6962e193..bf6c1ead0fc43a4b82950128c65f01d3a201d5c6 100644 (file)
 #include <xen/domain_page.h>
 #include <asm/paging.h>
 #include <xen/iommu.h>
+#include <xen/numa.h>
 #include "../iommu.h"
 #include "../dmar.h"
 #include "../vtd.h"
 
+/*
+ * iommu_inclusive_mapping: when set, all memory below 4GB is included in dom0
+ * 1:1 iommu mappings except xen and unusable regions.
+ */
+static int iommu_inclusive_mapping;
+boolean_param("iommu_inclusive_mapping", iommu_inclusive_mapping);
+
 void *map_vtd_domain_page(u64 maddr)
 {
     return map_domain_page(maddr >> PAGE_SHIFT_4K);
@@ -37,21 +45,24 @@ void unmap_vtd_domain_page(void *va)
 }
 
 /* Allocate page table, return its machine address */
-u64 alloc_pgtable_maddr(void)
+u64 alloc_pgtable_maddr(struct domain *d, unsigned long npages)
 {
     struct page_info *pg;
     u64 *vaddr;
+    unsigned long mfn;
 
-    pg = alloc_domheap_page(NULL, 0);
-    vaddr = map_domain_page(page_to_mfn(pg));
-    if ( !vaddr )
+    pg = alloc_domheap_pages(NULL, get_order_from_pages(npages),
+                             d ? MEMF_node(domain_to_node(d)) : 0);
+    if ( !pg )
         return 0;
-    memset(vaddr, 0, PAGE_SIZE);
+    mfn = page_to_mfn(pg);
+    vaddr = map_domain_page(mfn);
+    memset(vaddr, 0, PAGE_SIZE * npages);
 
-    iommu_flush_cache_page(vaddr);
+    iommu_flush_cache_page(vaddr, npages);
     unmap_domain_page(vaddr);
 
-    return page_to_maddr(pg);
+    return (u64)mfn << PAGE_SHIFT_4K;
 }
 
 void free_pgtable_maddr(u64 maddr)
@@ -119,9 +130,9 @@ void hvm_dpci_isairq_eoi(struct domain *d, unsigned int isairq)
         return;
     }
     /* Multiple mirq may be mapped to one isa irq */
-    for ( i = find_first_bit(dpci->mapping, NR_PIRQS);
-          i < NR_PIRQS;
-          i = find_next_bit(dpci->mapping, NR_PIRQS, i + 1) )
+    for ( i = find_first_bit(dpci->mapping, NR_IRQS);
+          i < NR_IRQS;
+          i = find_next_bit(dpci->mapping, NR_IRQS, i + 1) )
     {
         list_for_each_entry_safe ( digl, tmp,
             &dpci->mirq[i].digl_list, list )
@@ -139,3 +150,35 @@ void hvm_dpci_isairq_eoi(struct domain *d, unsigned int isairq)
     }
     spin_unlock(&d->event_lock);
 }
+
+void iommu_set_dom0_mapping(struct domain *d)
+{
+    u64 i, j, tmp, max_pfn;
+    extern int xen_in_range(paddr_t start, paddr_t end);
+
+    BUG_ON(d->domain_id != 0);
+
+    max_pfn = max_t(u64, max_page, 0x100000000ull >> PAGE_SHIFT);
+
+    for ( i = 0; i < max_pfn; i++ )
+    {
+        /*
+         * Set up the 1:1 mapping for dom0. By default, only conventional
+         * RAM areas are used, letting RMRRs include any needed reserved
+         * regions. When iommu_inclusive_mapping is set, everything below
+         * 4GB except unusable ranges is mapped in.
+         */
+        if ( !page_is_ram_type(i, RAM_TYPE_CONVENTIONAL) &&
+             (!iommu_inclusive_mapping ||
+              page_is_ram_type(i, RAM_TYPE_UNUSABLE)) )
+            continue;
+
+        /* Exclude Xen bits */
+        if ( xen_in_range(i << PAGE_SHIFT, (i + 1) << PAGE_SHIFT) )
+            continue;
+
+        tmp = 1 << (PAGE_SHIFT - PAGE_SHIFT_4K);
+        for ( j = 0; j < tmp; j++ )
+            iommu_map_page(d, (i*tmp+j), (i*tmp+j));
+    }
+}
index e36e358789d601e4df6048b6a1546803199af91a..bace5c625d776a3f122d36a9957b619129c88932 100644 (file)
@@ -146,10 +146,20 @@ void __init vesa_init(void)
     xfree(text_buf);
 }
 
-void __init vesa_endboot(void)
+void __init vesa_endboot(bool_t keep)
 {
-    xpos = 0;
-    vga_puts = vesa_scroll_puts;
+    if ( keep )
+    {
+        xpos = 0;
+        vga_puts = vesa_scroll_puts;
+    }
+    else
+    {
+        unsigned int i, bpp = (vlfb_info.bits_per_pixel + 7) >> 3;
+        for ( i = 0; i < vlfb_info.height; i++ )
+            memset(lfb + i * vlfb_info.bytes_per_line, 0,
+                   vlfb_info.width * bpp);
+    }
 }
 
 #if defined(CONFIG_X86)
index ef21178a011640220329622af7c3009cbd1e5660..e1828c1687190d839bf92add01afaa52d606d394 100644 (file)
@@ -57,10 +57,10 @@ static unsigned int columns, lines;
 
 #ifdef CONFIG_X86_64
 void vesa_early_init(void);
-void vesa_endboot(void);
+void vesa_endboot(bool_t keep);
 #else
 #define vesa_early_init() ((void)0)
-#define vesa_endboot(   ((void)0)
+#define vesa_endboot(x)   ((void)0)
 #endif
 
 void __init vga_init(void)
@@ -79,7 +79,7 @@ void __init vga_init(void)
     switch ( vga_console_info.video_type )
     {
     case XEN_VGATYPE_TEXT_MODE_3:
-        if ( memory_is_conventional_ram(0xB8000) ||
+        if ( page_is_ram_type(paddr_to_pfn(0xB8000), RAM_TYPE_CONVENTIONAL) ||
              ((video = ioremap(0xB8000, 0x8000)) == NULL) )
             return;
         outw(0x200a, 0x3d4); /* disable cursor */
@@ -105,10 +105,21 @@ void __init vga_endboot(void)
     printk("Xen is %s VGA console.\n",
            vgacon_keep ? "keeping" : "relinquishing");
 
-    vesa_endboot();
-
     if ( !vgacon_keep )
         vga_puts = vga_noop_puts;
+
+    switch ( vga_console_info.video_type )
+    {
+    case XEN_VGATYPE_TEXT_MODE_3:
+        if ( !vgacon_keep )
+            memset(video, 0, columns * lines * 2);
+        break;
+    case XEN_VGATYPE_VESA_LFB:
+        vesa_endboot(vgacon_keep);
+        break;
+    default:
+        BUG();
+    }
 }
 
 static void vga_text_puts(const char *s)
index 64ae8c92f0ba418bfb4f9c49df3d34c28bfd1bd9..84273715969420aefcc2d4f83735c1c58db74725 100644 (file)
@@ -51,7 +51,7 @@ compat/%.h: compat/%.i Makefile
        mv -f $@.new $@
 
 compat/%.i: compat/%.c Makefile
-       $(CPP) $(CFLAGS) $(cppflags-y) -o $@ $<
+       $(CPP) $(filter-out -M% .%.d,$(CFLAGS)) $(cppflags-y) -o $@ $<
 
 compat/%.c: public/%.h xlat.lst Makefile
        mkdir -p $(@D)
index 77824417eda533ae5cda736f9ecf9f4419705fdc..8423664efe17ab5392c5c055a524940f101c801f 100644 (file)
  * published by the Free Software Foundation.
  */
 
+#ifndef __XEN_CPUFREQ_PM_H__
+#define __XEN_CPUFREQ_PM_H__
+
 #include <xen/types.h>
 #include <xen/list.h>
 #include <xen/cpumask.h>
 
 #include "processor_perf.h"
 
-#define CPUFREQ_NAME_LEN 16
+DECLARE_PER_CPU(spinlock_t, cpufreq_statistic_lock);
 
 struct cpufreq_governor;
 
@@ -55,6 +58,8 @@ extern struct cpufreq_policy *cpufreq_cpu_policy[NR_CPUS];
 extern int __cpufreq_set_policy(struct cpufreq_policy *data,
                                 struct cpufreq_policy *policy);
 
+void cpufreq_cmdline_parse(char *);
+
 #define CPUFREQ_SHARED_TYPE_NONE (0) /* None */
 #define CPUFREQ_SHARED_TYPE_HW   (1) /* HW does needed coordination */
 #define CPUFREQ_SHARED_TYPE_ALL  (2) /* All dependent CPUs should set freq */
@@ -82,16 +87,29 @@ struct cpufreq_governor {
     char    name[CPUFREQ_NAME_LEN];
     int     (*governor)(struct cpufreq_policy *policy,
                         unsigned int event);
+    void    (*handle_option)(const char *name, const char *value);
+    struct list_head governor_list;
 };
 
+extern struct cpufreq_governor *cpufreq_opt_governor;
 extern struct cpufreq_governor cpufreq_gov_dbs;
-#define CPUFREQ_DEFAULT_GOVERNOR &cpufreq_gov_dbs
+extern struct cpufreq_governor cpufreq_gov_userspace;
+extern struct cpufreq_governor cpufreq_gov_performance;
+extern struct cpufreq_governor cpufreq_gov_powersave;
+
+extern int cpufreq_register_governor(struct cpufreq_governor *governor);
+extern int cpufreq_unregister_governor(struct cpufreq_governor *governor);
+extern struct cpufreq_governor *__find_governor(const char *governor);
+#define CPUFREQ_DEFAULT_GOVERNOR &cpufreq_gov_userspace
 
 /* pass a target to the cpufreq driver */
 extern int __cpufreq_driver_target(struct cpufreq_policy *policy,
                                    unsigned int target_freq,
                                    unsigned int relation);
-extern int __cpufreq_driver_getavg(struct cpufreq_policy *policy);
+
+#define GOV_GETAVG     1
+#define USR_GETAVG     2
+extern int cpufreq_driver_getavg(unsigned int cpu, unsigned int flag);
 
 static __inline__ int 
 __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event)
@@ -108,13 +126,14 @@ __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event)
 #define CPUFREQ_RELATION_H 1  /* highest frequency below or at target */
 
 struct cpufreq_driver {
+    char   name[CPUFREQ_NAME_LEN];
     int    (*init)(struct cpufreq_policy *policy);
     int    (*verify)(struct cpufreq_policy *policy);
     int    (*target)(struct cpufreq_policy *policy,
                      unsigned int target_freq,
                      unsigned int relation);
     unsigned int    (*get)(unsigned int cpu);
-    unsigned int    (*getavg)(unsigned int cpu);
+    unsigned int    (*getavg)(unsigned int cpu, unsigned int flag);
     int    (*exit)(struct cpufreq_policy *policy);
 };
 
@@ -205,3 +224,12 @@ struct cpu_dbs_info_s {
 };
 
 int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event);
+int get_cpufreq_ondemand_para(uint32_t *sampling_rate_max,
+                              uint32_t *sampling_rate_min,
+                              uint32_t *sampling_rate,
+                              uint32_t *up_threshold);
+int write_ondemand_sampling_rate(unsigned int sampling_rate);
+int write_ondemand_up_threshold(unsigned int up_threshold);
+
+int write_userspace_scaling_setspeed(unsigned int cpu, unsigned int freq);
+#endif /* __XEN_CPUFREQ_PM_H__ */
index 7821aeffb13d5dcfbf67360a0dd818a64789853f..cc6be7a9133672c5e087a4704ca5e167c03e7f4d 100644 (file)
@@ -9,6 +9,7 @@
 int get_cpu_id(u8);
 int powernow_cpufreq_init(void);
 
+void cpufreq_residency_update(unsigned int, uint8_t);
 void cpufreq_statistic_update(unsigned int, uint8_t, uint8_t);
 int  cpufreq_statistic_init(unsigned int);
 void cpufreq_statistic_exit(unsigned int);
@@ -19,8 +20,6 @@ int  cpufreq_limit_change(unsigned int);
 int  cpufreq_add_cpu(unsigned int);
 int  cpufreq_del_cpu(unsigned int);
 
-uint64_t get_cpu_idle_time(unsigned int);
-
 struct processor_performance {
     uint32_t state;
     uint32_t platform_limit;
@@ -60,8 +59,5 @@ struct pm_px {
 
 extern struct pm_px *cpufreq_statistic_data[NR_CPUS];
 
-int xenpf_copy_px_states(struct processor_performance *pxpt,
-        struct xen_processor_performance *dom0_px_info);
-
 int cpufreq_cpu_init(unsigned int cpuid);
 #endif /* __XEN_PROCESSOR_PM_H__ */
index cf4b033c37babddf6198345af53e6b573dd34c0a..b96dd8e3bf6a51a68cf4155c47bfd368d6ec114c 100644 (file)
@@ -4,7 +4,12 @@
 #define BUG() __bug(__FILE__, __LINE__)
 #define WARN() __warn(__FILE__, __LINE__)
 
-#define dump_execution_state() printk("FIXME: implement ia64 dump_execution_state()\n")
+#define dump_execution_state()                                      \
+    do {                                                            \
+        printk("FIXME: implement ia64 dump_execution_state()\n");      \
+        dump_stack();                                               \
+    } while (0)
+
 #define vcpu_show_execution_state(v) printk("FIXME: implement ia64 vcpu_show_execution_state()\n")
 
 #endif /* __IA64_BUG_H__ */
index 4e9f83376bb709286932e0491c5ce59f8f90f703..8d8d1c96304e9ffc6ca44a305eeb73a28293f103 100644 (file)
@@ -33,6 +33,11 @@ typedef union U_INST64_B9 {
     struct { unsigned long qp:6, imm20:20, :1, x6:6, :3, i:1, major:4; };
 } INST64_B9;
 
+typedef union U_INST64_I18 {
+    IA64_INST inst;
+    struct { unsigned long qp:6, imm20:20, y:1, x6:6, x3:3, i:1, major:4; };
+} INST64_I18;
+
 typedef union U_INST64_I19 {
     IA64_INST inst;
     struct { unsigned long qp:6, imm20:20, :1, x6:6, x3:3, i:1, major:4; };
@@ -191,6 +196,7 @@ typedef union U_INST64 {
     INST64_B4 B4;      // used in build_hypercall_bundle only
     INST64_B8 B8;      // rfi, bsw.[01]
     INST64_B9 B9;      // break.b
+    INST64_I18 I18;    // nop.i used in build_fpswa_hypercall_bundle only
     INST64_I19 I19;    // used in build_hypercall_bundle only
     INST64_I26 I26;    // mov register to ar (I unit)
     INST64_I27 I27;    // mov immediate to ar (I unit)
index aaf0613677dc7fdc2cf2cfa2135733115acb2b59..78ba3cb602c21a6e97d41f5fec356dd05a48c7f3 100644 (file)
@@ -86,7 +86,6 @@ typedef unsigned long paddr_t;
 // FIXME?: x86-ism used in xen/mm.h
 #define LOCK_PREFIX
 
-extern unsigned long xenheap_phys_end;
 extern unsigned long total_pages;
 extern unsigned long xen_pstart;
 extern unsigned long xenheap_size;
@@ -119,9 +118,6 @@ extern char _end[]; /* standard ELF symbol */
 // FIXME SMP: leave SMP for a later time
 ///////////////////////////////////////////////////////////////
 // xen/include/asm/config.h
-// Natural boundary upon TR size to define xenheap space
-#define XENHEAP_DEFAULT_MB (1 << (KERNEL_TR_PAGE_SHIFT - 20))
-#define XENHEAP_DEFAULT_SIZE   (1 << KERNEL_TR_PAGE_SHIFT)
 #define        ELFSIZE 64
 
 ///////////////////////////////////////////////////////////////
index eacf18af7b4644365d2ccc824475e6d4297ad7cf..444804feb799cdaada86ddb92dffe2d8ff042ba4 100644 (file)
@@ -5,6 +5,9 @@
  *     Dan Magenheimer (dan.magenheimer@hp.com)
  */
 
+#define __IA64_XEN_HYPERCALL_DEFAULT           0x1000
+#define __IA64_XEN_HYPERCALL_DEFAULT_STR       "0x1000"
+
 /* Portion of guest physical memory space reserved for PAL/SAL/EFI/ACPI
    data and code.  */
 #define FW_BASE_PADDR          0x0000UL
@@ -65,6 +68,7 @@
 #define FW_HYPERCALL_PAL_CALL_INDEX    0x80UL
 #define FW_HYPERCALL_PAL_CALL_PADDR    FW_HYPERCALL_PADDR(FW_HYPERCALL_PAL_CALL_INDEX)
 #define FW_HYPERCALL_PAL_CALL          0x1000UL
+#define FW_HYPERCALL_PAL_CALL_ASM      0x1000
 
 /*
  * SAL consists of a table of descriptors, one of which (type=0)
 
 /*
  * This is a hypercall number for FPSWA.
- * FPSWA hypercall uses 2 bundles for a pseudo-entry-point and a hypercall-patch.
+ * FPSWA hypercall uses one bundle for a pseudo-entry-point
+ * and 14 bundles for a hypercall-patch.
+ *
+ * 0x500 was used before, but that implementation is broken.
+ * To keep the hypercall ABI, 0x500 is obsolete and 0x501 is
+ * allocated for the FPSWA hypercall.
  */
 #define FW_HYPERCALL_FPSWA_ENTRY_INDEX                 0x90UL
 #define FW_HYPERCALL_FPSWA_PATCH_INDEX                 0x91UL
 #define FW_HYPERCALL_FPSWA_ENTRY_PADDR                 FW_HYPERCALL_PADDR(FW_HYPERCALL_FPSWA_ENTRY_INDEX)
 #define FW_HYPERCALL_FPSWA_PATCH_PADDR                 FW_HYPERCALL_PADDR(FW_HYPERCALL_FPSWA_PATCH_INDEX)
-#define FW_HYPERCALL_FPSWA                             0x500UL
+#define FW_HYPERCALL_FPSWA_BASE                                0x500UL
+#define FW_HYPERCALL_FPSWA_BROKEN                      0x500UL
+#define FW_HYPERCALL_FPSWA                             0x501UL
+#define FW_HYPERCALL_FPSWA_STR                         "0x501"
 
 /* Set the shared_info base virtual address.  */
 #define FW_HYPERCALL_SET_SHARED_INFO_VA                        0x600UL
 /* Additionnal OEM SAL.  */
 #define SAL_XEN_SAL_RETURN     0x02000000
 
-#ifdef __XEN__
+#if defined(__XEN__) && !defined(__ASSEMBLY__)
 #include <linux/efi.h>
 extern struct ia64_pal_retval xen_pal_emulator(u64, u64, u64, u64);
 extern struct sal_ret_values sal_emulator (long index, unsigned long in1, unsigned long in2, unsigned long in3, unsigned long in4, unsigned long in5, unsigned long in6, unsigned long in7);
index 9ba42021376a6c10f649872f00d2dfcdd0cc5003..47b61ab16c6efd04d1cbc7f9d656c67d6073ffd2 100644 (file)
@@ -38,7 +38,7 @@ typedef struct xc_dom_image domain_t;
 #define printk(fmt, args ...)  xc_dom_printf(fmt, ## args)
 
 #define BUG_ON(p)      assert(!(p))
-#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)]))
+#define BUILD_BUG_ON(condition) ((void)sizeof(struct { int:-!!(condition); }))
 
 //for sort in linux/sort.h.
 #define sort(base, num, size, cmp, swap) qsort((base), (num), (size), (cmp))
index db05a1119d6c79006487a6d587b2633f1aa77974..d181b5f826ea5abf7a58460497a7c2b78e2827b2 100644 (file)
@@ -10,6 +10,7 @@
 #include <asm/vmx_platform.h>
 #include <xen/list.h>
 #include <xen/cpumask.h>
+#include <xen/mm.h>
 #include <asm/fpswa.h>
 #include <xen/rangeset.h>
 
@@ -43,6 +44,8 @@ extern int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc);
 extern void panic_domain(struct pt_regs *, const char *, ...)
      __attribute__ ((noreturn, format (printf, 2, 3)));
 
+#define has_arch_pdevs(d)    (!list_empty(&(d)->arch.pdev_list))
+
 struct mm_struct {
        volatile pgd_t * pgd;
     // atomic_t mm_users;                      /* How many users with user space? */
@@ -166,6 +169,7 @@ struct arch_domain {
     unsigned char rid_bits;            /* number of virtual rid bits (default: 18) */
     int breakimm;               /* The imm value for hypercalls.  */
 
+    struct list_head pdev_list;
     struct virtual_platform_def     vmx_platform;
 #define        hvm_domain vmx_platform /* platform defs are not vmx specific */
 
@@ -174,6 +178,10 @@ struct arch_domain {
     /* Address of SAL emulator data  */
     struct xen_sal_data *sal_data;
 
+    /* Shared page for notifying that explicit PIRQ EOI is required. */
+    unsigned long *pirq_eoi_map;
+    unsigned long pirq_eoi_map_mfn;
+
     /* Address of efi_runtime_services_t (placed in domain memory)  */
     void *efi_runtime;
     /* Address of fpswa_interface_t (placed in domain memory)  */
@@ -217,7 +225,7 @@ struct arch_domain {
     /* Continuable mm_teardown() */
     unsigned long mm_teardown_offset;
     /* Continuable domain_relinquish_resources() */
-    struct list_head relmem_list;
+    struct page_list_head relmem_list;
 };
 #define INT_ENABLE_OFFSET(v)             \
     (sizeof(vcpu_info_t) * (v)->vcpu_id + \
@@ -281,7 +289,7 @@ struct arch_vcpu {
     char irq_new_condition;    // vpsr.i/vtpr change, check for pending VHPI
     char hypercall_continuation;
 
-    fpswa_ret_t fpswa_ret;     /* save return values of FPSWA emulation */
+    fpswa_ret_t fpswa_ret;     /* save return values of FPSWA emulation */
     struct timer hlt_timer;
     struct arch_vmx_struct arch_vmx; /* Virtual Machine Extensions */
 
@@ -316,6 +324,15 @@ ia64_fault(unsigned long vector, unsigned long isr, unsigned long ifa,
            unsigned long iim, unsigned long itir, unsigned long arg5,
            unsigned long arg6, unsigned long arg7, unsigned long stack);
 
+void
+ia64_lazy_load_fpu(struct vcpu *vcpu);
+
+int construct_dom0(
+    struct domain *d,
+    unsigned long image_start, unsigned long image_len,
+    unsigned long initrd_start, unsigned long initrd_len,
+    char *cmdline);
+
 #endif /* __ASM_DOMAIN_H__ */
 
 /*
index b78d0c5f34d6317d0740ffb096de4c242d10be17..28c508fbb7efe9907964fc547d1a6cd3a9a5430e 100644 (file)
@@ -4,6 +4,7 @@
 #define __ARCH_IRQ_STAT        1
 #define HARDIRQ_BITS   14
 #include <linux/hardirq.h>
+#include <xen/sched.h>
 
 #define local_softirq_pending()                (local_cpu_data->softirq_pending)
 
diff --git a/xen/include/asm-ia64/hvm/iommu.h b/xen/include/asm-ia64/hvm/iommu.h
new file mode 100644 (file)
index 0000000..6d66502
--- /dev/null
@@ -0,0 +1,35 @@
+#ifndef __ASM_IA64_HVM_IOMMU_H__
+#define __ASM_IA64_HVM_IOMMU_H__
+
+#include <asm/hvm/irq.h>
+#include <public/event_channel.h>
+#include <public/arch-ia64/hvm/save.h>
+#include <asm/hw_irq.h>
+#include <asm/iosapic.h>
+
+struct iommu_ops;
+extern struct iommu_ops intel_iommu_ops;
+extern int intel_vtd_setup(void);
+
+#define iommu_get_ops() (&intel_iommu_ops)
+#define iommu_hardware_setup()  (intel_vtd_setup())
+
+static inline int domain_irq_to_vector(struct domain *d, int irq)
+{
+    return irq;
+}
+
+static inline void ack_APIC_irq(void)
+{
+    /* TODO */
+}
+
+static inline void pci_cleanup_msi(struct pci_dev *pdev)
+{
+    /* TODO */
+}
+
+
+extern int assign_irq_vector (int irq);
+
+#endif /* __ASM_IA64_HVM_IOMMU_H__ */
diff --git a/xen/include/asm-ia64/hvm/irq.h b/xen/include/asm-ia64/hvm/irq.h
new file mode 100644 (file)
index 0000000..c16664d
--- /dev/null
@@ -0,0 +1,108 @@
+/******************************************************************************
+ * irq.h
+ *
+ * Interrupt distribution and delivery logic.
+ *
+ * Copyright (c) 2006, K A Fraser, XenSource Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#ifndef __ASM_IA64_HVM_IRQ_H__
+#define __ASM_IA64_HVM_IRQ_H__
+
+#include <xen/irq.h>
+
+#define VIOAPIC_NUM_PINS  48
+
+#include <xen/hvm/irq.h>
+
+struct hvm_hw_pci_irqs {
+    /*
+     * Virtual interrupt wires for a single PCI bus.
+     * Indexed by: device*4 + INTx#.
+     */
+    union {
+        DECLARE_BITMAP(i, 32*4);
+        uint64_t pad[2];
+    };
+};
+
+struct hvm_irq {
+    /*
+     * Virtual interrupt wires for a single PCI bus.
+     * Indexed by: device*4 + INTx#.
+     */
+    struct hvm_hw_pci_irqs pci_intx;
+
+    /* Virtual interrupt and via-link for paravirtual platform driver. */
+    uint32_t callback_via_asserted;
+    union {
+        enum {
+            HVMIRQ_callback_none,
+            HVMIRQ_callback_gsi,
+            HVMIRQ_callback_pci_intx
+        } callback_via_type;
+    };
+    union {
+        uint32_t gsi;
+        struct { uint8_t dev, intx; } pci;
+    } callback_via;
+
+    /*
+     * Number of wires asserting each GSI.
+     *
+     * GSIs 0-15 are the ISA IRQs. ISA devices map directly into this space
+     * except ISA IRQ 0, which is connected to GSI 2.
+     * PCI links map into this space via the PCI-ISA bridge.
+     *
+     * GSIs 16+ are used only be PCI devices. The mapping from PCI device to
+     * GSI is as follows: ((device*4 + device/8 + INTx#) & 31) + 16
+     */
+    u8 gsi_assert_count[VIOAPIC_NUM_PINS];
+
+    /*
+     * GSIs map onto PIC/IO-APIC in the usual way:
+     *  0-7:  Master 8259 PIC, IO-APIC pins 0-7
+     *  8-15: Slave  8259 PIC, IO-APIC pins 8-15
+     *  16+ : IO-APIC pins 16+
+     */
+
+    /* Last VCPU that was delivered a LowestPrio interrupt. */
+    u8 round_robin_prev_vcpu;
+
+    struct hvm_irq_dpci *dpci;
+};
+
+#define hvm_pci_intx_gsi(dev, intx)  \
+    (((((dev)<<2) + ((dev)>>3) + (intx)) & 31) + 16)
+#define hvm_pci_intx_link(dev, intx) \
+    (((dev) + (intx)) & 3)
+
+#define IA64_INVALID_VECTOR    ((unsigned int)((int)-1))
+static inline unsigned int irq_to_vector(int irq)
+{
+    int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
+    unsigned int vector;
+
+    if ( acpi_gsi_to_irq(irq, &vector) < 0)
+        return 0;
+
+    return vector;
+}
+
+extern u8 irq_vector[NR_IRQS];
+extern int vector_irq[NR_VECTORS];
+
+#endif /* __ASM_IA64_HVM_IRQ_H__ */
index a90a49f47d89ac682cf1ffa105b29a4ef4d6e1fd..4f140b847684cb8d7f0912690b4f9b828719f862 100644 (file)
@@ -7,7 +7,7 @@
 #ifndef __IA64_IOCAP_H__
 #define __IA64_IOCAP_H__
 
-extern int ioports_permit_access(struct domain *d,
+extern int ioports_permit_access(struct domain *d, unsigned int gs,
                                 unsigned int s, unsigned int e);
 extern int ioports_deny_access(struct domain *d,
                               unsigned int s, unsigned int e);
index 54085bd26844af8ed32f38c24eff198c86705b58..4e0986fdd5bab1f17b1274ceea8486e17b8060f9 100644 (file)
@@ -10,6 +10,7 @@ cache.h                       -> linux/include/asm-ia64/cache.h
 gcc_intrin.h           -> linux/include/asm-ia64/gcc_intrin.h
 ia64regs.h             -> linux/include/asm-ia64/ia64regs.h
 io.h                   -> linux/include/asm-ia64/io.h
+hw_irq.h               -> linux/include/asm-ia64/hw_irq.h
 kregs.h                        -> linux/include/asm-ia64/kregs.h
 mca_asm.h              -> linux/include/asm-ia64/mca_asm.h
 meminit.h              -> linux/include/asm-ia64/meminit.h
index f59db9202e02e73910c1228364d5db7ce923b696..7a66a0c12415e8674d5a1d7bc85e3798adca2a44 100644 (file)
@@ -38,6 +38,7 @@
 #include <asm/numa.h>
 #ifdef XEN
 #include <xen/nodemask.h>
+extern int acpi_dmar_init(void);
 #endif
 
 #define COMPILER_DEPENDENT_INT64       long
diff --git a/xen/include/asm-ia64/linux-xen/asm/hw_irq.h b/xen/include/asm-ia64/linux-xen/asm/hw_irq.h
new file mode 100644 (file)
index 0000000..9578cd9
--- /dev/null
@@ -0,0 +1,141 @@
+#ifndef _ASM_IA64_HW_IRQ_H
+#define _ASM_IA64_HW_IRQ_H
+
+/*
+ * Copyright (C) 2001-2003 Hewlett-Packard Co
+ *     David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/profile.h>
+
+#include <asm/machvec.h>
+#include <asm/ptrace.h>
+#include <asm/smp.h>
+
+typedef u8 ia64_vector;
+
+/*
+ * 0 special
+ *
+ * 1,3-14 are reserved from firmware
+ *
+ * 16-255 (vectored external interrupts) are available
+ *
+ * 15 spurious interrupt (see IVR)
+ *
+ * 16 lowest priority, 255 highest priority
+ *
+ * 15 classes of 16 interrupts each.
+ */
+#define IA64_MIN_VECTORED_IRQ           16
+#define IA64_MAX_VECTORED_IRQ          255
+#define IA64_NUM_VECTORS               256
+
+#define AUTO_ASSIGN_IRQ                        (-1)
+
+#define IA64_SPURIOUS_INT_VECTOR       0x0f
+
+/*
+ * Vectors 0x10-0x1f are used for low priority interrupts, e.g. CMCI.
+ */
+#define IA64_CPEP_VECTOR               0x1c    /* corrected platform error polling vector */
+#define IA64_CMCP_VECTOR               0x1d    /* corrected machine-check polling vector */
+#define IA64_CPE_VECTOR                        0x1e    /* corrected platform error interrupt vector */
+#define IA64_CMC_VECTOR                        0x1f    /* corrected machine-check interrupt vector */
+/*
+ * Vectors 0x20-0x2f are reserved for legacy ISA IRQs.
+ */
+#define IA64_FIRST_DEVICE_VECTOR       0x30
+#define IA64_LAST_DEVICE_VECTOR                0xe7
+#define IA64_NUM_DEVICE_VECTORS                (IA64_LAST_DEVICE_VECTOR - IA64_FIRST_DEVICE_VECTOR + 1)
+
+#define IA64_MCA_RENDEZ_VECTOR         0xe8    /* MCA rendez interrupt */
+#define IA64_PERFMON_VECTOR            0xee    /* performanc monitor interrupt vector */
+#define IA64_TIMER_VECTOR              0xef    /* use highest-prio group 15 interrupt for timer */
+#define        IA64_MCA_WAKEUP_VECTOR          0xf0    /* MCA wakeup (must be >MCA_RENDEZ_VECTOR) */
+#define IA64_IPI_RESCHEDULE            0xfd    /* SMP reschedule */
+#define IA64_IPI_VECTOR                        0xfe    /* inter-processor interrupt vector */
+
+/* Used for encoding redirected irqs */
+
+#define IA64_IRQ_REDIRECTED            (1 << 31)
+
+/* IA64 inter-cpu interrupt related definitions */
+
+#define IA64_IPI_DEFAULT_BASE_ADDR     0xfee00000
+
+/* Delivery modes for inter-cpu interrupts */
+enum {
+        IA64_IPI_DM_INT =       0x0,    /* pend an external interrupt */
+        IA64_IPI_DM_PMI =       0x2,    /* pend a PMI */
+        IA64_IPI_DM_NMI =       0x4,    /* pend an NMI (vector 2) */
+        IA64_IPI_DM_INIT =      0x5,    /* pend an INIT interrupt */
+        IA64_IPI_DM_EXTINT =    0x7,    /* pend an 8259-compatible interrupt. */
+};
+
+extern __u8 isa_irq_to_vector_map[16];
+#define isa_irq_to_vector(x)   isa_irq_to_vector_map[(x)]
+
+extern struct hw_interrupt_type irq_type_ia64_lsapic;  /* CPU-internal interrupt controller */
+
+extern int assign_irq_vector (int irq);        /* allocate a free vector */
+extern void free_irq_vector (int vector);
+extern void ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect);
+extern void register_percpu_irq (ia64_vector vec, struct irqaction *action);
+#ifdef XEN
+extern int xen_do_IRQ(ia64_vector vector);
+extern int setup_vector(unsigned int vec, struct irqaction *action);
+#endif
+
+static inline void
+hw_resend_irq (struct hw_interrupt_type *h, unsigned int vector)
+{
+       platform_send_ipi(smp_processor_id(), vector, IA64_IPI_DM_INT, 0);
+}
+
+/*
+ * Default implementations for the irq-descriptor API:
+ */
+
+extern irq_desc_t irq_desc[NR_IRQS];
+
+#ifndef CONFIG_IA64_GENERIC
+static inline unsigned int
+__ia64_local_vector_to_irq (ia64_vector vec)
+{
+       return (unsigned int) vec;
+}
+#endif
+
+/*
+ * Next follows the irq descriptor interface.  On IA-64, each CPU supports 256 interrupt
+ * vectors.  On smaller systems, there is a one-to-one correspondence between interrupt
+ * vectors and the Linux irq numbers.  However, larger systems may have multiple interrupt
+ * domains meaning that the translation from vector number to irq number depends on the
+ * interrupt domain that a CPU belongs to.  This API abstracts such platform-dependent
+ * differences and provides a uniform means to translate between vector and irq numbers
+ * and to obtain the irq descriptor for a given irq number.
+ */
+
+/* Return a pointer to the irq descriptor for IRQ.  */
+static inline irq_desc_t *
+irq_descp (int irq)
+{
+       return irq_desc + irq;
+}
+
+/*
+ * Convert the local IA-64 vector to the corresponding irq number.  This translation is
+ * done in the context of the interrupt domain that the currently executing CPU belongs
+ * to.
+ */
+static inline unsigned int
+local_vector_to_irq (ia64_vector vec)
+{
+       return platform_local_vector_to_irq(vec);
+}
+
+#endif /* _ASM_IA64_HW_IRQ_H */
index 1beb6162ba6e11eff71e71defbf8dce488ba80f5..46dd30ec53e15787d13e146334f6d0a0ca9de830 100644 (file)
@@ -83,12 +83,25 @@ static inline int find_iosapic_by_addr(unsigned long addr)
 
 static inline unsigned int iosapic_read(char __iomem *iosapic, unsigned int reg)
 {
+#ifdef XEN
+       if(iommu_enabled && (reg >= 10)){
+               int apic = find_iosapic_by_addr((unsigned long)iosapic);
+               return io_apic_read_remap_rte(apic, reg);
+       }
+#endif
        writel(reg, iosapic + IOSAPIC_REG_SELECT);
        return readl(iosapic + IOSAPIC_WINDOW);
 }
 
 static inline void iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
 {
+#ifdef XEN
+       if (iommu_enabled && (reg >= 10)){
+               int apic = find_iosapic_by_addr((unsigned long)iosapic);
+               iommu_update_ire_from_apic(apic, reg, val);
+               return;
+       }
+#endif
        writel(reg, iosapic + IOSAPIC_REG_SELECT);
        writel(val, iosapic + IOSAPIC_WINDOW);
 }
@@ -175,5 +188,8 @@ extern unsigned long ia64_vector_mask[];
 extern unsigned long ia64_xen_vector[];
 #endif /* XEN */
 
+#define IO_APIC_BASE(idx) ((unsigned int *)iosapic_lists[idx].addr)
+#define IO_APIC_ID(idx)   (iosapic_lists[idx].id)
+
 # endif /* !__ASSEMBLY__ */
 #endif /* __ASM_IA64_IOSAPIC_H */
index 28ec0596056e21d28e2a9d369bba6dd7026ba1eb..14a7216ad25b883fb0c453b4dae843df9edcb174 100644 (file)
@@ -99,6 +99,7 @@ do {                                          \
 
 #define virt_addr_valid(kaddr) mfn_valid(__pa(kaddr) >> PAGE_SHIFT)
 
+#ifndef XEN
 #ifdef CONFIG_VIRTUAL_MEM_MAP
 extern int ia64_mfn_valid (unsigned long pfn);
 #else
@@ -119,6 +120,7 @@ extern unsigned long max_low_pfn;
 
 #define page_to_maddr(page)    (page_to_mfn(page) << PAGE_SHIFT)
 #define virt_to_page(kaddr)    mfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
+#endif
 
 typedef union ia64_va {
        struct {
index 2bc246726ef13c97cfad244072132753f936b3e8..adfd12a3bd7419e5ca034c5bf5e25649f29bf01b 100644 (file)
 #include <asm/system.h>
 #include <asm/types.h>
 #ifdef XEN
+#include <asm/xenpage.h>
 #ifndef __ASSEMBLY__
 #include <xen/sched.h> /* needed for mm_struct (via asm/domain.h) */
 #endif
 #endif
 
+#ifndef XEN
 #define IA64_MAX_PHYS_BITS     50      /* max. number of physical address bits (architected) */
+#endif
 
 /*
  * First, define the various bits in a PTE.  Note that the PTE format
index a436fedc960daba1c9b2b70ba7b519a6f76cbcce..e54e612c71ef353a6497e65b13911192a8aad542 100644 (file)
@@ -198,6 +198,14 @@ DECLARE_PER_CPU(struct cpuinfo_ia64, cpu_info);
 #define local_cpu_data         (&__ia64_per_cpu_var(cpu_info))
 #define cpu_data(cpu)          (&per_cpu(cpu_info, cpu))
 
+#ifdef CONFIG_SMP
+#define cpu_to_core(cpu)    (cpu_data(cpu)->core_id)
+#define cpu_to_socket(cpu)  (cpu_data(cpu)->socket_id)
+#else
+#define cpu_to_core(cpu)    0
+#define cpu_to_socket(cpu)  0
+#endif
+
 extern void identify_cpu (struct cpuinfo_ia64 *);
 extern void print_cpu_info (struct cpuinfo_ia64 *);
 
index a8a143a54b7854fafca27bb62bcab4bbf303098c..e571562a88864dadcaadc5058f650fa32b3669fc 100644 (file)
@@ -130,8 +130,13 @@ extern void smp_do_timer (struct pt_regs *regs);
 extern int smp_call_function_single (int cpuid, void (*func) (void *info), void *info,
                                     int retry, int wait);
 extern void smp_send_reschedule (int cpu);
+#ifdef XEN
+extern void lock_ipi_calllock(unsigned long *flags);
+extern void unlock_ipi_calllock(unsigned long flags);
+#else
 extern void lock_ipi_calllock(void);
 extern void unlock_ipi_calllock(void);
+#endif
 extern void identify_siblings (struct cpuinfo_ia64 *);
 
 #else
index 412479e242eccb9d49c4e2ced5bac3593037176d..14bf3a3c1b6e07361f4d0cf32abcabdc09649b80 100644 (file)
@@ -27,25 +27,16 @@ typedef struct {
 #ifdef DEBUG_SPINLOCK
        void *locker;
 #endif
-#ifdef XEN
-       unsigned char recurse_cpu;
-       unsigned char recurse_cnt;
-#endif
-} spinlock_t;
+} raw_spinlock_t;
 
 #ifdef XEN
 #ifdef DEBUG_SPINLOCK
-#define SPIN_LOCK_UNLOCKED     /*(spinlock_t)*/ { 0, NULL, -1, 0 }
+#define _RAW_SPIN_LOCK_UNLOCKED        /*(raw_spinlock_t)*/ { 0, NULL }
 #else
-#define SPIN_LOCK_UNLOCKED     /*(spinlock_t)*/ { 0, -1, 0 }
+#define _RAW_SPIN_LOCK_UNLOCKED        /*(raw_spinlock_t)*/ { 0 }
 #endif
-static inline void spin_lock_init(spinlock_t *lock)
-{
-       *lock = ((spinlock_t)SPIN_LOCK_UNLOCKED);
-}
 #else
-#define SPIN_LOCK_UNLOCKED                     /*(spinlock_t)*/ { 0 }
-#define spin_lock_init(x)                      ((x)->lock = 0)
+#define _RAW_SPIN_LOCK_UNLOCKED        /*(raw_spinlock_t)*/ { 0 }
 #endif
 
 #ifdef ASM_SUPPORTED
@@ -59,7 +50,7 @@ static inline void spin_lock_init(spinlock_t *lock)
 #define IA64_SPINLOCK_CLOBBERS "ar.ccv", "ar.pfs", "p14", "p15", "r27", "r28", "r29", "r30", "b6", "memory"
 
 static inline void
-_raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
+_raw_spin_lock_flags (raw_spinlock_t *lock, unsigned long flags)
 {
        register volatile unsigned int *ptr asm ("r31") = &lock->lock;
 
@@ -136,10 +127,9 @@ do {                                                                                       \
 } while (0)
 #endif /* !ASM_SUPPORTED */
 
-#define spin_is_locked(x)      ((x)->lock != 0)
-#define _raw_spin_unlock(x)    do { barrier(); ((spinlock_t *) x)->lock = 0; } while (0)
+#define _raw_spin_is_locked(x) ((x)->lock != 0)
+#define _raw_spin_unlock(x)    do { barrier(); (x)->lock = 0; } while (0)
 #define _raw_spin_trylock(x)   (cmpxchg_acq(&(x)->lock, 0, 1) == 0)
-#define spin_unlock_wait(x)    do { barrier(); } while ((x)->lock)
 
 typedef struct {
        volatile unsigned int read_counter      : 31;
@@ -147,16 +137,12 @@ typedef struct {
 #ifdef CONFIG_PREEMPT
        unsigned int break_lock;
 #endif
-} rwlock_t;
-#define RW_LOCK_UNLOCKED /*(rwlock_t)*/ { 0, 0 }
-
-#define rwlock_init(x)         do { *(x) = (rwlock_t) RW_LOCK_UNLOCKED; } while(0)
-#define read_can_lock(rw)      (*(volatile int *)(rw) >= 0)
-#define write_can_lock(rw)     (*(volatile int *)(rw) == 0)
+} raw_rwlock_t;
+#define _RAW_RW_LOCK_UNLOCKED /*(raw_rwlock_t)*/ { 0, 0 }
 
 #define _raw_read_lock(rw)                                                             \
 do {                                                                                   \
-       rwlock_t *__read_lock_ptr = (rw);                                               \
+       raw_rwlock_t *__read_lock_ptr = (rw);                                           \
                                                                                        \
        while (unlikely(ia64_fetchadd(1, (int *) __read_lock_ptr, acq) < 0)) {          \
                ia64_fetchadd(-1, (int *) __read_lock_ptr, rel);                        \
@@ -167,7 +153,7 @@ do {                                                                                        \
 
 #define _raw_read_unlock(rw)                                   \
 do {                                                           \
-       rwlock_t *__read_lock_ptr = (rw);                       \
+       raw_rwlock_t *__read_lock_ptr = (rw);                   \
        ia64_fetchadd(-1, (int *) __read_lock_ptr, rel);        \
 } while (0)
 
@@ -230,7 +216,6 @@ do {                                                                                \
        clear_bit(31, (x));                                                             \
 })
 
-#ifdef XEN
-#include <asm/xenspinlock.h>
-#endif
+#define _raw_rw_is_locked(x) (*(int *)(x) != 0)
+
 #endif /*  _ASM_IA64_SPINLOCK_H */
index 5dc6fad6f6dccebbd0e666514ed31d4b666d20ef..4d53746fbc7b4deda095c364494ab3bcbbf4153d 100644 (file)
@@ -23,6 +23,9 @@
 
 #include <asm/page.h>
 #include <asm/system.h>
+#ifdef XEN
+#include <asm/pgtable.h>
+#endif
 
 #define EFI_SUCCESS            0
 #define EFI_LOAD_ERROR          ( 1 | (1UL << (BITS_PER_LONG-1)))
index 6f1aece4361ea477906fbe7534306553dc91d5b7..2990ace0a53c555e8dfb38a7432d594ab38d9eb5 100644 (file)
@@ -52,10 +52,10 @@ struct irqaction {
 };
 
 extern irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs);
-extern int request_irq(unsigned int,
+extern int request_irq_vector(unsigned int,
                       irqreturn_t (*handler)(int, void *, struct pt_regs *),
                       unsigned long, const char *, void *);
-extern void free_irq(unsigned int, void *);
+extern void release_irq_vector(unsigned int, void *);
 #endif
 
 
index 118201a9a760925dc451a0e479b902966cfec199..e2cac1b8508b90cec7def69b2bb73a966f6cf465 100644 (file)
  *     7:3 = slot
  *     2:0 = function
  */
+
+#ifndef XEN
 #define PCI_DEVFN(slot,func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
 #define PCI_SLOT(devfn)                (((devfn) >> 3) & 0x1f)
 #define PCI_FUNC(devfn)                ((devfn) & 0x07)
+#endif
 
 /* Ioctls for /proc/bus/pci/X/Y nodes. */
 #define PCIIOC_BASE            ('P' << 24 | 'C' << 16 | 'I' << 8)
@@ -112,7 +115,11 @@ struct pci_cap_saved_state {
 /*
  * The pci_dev structure is used to describe PCI devices.
  */
+#ifdef XEN
+struct sn_pci_dev {
+#else
 struct pci_dev {
+#endif
        struct list_head global_list;   /* node in list of all PCI devices */
        struct list_head bus_list;      /* node in per-bus list */
        struct pci_bus  *bus;           /* bus this device is on */
@@ -178,6 +185,7 @@ struct pci_dev {
        struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */
 };
 
+#ifndef XEN
 #define pci_dev_g(n) list_entry(n, struct pci_dev, global_list)
 #define pci_dev_b(n) list_entry(n, struct pci_dev, bus_list)
 #define        to_pci_dev(n) container_of(n, struct pci_dev, dev)
@@ -206,6 +214,7 @@ static inline void pci_remove_saved_cap(struct pci_cap_saved_state *cap)
 {
        hlist_del(&cap->next);
 }
+#endif
 
 /*
  *  For PCI devices, the region numbers are assigned this way:
@@ -230,7 +239,11 @@ struct pci_bus {
        struct pci_bus  *parent;        /* parent bus this bridge is on */
        struct list_head children;      /* list of child buses */
        struct list_head devices;       /* list of devices on this bus */
+#ifdef XEN
+       struct sn_pci_dev       *self;  /* bridge device as seen by parent */
+#else
        struct pci_dev  *self;          /* bridge device as seen by parent */
+#endif
        struct resource *resource[PCI_BUS_NUM_RESOURCES];
                                        /* address space routed to this bus */
 
@@ -341,7 +354,7 @@ struct pci_error_handlers
 };
 
 /* ---------------------------------------------------------------- */
-
+#ifndef XEN
 struct module;
 struct pci_driver {
        struct list_head node;
@@ -715,9 +728,11 @@ static inline void pci_unblock_user_cfg_access(struct pci_dev *dev) { }
 #endif /* CONFIG_PCI */
 
 /* Include architecture-dependent settings and functions */
+#endif
 
 #include <asm/pci.h>
 
+#ifndef XEN
 /* these helpers provide future and backwards compatibility
  * for accessing popular PCI BAR info */
 #define pci_resource_start(dev,bar)   ((dev)->resource[(bar)].start)
@@ -808,6 +823,7 @@ enum pci_fixup_pass {
 void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev);
 
 extern int pci_pci_problems;
+#endif
 #define PCIPCI_FAIL            1       /* No PCI PCI DMA */
 #define PCIPCI_TRITON          2
 #define PCIPCI_NATOMA          4
index 612cec3f566f21b015970dd6d5b61bbc26ff9724..778d9a823d6e1fe3837a8a59e66c9d90c54a2a21 100644 (file)
@@ -17,7 +17,6 @@ dma.h                 -> linux/include/asm-ia64/dma.h
 fpswa.h                        -> linux/include/asm-ia64/fpswa.h
 fpu.h                  -> linux/include/asm-ia64/fpu.h
 hdreg.h                        -> linux/include/asm-ia64/hdreg.h
-hw_irq.h               -> linux/include/asm-ia64/hw_irq.h
 intrinsics.h           -> linux/include/asm-ia64/intrinsics.h
 ioctl.h                        -> linux/include/asm-ia64/ioctl.h
 irq.h                  -> linux/include/asm-ia64/irq.h
index 687f49fefacc5ffad91db073a46a59658e660dac..9a1c1d0b4b54673d0ce09558cc8bdc18ac1b78a3 100644 (file)
@@ -11,8 +11,8 @@
  * 02/29/00     D.Mosberger    moved most things into hw_irq.h
  */
 
+#define NR_VECTORS     256
 #define NR_IRQS                256
-#define NR_IRQ_VECTORS NR_IRQS
 
 static __inline__ int
 irq_canonicalize (int irq)
index c321316f1bc7e5cd50bfe41f557ef01b18e76f4f..c05dc22641ef55b1902f41a38cf214741a9d20eb 100644 (file)
 #define  PCI_PM_CAP_PME_D3cold 0x8000  /* PME# from D3 (cold) */
 #define PCI_PM_CTRL            4       /* PM control and status register */
 #define  PCI_PM_CTRL_STATE_MASK        0x0003  /* Current power state (D0 to D3) */
-#define  PCI_PM_CTRL_NO_SOFT_RESET     0x0004  /* No reset for D3hot->D0 */
+#define  PCI_PM_CTRL_NO_SOFT_RESET     0x0008  /* No reset for D3hot->D0 */
 #define  PCI_PM_CTRL_PME_ENABLE        0x0100  /* PME pin enable */
 #define  PCI_PM_CTRL_DATA_SEL_MASK     0x1e00  /* Data select (??) */
 #define  PCI_PM_CTRL_DATA_SCALE_MASK   0x6000  /* Data scale (??) */
index 28e7387d0453e381d59594fa1d5075471dbeed22..b7b2d392174d24e59f5ea14ceb5656538b2e1ee3 100644 (file)
@@ -13,7 +13,6 @@
 #include <xen/list.h>
 #include <xen/spinlock.h>
 #include <xen/perfc.h>
-#include <xen/sched.h>
 
 #include <asm/processor.h>
 #include <asm/atomic.h>
@@ -40,45 +39,76 @@ typedef unsigned long page_flags_t;
 
 #define PRtype_info "016lx"
 
+#ifdef CONFIG_IA64_SHRINK_PAGE_LIST
+/*
+ * See include/xen/mm.h.
+ * To compress page_list_entry, all the physical address must
+ * be addressed by (32 + PAGE_SHIFT) .
+ * However this is lower than IA64_MAX_PHYS_BITS = 50.
+ */
+#undef page_list_entry
+struct page_list_entry
+{
+    u32 next, prev;
+};
+#endif
+
+#ifdef CONFIG_IA64_PICKLE_DOMAIN
+typedef u32 __ia64_domain_t;
+#else
+typedef unsigned long __ia64_domain_t;
+#endif
+
 struct page_info
 {
     /* Each frame can be threaded onto a doubly-linked list. */
-    struct list_head list;
+    struct page_list_entry list;
 
     /* Reference count and various PGC_xxx flags and fields. */
-    u32 count_info;
+    unsigned long count_info;
 
     /* Context-dependent fields follow... */
     union {
 
         /* Page is in use: ((count_info & PGC_count_mask) != 0). */
         struct {
-            /* Owner of this page (NULL if page is anonymous). */
-            u32 _domain; /* pickled format */
             /* Type reference count and various PGT_xxx flags and fields. */
             unsigned long type_info;
-        } __attribute__ ((packed)) inuse;
+            /* Owner of this page (NULL if page is anonymous). */
+            __ia64_domain_t _domain; /* pickled format */
+        } inuse;
 
         /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
         struct {
             /* Order-size of the free chunk this page is the head of. */
             u32 order;
-            /* Mask of possibly-tainted TLBs. */
-            cpumask_t cpumask;
-        } __attribute__ ((packed)) free;
+            /* Do TLBs need flushing for safety before next page use? */
+            bool_t need_tlbflush;
+        } free;
 
     } u;
 
     /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */
     u32 tlbflush_timestamp;
+};
 
-#if 0
-// following added for Linux compiling
-    page_flags_t flags;
-    atomic_t _count;
-    struct list_head lru;      // is this the same as above "list"?
+#ifndef page_list_entry
+static inline void
+page_list_splice_init(struct page_list_head *list, struct page_list_head *head)
+{
+    if ( !page_list_empty(list) )
+    {
+        if ( head->next )
+            head->tail->list.next = page_to_mfn(list->next);
+        else
+            head->next = list->next;
+        head->tail = list->tail;
+        INIT_PAGE_LIST_HEAD(list);
+    }
+}
+#else
+# define page_list_splice_init list_splice_init
 #endif
-};
 
 #define set_page_count(p,v)    atomic_set(&(p)->_count, v - 1)
 
@@ -86,50 +116,78 @@ struct page_info
  * Still small set of flags defined by far on IA-64.
  * IA-64 should make it a definition same as x86_64.
  */
+#define PG_shift(idx)   (BITS_PER_LONG - (idx))
+#define PG_mask(x, idx) (x ## UL << PG_shift(idx))
+
 /* The following page types are MUTUALLY EXCLUSIVE. */
-#define PGT_none            (0UL<<29) /* no special uses of this page */
-#define PGT_l1_page_table   (1UL<<29) /* using this page as an L1 page table? */
-#define PGT_l2_page_table   (2UL<<29) /* using this page as an L2 page table? */
-#define PGT_l3_page_table   (3UL<<29) /* using this page as an L3 page table? */
-#define PGT_l4_page_table   (4UL<<29) /* using this page as an L4 page table? */
+#define PGT_none          PG_mask(0, 3) /* no special uses of this page */
+#define PGT_l1_page_table PG_mask(1, 3) /* using as an L1 page table? */
+#define PGT_l2_page_table PG_mask(2, 3) /* using as an L2 page table? */
+#define PGT_l3_page_table PG_mask(3, 3) /* using as an L3 page table? */
+#define PGT_l4_page_table PG_mask(4, 3) /* using as an L4 page table? */
  /* Value 5 reserved. See asm-x86/mm.h */
  /* Value 6 reserved. See asm-x86/mm.h */
-#define PGT_writable_page   (7UL<<29) /* has writable mappings of this page? */
-#define PGT_type_mask       (7UL<<29) /* Bits 29-31. */
+#define PGT_writable_page PG_mask(7, 3) /* has writable mappings? */
+#define PGT_type_mask     PG_mask(7, 3) /* Bits 29-31. */
 
- /* Has this page been validated for use as its current type? */
-#define _PGT_validated      28
-#define PGT_validated       (1UL<<_PGT_validated)
  /* Owning guest has pinned this page to its current type? */
-#define _PGT_pinned         27
-#define PGT_pinned          (1UL<<_PGT_pinned)
+#define _PGT_pinned       PG_shift(4)
+#define PGT_pinned        PG_mask(1, 4)
+ /* Has this page been validated for use as its current type? */
+#define _PGT_validated    PG_shift(5)
+#define PGT_validated     PG_mask(1, 5)
 
- /* 16-bit count of uses of this frame as its current type. */
-#define PGT_count_mask      ((1UL<<16)-1)
+ /* Count of uses of this frame as its current type. */
+#define PGT_count_width   PG_shift(7)
+#define PGT_count_mask    ((1UL<<PGT_count_width)-1)
 
  /* Cleared when the owning guest 'frees' this page. */
-#define _PGC_allocated      31
-#define PGC_allocated       (1UL<<_PGC_allocated)
- /* Bit 30 reserved. See asm-x86/mm.h */
- /* Bit 29 reserved. See asm-x86/mm.h */
- /* 29-bit count of references to this frame. */
-#define PGC_count_mask      ((1UL<<29)-1)
-
-#define is_xen_heap_mfn(mfn)   (((mfn) < paddr_to_pfn(xenheap_phys_end)) \
-                                && ((mfn) >= paddr_to_pfn(xen_pstart)))
-#define is_xen_heap_page(page) is_xen_heap_mfn(page_to_mfn(page))
-
-extern void* xen_pickle_offset;
-#define __pickle(a)    ((unsigned long)a - (unsigned long)xen_pickle_offset)
-#define __unpickle(a)  (void *)(a + xen_pickle_offset)
-
-static inline struct domain *unpickle_domptr(u64 _d)
-{ return (_d == 0) ? NULL : __unpickle(_d); }
-static inline u32 pickle_domptr(struct domain *_d)
-{ return (_d == NULL) ? 0 : (u32)__pickle(_d); }
-
-#define page_get_owner(_p)     (unpickle_domptr((_p)->u.inuse._domain))
-#define page_set_owner(_p, _d) ((_p)->u.inuse._domain = pickle_domptr(_d))
+#define _PGC_allocated    PG_shift(1)
+#define PGC_allocated     PG_mask(1, 1)
+ /* Page is Xen heap? */
+# define _PGC_xen_heap    PG_shift(2)
+# define PGC_xen_heap     PG_mask(1, 2)
+ /* bit PG_shift(3) reserved. See asm-x86/mm.h */
+ /* PG_mask(7, 6) reserved. See asm-x86/mm.h*/
+
+ /* Page is broken? */
+#define _PGC_broken       PG_shift(7)
+#define PGC_broken        PG_mask(1, 7)
+ /* Page is offline pending ? */
+#define _PGC_offlining    PG_shift(8)
+#define PGC_offlining     PG_mask(1, 8)
+ /* Page is offlined */
+#define _PGC_offlined     PG_shift(9)
+#define PGC_offlined      PG_mask(1, 9)
+#define PGC_offlined_broken (PGC_offlined | PGC_broken)
+
+#define is_page_offlining(page) ((page)->count_info & PGC_offlining)
+#define is_page_offlined(page)  ((page)->count_info & PGC_offlined)
+#define is_page_broken(page)    ((page)->count_info & PGC_broken)
+#define is_page_online(page)    (!is_page_offlined(page))
+
+ /* Count of references to this frame. */
+#define PGC_count_width   PG_shift(9)
+#define PGC_count_mask    ((1UL<<PGC_count_width)-1)
+
+extern unsigned long xen_fixed_mfn_start;
+extern unsigned long xen_fixed_mfn_end;
+#define is_xen_heap_page(page)  ((page)->count_info & PGC_xen_heap)
+#define is_xen_heap_mfn(mfn)    (mfn_valid(mfn) &&                      \
+                                 is_xen_heap_page(mfn_to_page(mfn)))
+#define is_xen_fixed_mfn(mfn)                                           \
+    (xen_fixed_mfn_start <= (mfn) && (mfn) <= xen_fixed_mfn_end)
+
+#ifdef CONFIG_IA64_PICKLE_DOMAIN
+#define page_get_owner(_p)                                              \
+    ((struct domain *)((_p)->v.inuse._domain ?                          \
+                       mfn_to_virt((_p)->v.inuse._domain) : NULL))
+#define page_set_owner(_p,_d)                                           \
+    ((_p)->v.inuse._domain = (_d) ? virt_to_mfn(_d) : 0)
+#else
+#define page_get_owner(_p)      ((struct domain *)(_p)->u.inuse._domain)
+#define page_set_owner(_p, _d) ((_p)->u.inuse._domain = (unsigned long)(_d))
+#endif
 
 #define XENSHARE_writable 0
 #define XENSHARE_readonly 1
@@ -151,46 +209,67 @@ void add_to_domain_alloc_list(unsigned long ps, unsigned long pe);
 
 static inline void put_page(struct page_info *page)
 {
-    u32 nx, x, y = page->count_info;
+    unsigned long nx, x, y = page->count_info;
 
     do {
-       x = y;
-       nx = x - 1;
+        x = y;
+        nx = x - 1;
     }
     while (unlikely((y = cmpxchg_rel(&page->count_info, x, nx)) != x));
 
     if (unlikely((nx & PGC_count_mask) == 0))
-       free_domheap_page(page);
+        free_domheap_page(page);
+}
+
+static inline struct domain *page_get_owner_and_reference(
+    struct page_info *page)
+{
+    unsigned long x, y = page->count_info;
+
+    do {
+        x = y;
+        /*
+         * Count ==  0: Page is not allocated, so we cannot take a reference.
+         * Count == -1: Reference count would wrap, which is invalid.
+         * Count == -2: Remaining unused ref is reserved for get_page_light().
+         */
+        /*
+         * On ia64, get_page_light() isn't defined so that it doesn't
+         * make sense to take care of Count == -2.
+         * Just for consistency with x86.
+         */
+        if ( unlikely(((x + 2) & PGC_count_mask) <= 2) )
+            return NULL;
+        y = cmpxchg_acq(&page->count_info, x, x + 1);
+    } while (unlikely(y != x));
+
+    return page_get_owner(page);
 }
 
 /* count_info and ownership are checked atomically. */
 static inline int get_page(struct page_info *page,
                            struct domain *domain)
 {
-    u64 x, nx, y = *((u64*)&page->count_info);
-    u32 _domain = pickle_domptr(domain);
+    struct domain *owner = page_get_owner_and_reference(page);
 
-    do {
-       x = y;
-       nx = x + 1;
-       if (unlikely((x & PGC_count_mask) == 0) ||      /* Not allocated? */
-           unlikely((nx & PGC_count_mask) == 0) ||     /* Count overflow? */
-           unlikely((x >> 32) != _domain)) {           /* Wrong owner? */
-
-           gdprintk(XENLOG_INFO, "Error pfn %lx: rd=%p, od=%p, caf=%016lx, taf=%"
-               PRtype_info "\n", page_to_mfn(page), domain,
-               unpickle_domptr(x >> 32), x, page->u.inuse.type_info);
-           return 0;
-       }
-    }
-    while(unlikely((y = cmpxchg_acq((u64*)&page->count_info, x, nx)) != x));
-    return 1;
+    if (likely(owner == domain))
+        return 1;
+
+    if (owner != NULL)
+        put_page(page);
+
+    /* if (!domain->is_dying) */ /* XXX: header inclusion hell */
+    gdprintk(XENLOG_INFO,
+             "Error pfn %lx: rd=%p, od=%p, caf=%016lx, taf=%" PRtype_info "\n",
+             page_to_mfn(page), domain,
+             owner, page->count_info, page->u.inuse.type_info);
+    return 0;
 }
 
 int is_iomem_page(unsigned long mfn);
 
 extern void put_page_type(struct page_info *page);
-extern int get_page_type(struct page_info *page, u32 type);
+extern int get_page_type(struct page_info *page, unsigned long type);
 
 static inline void put_page_and_type(struct page_info *page)
 {
@@ -201,7 +280,7 @@ static inline void put_page_and_type(struct page_info *page)
 
 static inline int get_page_and_type(struct page_info *page,
                                     struct domain *domain,
-                                    u32 type)
+                                    unsigned long type)
 {
     int rc = get_page(page, domain);
 
@@ -428,6 +507,8 @@ extern void assign_new_domain0_page(struct domain *d, unsigned long mpaddr);
 extern int __assign_domain_page(struct domain *d, unsigned long mpaddr, unsigned long physaddr, unsigned long flags);
 extern void assign_domain_page(struct domain *d, unsigned long mpaddr, unsigned long physaddr);
 extern void assign_domain_io_page(struct domain *d, unsigned long mpaddr, unsigned long flags);
+extern int deassign_domain_mmio_page(struct domain *d, unsigned long mpaddr,
+                        unsigned long phys_addr, unsigned long size);
 struct p2m_entry;
 extern unsigned long lookup_domain_mpa(struct domain *d, unsigned long mpaddr, struct p2m_entry* entry);
 extern void *domain_mpa_to_imva(struct domain *d, unsigned long mpaddr);
@@ -460,6 +541,12 @@ extern unsigned long dom0vp_get_memmap(domid_t domid, XEN_GUEST_HANDLE(char) buf
 #define dom0vp_get_memmap(domid, buffer)               (-ENOSYS)
 #endif
 
+int
+p2m_pod_decrease_reservation(struct domain *d,
+                             xen_pfn_t gpfn, unsigned int order);
+int guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
+                                          unsigned int order);
+
 extern volatile unsigned long *mpt_table;
 extern unsigned long gmfn_to_mfn_foreign(struct domain *d, unsigned long gpfn);
 extern u64 translate_domain_pte(u64 pteval, u64 address, u64 itir__,
diff --git a/xen/include/asm-ia64/msi.h b/xen/include/asm-ia64/msi.h
new file mode 100644 (file)
index 0000000..2adc2ab
--- /dev/null
@@ -0,0 +1,20 @@
+#ifndef __ASM_MSI_H
+#define __ASM_MSI_H
+
+/*
+ * MSI Defined Data Structures
+ */
+#define MSI_ADDRESS_HEADER             0xfee
+#define MSI_ADDRESS_HEADER_SHIFT       12
+#define MSI_ADDRESS_HEADER_MASK                0xfff000
+#define MSI_ADDRESS_DEST_ID_MASK       0xfff0000f
+#define MSI_TARGET_CPU_MASK            0xff
+#define MSI_TARGET_CPU_SHIFT           4
+#define MSI_DELIVERY_MODE              0
+#define MSI_LEVEL_MODE                 1       /* Edge always assert */
+#define MSI_TRIGGER_MODE               0       /* MSI is edge sensitive */
+#define MSI_PHYSICAL_MODE              0
+#define MSI_LOGICAL_MODE               1
+#define MSI_REDIRECTION_HINT_MODE      0
+
+#endif /* __ASM_MSI_H */
index 82d55b744953afa461d23adff58f329658fe8802..503e7ef4cd1ec164aeb8cbdcb50c4999cee442f8 100644 (file)
@@ -72,7 +72,7 @@ struct tlb_track {
     unsigned int                limit;
     unsigned int                num_entries;
     unsigned int                num_free;
-    struct list_head            page_list;
+    struct page_list_head       page_list;
 
     /* XXX hash table size */
     spinlock_t                  hash_lock;
index 00b72235e03b66fff38c7201796608f236283065..bb9f2e3772937eb17537fd9a5d46f27f5e422f8f 100644 (file)
@@ -1,7 +1,8 @@
 #ifndef __FLUSHTLB_H__
 #define __FLUSHTLB_H__
 
-#include <xen/sched.h>
+struct vcpu;
+struct domain;
 
 /* TLB flushes can be either local (current vcpu only) or domain wide (on
    all vcpus).
index 869c284e4fb7390e9624ac38a83eb86aebc73b18..7004e6e5b9d725a4b5d46550a8b3c290f5809d4f 100644 (file)
@@ -59,7 +59,7 @@ struct viosapic {
     spinlock_t lock;
     struct vcpu * lowest_vcpu;
     uint64_t base_address;
-    union viosapic_rte redirtbl[VIOSAPIC_NUM_PINS];
+    union vioapic_redir_entry redirtbl[VIOSAPIC_NUM_PINS];
 };
 
 void viosapic_init(struct domain *d);
@@ -70,5 +70,7 @@ void viosapic_write(struct vcpu *v, unsigned long addr,
 
 unsigned long viosapic_read(struct vcpu *v, unsigned long addr,
                             unsigned long length);
+void hvm_dpci_eoi(struct domain *d, unsigned int guest_gsi,
+                          union vioapic_redir_entry *ent);
 
 #endif /* __ASM_IA64_VMX_VIOSAPIC_H__ */
index a1a74353b1b47cdd29345f103a4a734af9144a18..29386cfcbf7dcbe87803db7407a6671da64e98df 100644 (file)
@@ -23,6 +23,8 @@
 #define _ASM_IA64_VT_H
 
 #include <public/hvm/ioreq.h>
+#include <asm/ia64_int.h>
+
 #define vmx_user_mode(regs) (((struct ia64_psr *)&(regs)->cr_ipsr)->vm == 1)
 
 #define VCPU_LID(v) (((u64)(v)->vcpu_id)<<24)
@@ -36,7 +38,7 @@ extern void vmx_load_state(struct vcpu *v);
 extern int vmx_setup_platform(struct domain *d);
 extern void vmx_do_resume(struct vcpu *v);
 extern void vmx_io_assist(struct vcpu *v);
-extern int ia64_hypercall (struct pt_regs *regs);
+extern IA64FAULT ia64_hypercall (struct pt_regs *regs);
 extern unsigned long __gpfn_to_mfn_foreign(struct domain *d, unsigned long gpfn);
 extern void set_privileged_operation_isr (struct vcpu *vcpu,int inst);
 extern void set_rsv_reg_field_isr (struct vcpu *vcpu);
@@ -45,6 +47,7 @@ extern void vmx_relinquish_vcpu_resources(struct vcpu *v);
 extern void vmx_send_assist_req(struct vcpu *v);
 extern void deliver_pal_init(struct vcpu *vcpu);
 extern void vmx_pend_pal_init(struct domain *d);
+extern void vmx_lazy_load_fpu(struct vcpu *vcpu);
 
 static inline vcpu_iodata_t *get_vio(struct vcpu *v)
 {
index deff5c29cfc45569faf02452cf52da7c8f8a6ad3..09f173aa4fb9569170cf60cca2f80bd47dc21c40 100644 (file)
 
 #include <public/xen.h>
 #include <public/hvm/params.h>
+#include <asm/hvm/irq.h>
 #include <asm/viosapic.h>
 #include <asm/hvm/vacpi.h>
+#include <xen/hvm/iommu.h>
 
 struct vmx_ioreq_page {
     spinlock_t          lock;
@@ -41,6 +43,9 @@ typedef struct virtual_platform_def {
     /* One IOSAPIC now... */
     struct viosapic             viosapic;
     struct vacpi                vacpi;
+    /* Pass-throgh VT-d */
+    struct hvm_irq              irq;
+    struct hvm_iommu            hvm_iommu;
 } vir_plat_t;
 
 static inline int __fls(uint32_t word)
index be9e2142b69c255dfcf6eea0ecb3fe2e237543a7..0eea015df6583aa5ae9e0c4583d9b701decaf585 100644 (file)
@@ -1,12 +1,11 @@
 #ifndef _ASM_IA64_XENPAGE_H
 #define _ASM_IA64_XENPAGE_H
 
+/* moved from xen/include/asm-ia64/linux-xen/asm/pgtable.h to compile */
+#define IA64_MAX_PHYS_BITS     50      /* max. number of physical address bits (architected) */
+
 #ifndef __ASSEMBLY__
-#undef mfn_valid
-#undef page_to_mfn
-#undef mfn_to_page
 #ifdef CONFIG_VIRTUAL_FRAME_TABLE
-#undef ia64_mfn_valid
 extern int ia64_mfn_valid (unsigned long pfn);
 # define mfn_valid(_pfn)       (((_pfn) < max_page) && ia64_mfn_valid(_pfn))
 #else
@@ -18,19 +17,26 @@ extern int ia64_mfn_valid (unsigned long pfn);
 
 #include <asm/xensystem.h>
 
-static inline unsigned long __virt_to_maddr(unsigned long va)
-{
-       if (va - KERNEL_START < xenheap_size)
-               return xen_pstart + (va - KERNEL_START);
-       else
-               return (va & ((1UL << 60) - 1));
-}
+/*
+ * macro: avoid header inclustion hell
+ * static inline unsigned long __virt_to_maddr(unsigned long va)
+ */
+/*
+ * Because the significant 8 bits of VA are used by Xen,
+ * and xen uses cached/uncached identity mapping.
+ * IA64_MAX_PHYS_BITS can't be larger than 56
+ */
+#define __virt_to_maddr(va)                                            \
+       ({                                                              \
+               unsigned long __va__ = (va);                            \
+               (__va__ - KERNEL_START < KERNEL_TR_PAGE_SIZE) ?         \
+                       xen_pstart + (__va__ - KERNEL_START) :          \
+                       (__va__ & ((1UL << IA64_MAX_PHYS_BITS) - 1));   \
+       })
 
 #define virt_to_maddr(va)      (__virt_to_maddr((unsigned long)va))
 
 
-#undef page_to_maddr
-#undef virt_to_page
 #define page_to_maddr(page)    (page_to_mfn(page) << PAGE_SHIFT)
 #define virt_to_page(kaddr)    (mfn_to_page(virt_to_maddr(kaddr) >> PAGE_SHIFT))
 
@@ -89,8 +95,6 @@ static inline u64 pa_clear_uc(u64 paddr)
     return (paddr << 1) >> 1;
 }
 
-#undef __pa
-#undef __va
 #define __pa(x)                (virt_to_maddr(x))
 #define __va(x)                ({xen_va _v; _v.l = (long) (x); _v.f.reg = -1; _v.p;})
 
index 5ae806faa984c44a7e210608a5963ab9dd02ca5d..459978e49150398ef270416f0ca0ef55235c67a4 100644 (file)
@@ -166,4 +166,7 @@ extern u8 x86_acpiid_to_apicid[];
 
 extern int acpi_dmar_init(void);
 
+/* Incremented whenever we transition through S3. Value is 1 during boot. */
+extern uint32_t system_reset_counter;
+
 #endif /*__X86_ASM_ACPI_H*/
index 134ac890e4bcf50a71ca85864f58c80c026559fc..0eb84d90d7f188fc59a2a5c65445be1dd2787dad 100644 (file)
@@ -80,6 +80,8 @@
 #define                APIC_LVTTHMR    0x330
 #define                APIC_LVTPC      0x340
 #define                APIC_LVT0       0x350
+#define                APIC_CMCI       0x2F0
+
 #define                        APIC_LVT_TIMER_BASE_MASK        (0x3<<18)
 #define                        GET_APIC_TIMER_BASE(x)          (((x)>>18)&0x3)
 #define                        SET_APIC_TIMER_BASE(x)          (((x)<<18))
index 68d82791f1877f2feb02b11f9849dba899f95aa4..9d140ef891d0696ade4334dfd2e3a9f829a9a0e8 100644 (file)
@@ -22,7 +22,7 @@
 #define CONFIG_X86_IO_APIC 1
 #define CONFIG_X86_PM_TIMER 1
 #define CONFIG_HPET_TIMER 1
-#define CONFIG_X86_MCE_P4THERMAL 1
+#define CONFIG_X86_MCE_THERMAL 1
 #define CONFIG_NUMA 1
 #define CONFIG_DISCONTIGMEM 1
 #define CONFIG_NUMA_EMU 1
 #define CONFIG_HOTPLUG 1
 #define CONFIG_HOTPLUG_CPU 1
 
-/*
- * Avoid deep recursion when tearing down pagetables during domain destruction,
- * causing dom0 to become unresponsive and Xen to miss time-critical softirq
- * deadlines. This will ultimately be replaced by built-in preemptibility of
- * get_page_type().
- */
-#define DOMAIN_DESTRUCT_AVOID_RECURSION 1
-
 #define HZ 100
 
 #define OPT_CONSOLE_STR "vga"
 #define NR_CPUS 32
 #endif
 
+#ifdef MAX_PHYS_IRQS
+#define NR_IRQS MAX_PHYS_IRQS
+#else
+#define NR_IRQS 256
+#endif
+
 #if defined(__i386__) && (NR_CPUS > 32)
 #error "Maximum of 32 physical processors supported by Xen on x86_32"
 #endif
@@ -119,8 +117,6 @@ extern unsigned int video_mode, video_flags;
 
 #define asmlinkage
 
-#define XENHEAP_DEFAULT_MB (16)
-
 #define PML4_ENTRY_BITS  39
 #ifndef __ASSEMBLY__
 #define PML4_ENTRY_BYTES (1UL << PML4_ENTRY_BITS)
@@ -312,7 +308,6 @@ extern unsigned int video_mode, video_flags;
 #define RO_MPT_VIRT_END                FRAMETABLE_VIRT_START
 #define RO_MPT_VIRT_START      (RO_MPT_VIRT_END - (MACHPHYS_MBYTES<<20))
 
-#define XENHEAP_DEFAULT_MB     (DIRECTMAP_MBYTES)
 #define DIRECTMAP_PHYS_END     (DIRECTMAP_MBYTES<<20)
 
 /* Maximum linear address accessible via guest memory segments. */
@@ -342,7 +337,10 @@ extern unsigned int video_mode, video_flags;
 #endif /* __i386__ */
 
 #ifndef __ASSEMBLY__
-extern unsigned long xen_phys_start, xenheap_phys_start, xenheap_phys_end;
+extern unsigned long xen_phys_start;
+#if defined(__i386__)
+extern unsigned long xenheap_phys_end;
+#endif
 #endif
 
 /* GDT/LDT shadow mapping area. The first per-domain-mapping sub-area. */
index 18f6aff015b85b8524cff1437deedda4dc9aa8be..4c6eeabc55dc1f4e46896ce584fed972fd7d1cc2 100644 (file)
@@ -74,6 +74,7 @@
 #define X86_FEATURE_P3         (3*32+ 6) /* P3 */
 #define X86_FEATURE_P4         (3*32+ 7) /* P4 */
 #define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
+#define X86_FEATURE_NOSTOP_TSC (3*32+ 9) /* TSC does not stop in C states */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3       (4*32+ 0) /* Streaming SIMD Extensions-3 */
@@ -94,6 +95,8 @@
 #define X86_FEATURE_SSE4_2     (4*32+20) /* Streaming SIMD Extensions 4.2 */
 #define X86_FEATURE_X2APIC     (4*32+21) /* Extended xAPIC */
 #define X86_FEATURE_POPCNT     (4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_XSAVE      (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running under some hypervisor */
 
 /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
 #define X86_FEATURE_XSTORE     (5*32+ 2) /* on-CPU RNG present (xstore insn) */
index f59f372092d79293231677eb2cac107e9b1e841e..20eb8dbe5fffa1e3b0bfdfa49c59b49c9e11a01b 100644 (file)
@@ -57,8 +57,6 @@
 
 #ifndef __ASSEMBLY__
 
-#define load_TR(n)  __asm__ __volatile__ ("ltr  %%ax" : : "a" (TSS_ENTRY<<3) )
-
 #if defined(__x86_64__)
 #define GUEST_KERNEL_RPL(d) (is_pv_32bit_domain(d) ? 1 : 3)
 #elif defined(__i386__)
@@ -219,7 +217,7 @@ DECLARE_PER_CPU(struct desc_struct *, compat_gdt_table);
 #endif
 
 extern void set_intr_gate(unsigned int irq, void * addr);
-extern void set_tss_desc(unsigned int n, void *addr);
+extern void load_TR(void);
 
 #endif /* !__ASSEMBLY__ */
 
index 3acab04f279463bfd38ceea2734d8aedd7166802..2bf5b1c82325be5152a1e063d017c5ecc96f78ba 100644 (file)
@@ -6,7 +6,6 @@
 #include <asm/hvm/vcpu.h>
 #include <asm/hvm/domain.h>
 #include <asm/e820.h>
-#include <asm/pirq.h>
 
 #define has_32bit_shinfo(d)    ((d)->arch.has_32bit_shinfo)
 #define is_pv_32bit_domain(d)  ((d)->arch.is_32bit_pv)
@@ -17,7 +16,6 @@
 #define is_pv_32on64_domain(d) (0)
 #endif
 #define is_pv_32on64_vcpu(v)   (is_pv_32on64_domain((v)->domain))
-#define IS_COMPAT(d)           (is_pv_32on64_domain(d))
 
 struct trap_bounce {
     uint32_t      error_code;
@@ -80,11 +78,11 @@ struct shadow_domain {
     int               locker; /* processor which holds the lock */
     const char       *locker_function; /* Func that took it */
     unsigned int      opt_flags;    /* runtime tunable optimizations on/off */
-    struct list_head  pinned_shadows;
+    struct page_list_head pinned_shadows;
 
     /* Memory allocation */
-    struct list_head  freelists[SHADOW_MAX_ORDER + 1];
-    struct list_head  p2m_freelist;
+    struct page_list_head freelists[SHADOW_MAX_ORDER + 1];
+    struct page_list_head p2m_freelist;
     unsigned int      total_pages;  /* number of pages allocated */
     unsigned int      free_pages;   /* number of pages on freelists */
     unsigned int      p2m_pages;    /* number of pages allocates to p2m */
@@ -93,7 +91,7 @@ struct shadow_domain {
     pagetable_t unpaged_pagetable;
 
     /* Shadow hashtable */
-    struct shadow_page_info **hash_table;
+    struct page_info **hash_table;
     int hash_walking;  /* Some function is walking the hash table */
 
     /* Fast MMIO path heuristic */
@@ -144,7 +142,7 @@ struct hap_domain {
     int               locker;
     const char       *locker_function;
 
-    struct list_head  freelist;
+    struct page_list_head freelist;
     unsigned int      total_pages;  /* number of pages allocated */
     unsigned int      free_pages;   /* number of pages on freelists */
     unsigned int      p2m_pages;    /* number of pages allocates to p2m */
@@ -205,6 +203,31 @@ typedef xen_domctl_cpuid_t cpuid_input_t;
 
 struct p2m_domain;
 
+/* Define for GUEST MCA handling */
+#define MAX_NR_BANKS 30
+
+/* This entry is for recording bank nodes for the impacted domain,
+ * put into impact_header list. */
+struct bank_entry {
+    struct list_head list;
+    int32_t cpu;
+    uint16_t bank;
+    uint64_t mci_status;
+    uint64_t mci_addr;
+    uint64_t mci_misc;
+};
+
+struct domain_mca_msrs
+{
+    /* Guest should not change below values after DOM boot up */
+    uint64_t mcg_cap;
+    uint64_t mcg_ctl;
+    uint64_t mcg_status;
+    uint64_t mci_ctl[MAX_NR_BANKS];
+    uint16_t nr_injection;
+    struct list_head impact_header;
+};
+
 struct arch_domain
 {
     l1_pgentry_t *mm_perdomain_pt;
@@ -222,6 +245,8 @@ struct arch_domain
     unsigned int hv_compat_vstart;
 #endif
 
+    bool_t s3_integrity;
+
     /* I/O-port admin-specified access capabilities. */
     struct rangeset *ioport_caps;
     uint32_t pci_cf8;
@@ -237,7 +262,11 @@ struct arch_domain
 
     /* NB. protected by d->event_lock and by irq_desc[vector].lock */
     int vector_pirq[NR_VECTORS];
-    int pirq_vector[NR_PIRQS];
+    s16 pirq_vector[NR_IRQS];
+
+    /* Shared page for notifying that explicit PIRQ EOI is required. */
+    unsigned long *pirq_eoi_map;
+    unsigned long pirq_eoi_map_mfn;
 
     /* Pseudophysical e820 map (XENMEM_memory_map).  */
     struct e820entry e820[3];
@@ -262,9 +291,12 @@ struct arch_domain
         RELMEM_l2,
         RELMEM_done,
     } relmem;
-    struct list_head relmem_list;
+    struct page_list_head relmem_list;
 
     cpuid_input_t cpuids[MAX_CPUID_INPUT];
+
+    /* For Guest vMCA handling */
+    struct domain_mca_msrs vmca_msrs;
 } __cacheline_aligned;
 
 #define has_arch_pdevs(d)    (!list_empty(&(d)->arch.pdev_list))
@@ -349,6 +381,7 @@ struct arch_vcpu
 
     /* Current LDT details. */
     unsigned long shadow_ldt_mapcnt;
+    spinlock_t shadow_ldt_lock;
 
     struct paging_vcpu paging;
 
@@ -389,6 +422,13 @@ void domain_cpuid(struct domain *d,
                   unsigned int  *ecx,
                   unsigned int  *edx);
 
+int construct_dom0(
+    struct domain *d,
+    unsigned long image_base,
+    unsigned long image_start, unsigned long image_len,
+    unsigned long initrd_start, unsigned long initrd_len,
+    char *cmdline);
+
 #endif /* __ASM_DOMAIN_H__ */
 
 /*
index 8602ca85ac9fd2b5a80e5fc58db0b27c2f0f4a84..a420ba9f80a7e4d26f71bfb30e9e9bbab004506a 100644 (file)
@@ -24,6 +24,9 @@ struct e820map {
 };
 
 extern int reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e);
+extern int e820_change_range_type(
+    struct e820map *e820, uint64_t s, uint64_t e,
+    uint32_t orig_type, uint32_t new_type);
 extern unsigned long init_e820(const char *, struct e820entry *, int *);
 extern struct e820map e820;
 
index b1323089b1ed4b8ad245db60f31456b4521d5c4f..606ec6df0612fe88f53ff3690601dc8d4f6e7fbd 100644 (file)
 
 #include <xen/shared.h>
 
-static inline void vcpu_kick(struct vcpu *v)
-{
-    /*
-     * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
-     * pending flag. These values may fluctuate (after all, we hold no
-     * locks) but the key insight is that each change will cause
-     * evtchn_upcall_pending to be polled.
-     * 
-     * NB2. We save the running flag across the unblock to avoid a needless
-     * IPI for domains that we IPI'd to unblock.
-     */
-    int running = v->is_running;
-    vcpu_unblock(v);
-    if ( running )
-        smp_send_event_check_cpu(v->processor);
-}
-
-static inline void vcpu_mark_events_pending(struct vcpu *v)
-{
-    int already_pending = test_and_set_bit(
-        0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
-
-    if ( already_pending )
-        return;
-
-    if ( is_hvm_vcpu(v) )
-        hvm_assert_evtchn_irq(v);
-    else
-        vcpu_kick(v);
-}
+void vcpu_kick(struct vcpu *v);
+void vcpu_mark_events_pending(struct vcpu *v);
 
 int hvm_local_events_need_delivery(struct vcpu *v);
 static inline int local_events_need_delivery(void)
index 4917336a715de34d7d3c3f249cbabeb2764a5700..1ef9a692028518b61d44d411af8e9e5cdef4c4d1 100644 (file)
@@ -29,6 +29,7 @@
  * from the end of virtual memory backwards.
  */
 enum fixed_addresses {
+    FIX_RESERVED, /* Index 0 is reserved since fix_to_virt(0) > FIXADDR_TOP. */
 #ifdef __i386__
     FIX_PAE_HIGHMEM_0,
     FIX_PAE_HIGHMEM_END = FIX_PAE_HIGHMEM_0 + NR_CPUS-1,
@@ -49,7 +50,7 @@ enum fixed_addresses {
     FIX_IOMMU_MMIO_END = FIX_IOMMU_MMIO_BASE_0 + IOMMU_PAGES -1,
     FIX_TBOOT_SHARED_BASE,
     FIX_MSIX_IO_RESERV_BASE,
-    FIX_MSIX_IO_RESERV_END = FIX_MSIX_IO_RESERV_BASE + MAX_MSIX_PAGES -1,
+    FIX_MSIX_IO_RESERV_END = FIX_MSIX_IO_RESERV_BASE + FIX_MSIX_MAX_PAGES -1,
     __end_of_fixed_addresses
 };
 
diff --git a/xen/include/asm-x86/guest_pt.h b/xen/include/asm-x86/guest_pt.h
new file mode 100644 (file)
index 0000000..16a8b75
--- /dev/null
@@ -0,0 +1,291 @@
+/******************************************************************************
+ * xen/asm-x86/guest_pt.h
+ *
+ * Types and accessors for guest pagetable entries, as distinct from
+ * Xen's pagetable types. 
+ *
+ * Users must #define GUEST_PAGING_LEVELS to 2, 3 or 4 before including
+ * this file.
+ *
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _XEN_ASM_GUEST_PT_H
+#define _XEN_ASM_GUEST_PT_H
+
+/* Type of the guest's frame numbers */
+TYPE_SAFE(unsigned long,gfn)
+#define PRI_gfn "05lx"
+
+#define VALID_GFN(m) (m != INVALID_GFN)
+
+static inline int
+valid_gfn(gfn_t m)
+{
+    return VALID_GFN(gfn_x(m));
+}
+
+static inline paddr_t
+gfn_to_paddr(gfn_t gfn)
+{
+    return ((paddr_t)gfn_x(gfn)) << PAGE_SHIFT;
+}
+
+/* Override gfn_to_mfn to work with gfn_t */
+#undef gfn_to_mfn
+#define gfn_to_mfn(d, g, t) _gfn_to_mfn_type((d), gfn_x(g), (t), p2m_alloc)
+
+
+/* Types of the guest's page tables and access functions for them */
+
+#if GUEST_PAGING_LEVELS == 2
+
+#define GUEST_L1_PAGETABLE_ENTRIES     1024
+#define GUEST_L2_PAGETABLE_ENTRIES     1024
+#define GUEST_L1_PAGETABLE_SHIFT         12
+#define GUEST_L2_PAGETABLE_SHIFT         22
+
+typedef uint32_t guest_intpte_t;
+typedef struct { guest_intpte_t l1; } guest_l1e_t;
+typedef struct { guest_intpte_t l2; } guest_l2e_t;
+
+#define PRI_gpte "08x"
+
+static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
+{ return ((paddr_t) gl1e.l1) & (PADDR_MASK & PAGE_MASK); }
+static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
+{ return ((paddr_t) gl2e.l2) & (PADDR_MASK & PAGE_MASK); }
+
+static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
+{ return _gfn(guest_l1e_get_paddr(gl1e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
+{ return _gfn(guest_l2e_get_paddr(gl2e) >> PAGE_SHIFT); }
+
+static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
+{ return gl1e.l1 & 0xfff; }
+static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
+{ return gl2e.l2 & 0xfff; }
+
+static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
+{ return (guest_l1e_t) { (gfn_x(gfn) << PAGE_SHIFT) | flags }; }
+static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
+{ return (guest_l2e_t) { (gfn_x(gfn) << PAGE_SHIFT) | flags }; }
+
+#define guest_l1_table_offset(_va)                                           \
+    (((_va) >> GUEST_L1_PAGETABLE_SHIFT) & (GUEST_L1_PAGETABLE_ENTRIES - 1))
+#define guest_l2_table_offset(_va)                                           \
+    (((_va) >> GUEST_L2_PAGETABLE_SHIFT) & (GUEST_L2_PAGETABLE_ENTRIES - 1))
+
+#else /* GUEST_PAGING_LEVELS != 2 */
+
+#if GUEST_PAGING_LEVELS == 3
+#define GUEST_L1_PAGETABLE_ENTRIES      512
+#define GUEST_L2_PAGETABLE_ENTRIES      512
+#define GUEST_L3_PAGETABLE_ENTRIES        4
+#define GUEST_L1_PAGETABLE_SHIFT         12
+#define GUEST_L2_PAGETABLE_SHIFT         21
+#define GUEST_L3_PAGETABLE_SHIFT         30
+#else /* GUEST_PAGING_LEVELS == 4 */
+#define GUEST_L1_PAGETABLE_ENTRIES      512
+#define GUEST_L2_PAGETABLE_ENTRIES      512
+#define GUEST_L3_PAGETABLE_ENTRIES      512
+#define GUEST_L4_PAGETABLE_ENTRIES      512
+#define GUEST_L1_PAGETABLE_SHIFT         12
+#define GUEST_L2_PAGETABLE_SHIFT         21
+#define GUEST_L3_PAGETABLE_SHIFT         30
+#define GUEST_L4_PAGETABLE_SHIFT         39
+#endif
+
+typedef l1_pgentry_t guest_l1e_t;
+typedef l2_pgentry_t guest_l2e_t;
+typedef l3_pgentry_t guest_l3e_t;
+#if GUEST_PAGING_LEVELS >= 4
+typedef l4_pgentry_t guest_l4e_t;
+#endif
+typedef intpte_t guest_intpte_t;
+
+#define PRI_gpte "016"PRIx64
+
+static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
+{ return l1e_get_paddr(gl1e); }
+static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
+{ return l2e_get_paddr(gl2e); }
+static inline paddr_t guest_l3e_get_paddr(guest_l3e_t gl3e)
+{ return l3e_get_paddr(gl3e); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline paddr_t guest_l4e_get_paddr(guest_l4e_t gl4e)
+{ return l4e_get_paddr(gl4e); }
+#endif
+
+static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
+{ return _gfn(l1e_get_paddr(gl1e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
+{ return _gfn(l2e_get_paddr(gl2e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l3e_get_gfn(guest_l3e_t gl3e)
+{ return _gfn(l3e_get_paddr(gl3e) >> PAGE_SHIFT); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline gfn_t guest_l4e_get_gfn(guest_l4e_t gl4e)
+{ return _gfn(l4e_get_paddr(gl4e) >> PAGE_SHIFT); }
+#endif
+
+static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
+{ return l1e_get_flags(gl1e); }
+static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
+{ return l2e_get_flags(gl2e); }
+static inline u32 guest_l3e_get_flags(guest_l3e_t gl3e)
+{ return l3e_get_flags(gl3e); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e)
+{ return l4e_get_flags(gl4e); }
+#endif
+
+static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
+{ return l1e_from_pfn(gfn_x(gfn), flags); }
+static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
+{ return l2e_from_pfn(gfn_x(gfn), flags); }
+static inline guest_l3e_t guest_l3e_from_gfn(gfn_t gfn, u32 flags)
+{ return l3e_from_pfn(gfn_x(gfn), flags); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags)
+{ return l4e_from_pfn(gfn_x(gfn), flags); }
+#endif
+
+#define guest_l1_table_offset(a) l1_table_offset(a)
+#define guest_l2_table_offset(a) l2_table_offset(a)
+#define guest_l3_table_offset(a) l3_table_offset(a)
+#define guest_l4_table_offset(a) l4_table_offset(a)
+
+#endif /* GUEST_PAGING_LEVELS != 2 */
+
+
+/* Which pagetable features are supported on this vcpu? */
+
+static inline int
+guest_supports_superpages(struct vcpu *v)
+{
+    /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
+     * CR4.PSE is set or the guest is in PAE or long mode. 
+     * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
+    return (is_hvm_vcpu(v) && 
+            (GUEST_PAGING_LEVELS != 2 
+             || !hvm_paging_enabled(v)
+             || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
+}
+
+static inline int
+guest_supports_nx(struct vcpu *v)
+{
+    if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
+        return 0;
+    if ( !is_hvm_vcpu(v) )
+        return cpu_has_nx;
+    return hvm_nx_enabled(v);
+}
+
+
+
+/* Type used for recording a walk through guest pagetables.  It is
+ * filled in by the pagetable walk function, and also used as a cache
+ * for later walks.  When we encounter a superpage l2e, we fabricate an
+ * l1e for propagation to the shadow (for splintering guest superpages
+ * into many shadow l1 entries).  */
+typedef struct guest_pagetable_walk walk_t;
+struct guest_pagetable_walk
+{
+    unsigned long va;           /* Address we were looking for */
+#if GUEST_PAGING_LEVELS >= 3
+#if GUEST_PAGING_LEVELS >= 4
+    guest_l4e_t l4e;            /* Guest's level 4 entry */
+#endif
+    guest_l3e_t l3e;            /* Guest's level 3 entry */
+#endif
+    guest_l2e_t l2e;            /* Guest's level 2 entry */
+    guest_l1e_t l1e;            /* Guest's level 1 entry (or fabrication) */
+#if GUEST_PAGING_LEVELS >= 4
+    mfn_t l4mfn;                /* MFN that the level 4 entry was in */
+    mfn_t l3mfn;                /* MFN that the level 3 entry was in */
+#endif
+    mfn_t l2mfn;                /* MFN that the level 2 entry was in */
+    mfn_t l1mfn;                /* MFN that the level 1 entry was in */
+};
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding frame number. */
+static inline gfn_t
+guest_walk_to_gfn(walk_t *gw)
+{
+    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
+        return _gfn(INVALID_GFN);
+    return guest_l1e_get_gfn(gw->l1e);
+}
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding physical address. */
+static inline paddr_t
+guest_walk_to_gpa(walk_t *gw)
+{
+    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
+        return 0;
+    return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
+}
+
+/* Walk the guest pagetables, after the manner of a hardware walker. 
+ *
+ * Inputs: a vcpu, a virtual address, a walk_t to fill, a 
+ *         pointer to a pagefault code, the MFN of the guest's 
+ *         top-level pagetable, and a mapping of the 
+ *         guest's top-level pagetable.
+ * 
+ * We walk the vcpu's guest pagetables, filling the walk_t with what we
+ * see and adding any Accessed and Dirty bits that are needed in the
+ * guest entries.  Using the pagefault code, we check the permissions as
+ * we go.  For the purposes of reading pagetables we treat all non-RAM
+ * memory as contining zeroes.
+ * 
+ * Returns 0 for success, or the set of permission bits that we failed on 
+ * if the walk did not complete. */
+
+/* Macro-fu so you can call guest_walk_tables() and get the right one. */
+#define GPT_RENAME2(_n, _l) _n ## _ ## _l ## _levels
+#define GPT_RENAME(_n, _l) GPT_RENAME2(_n, _l)
+#define guest_walk_tables GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS)
+
+extern uint32_t 
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
+                  uint32_t pfec, mfn_t top_mfn, void *top_map);
+
+/* Pretty-print the contents of a guest-walk */
+static inline void print_gw(walk_t *gw)
+{
+    gdprintk(XENLOG_INFO, "GUEST WALK TO %#lx:\n", gw->va);
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    gdprintk(XENLOG_INFO, "   l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
+    gdprintk(XENLOG_INFO, "   l4e=%" PRI_gpte "\n", gw->l4e.l4);
+    gdprintk(XENLOG_INFO, "   l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
+#endif /* PAE or 64... */
+    gdprintk(XENLOG_INFO, "   l3e=%" PRI_gpte "\n", gw->l3e.l3);
+#endif /* All levels... */
+    gdprintk(XENLOG_INFO, "   l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
+    gdprintk(XENLOG_INFO, "   l2e=%" PRI_gpte "\n", gw->l2e.l2);
+    gdprintk(XENLOG_INFO, "   l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
+    gdprintk(XENLOG_INFO, "   l1e=%" PRI_gpte "\n", gw->l1e.l1);
+}
+
+#endif /* _XEN_ASM_GUEST_PT_H */
index b63f56805d82341aa4d143f921f00add6381243d..f962ab794877164f689e8888b48f4c48b1f4c646 100644 (file)
 #define HPET_T2_CMP    0x148
 #define HPET_T2_ROUTE  0x150
 
+#define HPET_Tn_CFG(n)      (HPET_T0_CFG + n * 0x20)
+#define HPET_Tn_CMP(n)      (HPET_T0_CMP + n * 0x20)
+#define HPET_Tn_ROUTE(n)    (HPET_T0_ROUTE + n * 0x20)
+
 #define HPET_ID_VENDOR 0xffff0000
 #define HPET_ID_LEGSUP 0x00008000
 #define HPET_ID_NUMBER 0x00001f00
 #define HPET_TN_PERIODIC_CAP   0x010
 #define HPET_TN_SETVAL         0x040
 #define HPET_TN_32BIT          0x100
+#define HPET_TN_ROUTE          0x3e00
+#define HPET_TN_FSB            0x4000
+#define HPET_TN_FSB_CAP                0x8000
+#define HPET_TN_ROUTE_SHIFT    9
+
 
 #define hpet_read32(x)    \
     (*(volatile u32 *)(fix_to_virt(FIX_HPET_BASE) + (x)))
index 475c031df0e464dee549d3b73541543db90b04df..70e38bff809e078aa07c9712c9d3c67d781f4471 100644 (file)
@@ -75,6 +75,10 @@ struct hvm_domain {
     /* Pass-through */
     struct hvm_iommu       hvm_iommu;
 
+    /* hypervisor-intercepted MSI-X table */
+    struct list_head       msixtbl_list;
+    spinlock_t             msixtbl_list_lock;
+
     struct viridian_domain viridian;
 
     bool_t                 hap_enabled;
index a9bee16e10b57277a996f2079e4c061a5d02a819..b61c5660949ea57b2dcf6622f2b44197e95c9517 100644 (file)
@@ -321,4 +321,6 @@ static inline void hvm_set_info_guest(struct vcpu *v)
         return hvm_funcs.set_info_guest(v);
 }
 
+int hvm_debug_op(struct vcpu *v, int32_t op);
+
 #endif /* __ASM_X86_HVM_HVM_H__ */
index 0f17390e9cacf6dd86a5619341ab2bc67aedf02e..1f2312427900fd2416c3036e576d54906ba01db2 100644 (file)
 #ifndef __ASM_X86_HVM_IRQ_H__
 #define __ASM_X86_HVM_IRQ_H__
 
-#include <xen/types.h>
-#include <xen/spinlock.h>
-#include <asm/irq.h>
-#include <asm/pirq.h>
+#include <xen/hvm/irq.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/vpic.h>
 #include <asm/hvm/vioapic.h>
-#include <public/hvm/save.h>
-
-struct dev_intx_gsi_link {
-    struct list_head list;
-    uint8_t device;
-    uint8_t intx;
-    uint8_t gsi;
-    uint8_t link;
-};
-
-#define _HVM_IRQ_DPCI_MSI  0x1
-
-struct hvm_gmsi_info {
-    uint32_t gvec;
-    uint32_t gflags;
-};
-
-struct hvm_mirq_dpci_mapping {
-    uint32_t flags;
-    int pending;
-    struct list_head digl_list;
-    struct domain *dom;
-    struct hvm_gmsi_info gmsi;
-};
-
-struct hvm_girq_dpci_mapping {
-    uint8_t valid;
-    uint8_t device;
-    uint8_t intx;
-    uint8_t machine_gsi;
-};
-
-#define NR_ISAIRQS  16
-#define NR_LINK     4
-/* Protected by domain's event_lock */
-struct hvm_irq_dpci {
-    /* Machine IRQ to guest device/intx mapping. */
-    DECLARE_BITMAP(mapping, NR_PIRQS);
-    struct hvm_mirq_dpci_mapping mirq[NR_IRQS];
-    /* Guest IRQ to guest device/intx mapping. */
-    struct hvm_girq_dpci_mapping girq[NR_IRQS];
-    uint8_t msi_gvec_pirq[NR_VECTORS];
-    DECLARE_BITMAP(dirq_mask, NR_IRQS);
-    /* Record of mapped ISA IRQs */
-    DECLARE_BITMAP(isairq_map, NR_ISAIRQS);
-    /* Record of mapped Links */
-    uint8_t link_cnt[NR_LINK];
-    struct timer hvm_timer[NR_IRQS];
-};
 
 struct hvm_irq {
     /*
@@ -149,27 +97,16 @@ struct hvm_irq {
 
 #define hvm_isa_irq_to_gsi(isa_irq) ((isa_irq) ? : 2)
 
-/* Modify state of a PCI INTx wire. */
-void hvm_pci_intx_assert(
-    struct domain *d, unsigned int device, unsigned int intx);
-void hvm_pci_intx_deassert(
-    struct domain *d, unsigned int device, unsigned int intx);
-
-/* Modify state of an ISA device's IRQ wire. */
-void hvm_isa_irq_assert(
-    struct domain *d, unsigned int isa_irq);
-void hvm_isa_irq_deassert(
-    struct domain *d, unsigned int isa_irq);
-
-void hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq);
-
-void hvm_maybe_deassert_evtchn_irq(void);
-void hvm_assert_evtchn_irq(struct vcpu *v);
-void hvm_set_callback_via(struct domain *d, uint64_t via);
-
 /* Check/Acknowledge next pending interrupt. */
 struct hvm_intack hvm_vcpu_has_pending_irq(struct vcpu *v);
 struct hvm_intack hvm_vcpu_ack_pending_irq(struct vcpu *v,
                                            struct hvm_intack intack);
 
+/*
+ * Currently IA64 Xen doesn't support MSI. So for x86, we define this macro
+ * to control the conditional compilation of some MSI-related functions.
+ * This macro will be removed once IA64 has MSI support.
+ */
+#define SUPPORT_MSI_REMAPPING 1
+
 #endif /* __ASM_X86_HVM_IRQ_H__ */
index e9e47990c0fefb528844d7fdc3f34509de12c234..1722c340da19fa05c60d6cc03deb278792f94947 100644 (file)
@@ -23,6 +23,7 @@
 
 #include <xen/sched.h>
 #include <asm/amd-iommu.h>
+#include <xen/domain_page.h>
 
 #define for_each_amd_iommu(amd_iommu) \
     list_for_each_entry(amd_iommu, \
@@ -59,17 +60,17 @@ int __init amd_iommu_setup_shared_tables(void);
 /* mapping functions */
 int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn);
 int amd_iommu_unmap_page(struct domain *d, unsigned long gfn);
-void *amd_iommu_get_vptr_from_page_table_entry(u32 *entry);
+u64 amd_iommu_get_next_table_from_pte(u32 *entry);
 int amd_iommu_reserve_domain_unity_map(struct domain *domain,
         unsigned long phys_addr, unsigned long size, int iw, int ir);
 int amd_iommu_sync_p2m(struct domain *d);
+void invalidate_all_iommu_pages(struct domain *d);
 
 /* device table functions */
 void amd_iommu_set_dev_table_entry(u32 *dte, u64 root_ptr, u64 intremap_ptr,
         u16 domain_id, u8 sys_mgt, u8 dev_ex, u8 paging_mode);
 int amd_iommu_is_dte_page_translation_valid(u32 *entry);
-void invalidate_dev_table_entry(struct amd_iommu *iommu,
-            u16 devic_id);
+void invalidate_dev_table_entry(struct amd_iommu *iommu, u16 devic_id);
 
 /* send cmd to iommu */
 int send_iommu_command(struct amd_iommu *iommu, u32 cmd[]);
@@ -116,4 +117,36 @@ static inline unsigned long region_to_pages(unsigned long addr, unsigned long si
     return (PAGE_ALIGN(addr + size) - (addr & PAGE_MASK)) >> PAGE_SHIFT;
 }
 
+static inline struct page_info* alloc_amd_iommu_pgtable(void)
+{
+    struct page_info *pg;
+    void *vaddr;
+
+    pg = alloc_domheap_page(NULL, 0);
+    vaddr = map_domain_page(page_to_mfn(pg));
+    if ( !vaddr )
+        return 0;
+    memset(vaddr, 0, PAGE_SIZE);
+    unmap_domain_page(vaddr);
+    return pg;
+}
+
+static inline void free_amd_iommu_pgtable(struct page_info *pg)
+{
+    if ( pg != 0 )
+        free_domheap_page(pg);
+}
+
+static inline void* __alloc_amd_iommu_tables(int order)
+{
+    void *buf;
+    buf = alloc_xenheap_pages(order, 0);
+    return buf;
+}
+
+static inline void __free_amd_iommu_tables(void *table, int order)
+{
+    free_xenheap_pages(table, order);
+}
+
 #endif /* _ASM_X86_64_AMD_IOMMU_PROTO_H */
index 213eb1062919cbfa77699ca6d77029f502676261..9b0b7fd7a9df19679c0ef2e85f8432c4900890ca 100644 (file)
@@ -24,6 +24,7 @@
 #define DO_TRC_HVM_INJ_EXC     DEFAULT_HVM_INJECT
 #define DO_TRC_HVM_INJ_VIRQ    DEFAULT_HVM_INJECT
 #define DO_TRC_HVM_REINJ_VIRQ  DEFAULT_HVM_INJECT
+#define DO_TRC_HVM_INTR_WINDOW DEFAULT_HVM_INJECT
 #define DO_TRC_HVM_IO_READ     DEFAULT_HVM_IO
 #define DO_TRC_HVM_IO_WRITE    DEFAULT_HVM_IO
 #define DO_TRC_HVM_CR_READ     DEFAULT_HVM_REGACCESS
index 6eb9d67b731cc67d1dd7455c4f37bab446964bf5..faea392f92ba161b4c2b19d356b478f638425416 100644 (file)
@@ -59,6 +59,7 @@ struct hvm_vcpu {
 
     bool_t              flag_dr_dirty;
     bool_t              debug_state_latch;
+    bool_t              single_step;
 
     union {
         struct arch_vmx_struct vmx;
index 3f34e4795022664042e4bf6c0604dad0440e432e..8c36ed5a0092d11d0cd6d9cc1518e4309d15c7e9 100644 (file)
@@ -93,8 +93,7 @@ void vlapic_msr_set(struct vlapic *vlapic, uint64_t value);
 
 int vlapic_accept_pic_intr(struct vcpu *v);
 
-struct vlapic *apic_round_robin(
-    struct domain *d, uint8_t vector, uint32_t bitmap);
+struct vlapic *apic_lowest_prio(struct domain *d, uint32_t bitmap);
 
 int vlapic_match_logical_addr(struct vlapic *vlapic, uint8_t mda);
 
index 9d900996cbb9430963e972e110913b08b89efbfd..80640e0d5a0575d268acfeb3e021eff768f28486 100644 (file)
@@ -109,11 +109,16 @@ struct arch_vmx_struct {
 
     unsigned long        host_cr0;
 
+    /* Is the guest in real mode? */
+    uint8_t              vmx_realmode;
     /* Are we emulating rather than VMENTERing? */
-#define VMXEMUL_REALMODE 1  /* Yes, because CR0.PE == 0   */
-#define VMXEMUL_BAD_CS   2  /* Yes, because CS.RPL != CPL */
-#define VMXEMUL_BAD_SS   4  /* Yes, because SS.RPL != CPL */
-    uint8_t              vmxemul;
+    uint8_t              vmx_emulate;
+    /* Bitmask of segments that we can't safely use in virtual 8086 mode */
+    uint16_t             vm86_segment_mask;
+    /* Shadow CS, SS, DS, ES, FS, GS, TR while in virtual 8086 mode */
+    struct segment_register vm86_saved_seg[x86_seg_tr + 1];
+    /* Remember EFLAGS while in virtual 8086 mode */
+    uint32_t             vm86_saved_eflags;
 };
 
 int vmx_create_vmcs(struct vcpu *v);
@@ -137,6 +142,7 @@ void vmx_vmcs_exit(struct vcpu *v);
 #define CPU_BASED_MOV_DR_EXITING              0x00800000
 #define CPU_BASED_UNCOND_IO_EXITING           0x01000000
 #define CPU_BASED_ACTIVATE_IO_BITMAP          0x02000000
+#define CPU_BASED_MONITOR_TRAP_FLAG           0x08000000
 #define CPU_BASED_ACTIVATE_MSR_BITMAP         0x10000000
 #define CPU_BASED_MONITOR_EXITING             0x20000000
 #define CPU_BASED_PAUSE_EXITING               0x40000000
@@ -150,11 +156,14 @@ extern u32 vmx_pin_based_exec_control;
 
 #define VM_EXIT_IA32E_MODE              0x00000200
 #define VM_EXIT_ACK_INTR_ON_EXIT        0x00008000
+#define VM_EXIT_SAVE_GUEST_PAT          0x00040000
+#define VM_EXIT_LOAD_HOST_PAT           0x00080000
 extern u32 vmx_vmexit_control;
 
 #define VM_ENTRY_IA32E_MODE             0x00000200
 #define VM_ENTRY_SMM                    0x00000400
 #define VM_ENTRY_DEACT_DUAL_MONITOR     0x00000800
+#define VM_ENTRY_LOAD_GUEST_PAT         0x00004000
 extern u32 vmx_vmentry_control;
 
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
@@ -181,6 +190,10 @@ extern bool_t cpu_has_vmx_ins_outs_instr_info;
     (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)
 #define cpu_has_vmx_vpid \
     (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
+#define cpu_has_monitor_trap_flag \
+    (vmx_cpu_based_exec_control & CPU_BASED_MONITOR_TRAP_FLAG)
+#define cpu_has_vmx_pat \
+    (vmx_vmentry_control & VM_ENTRY_LOAD_GUEST_PAT)
 
 /* GUEST_INTERRUPTIBILITY_INFO flags. */
 #define VMX_INTR_SHADOW_STI             0x00000001
@@ -232,6 +245,8 @@ enum vmcs_field {
     VMCS_LINK_POINTER_HIGH          = 0x00002801,
     GUEST_IA32_DEBUGCTL             = 0x00002802,
     GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
+    GUEST_PAT                       = 0x00002804,
+    GUEST_PAT_HIGH                  = 0x00002805,
     GUEST_PDPTR0                    = 0x0000280a,
     GUEST_PDPTR0_HIGH               = 0x0000280b,
     GUEST_PDPTR1                    = 0x0000280c,
@@ -240,6 +255,8 @@ enum vmcs_field {
     GUEST_PDPTR2_HIGH               = 0x0000280f,
     GUEST_PDPTR3                    = 0x00002810,
     GUEST_PDPTR3_HIGH               = 0x00002811,
+    HOST_PAT                        = 0x00002c00,
+    HOST_PAT_HIGH                   = 0x00002c01,
     PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
     CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
     EXCEPTION_BITMAP                = 0x00004004,
index 0430a46c1b47adb60410cbf8c0a536fce1195ee1..6fd989431dad21a8171065ead27894bf3c76ddba 100644 (file)
@@ -33,7 +33,8 @@ typedef union {
         u64 r       :   1,
         w           :   1,
         x           :   1,
-        emt         :   4,
+        emt         :   3,
+        igmt        :   1,
         sp_avail    :   1,
         avail1      :   4,
         mfn         :   45,
@@ -47,7 +48,11 @@ typedef union {
 #define EPTE_SUPER_PAGE_MASK    0x80
 #define EPTE_MFN_MASK           0x1fffffffffff000
 #define EPTE_AVAIL1_MASK        0xF00
-#define EPTE_EMT_MASK           0x78
+#define EPTE_EMT_MASK           0x38
+#define EPTE_IGMT_MASK          0x40
+#define EPTE_AVAIL1_SHIFT       8
+#define EPTE_EMT_SHIFT          3
+#define EPTE_IGMT_SHIFT         6
 
 void vmx_asm_vmexit_handler(struct cpu_user_regs);
 void vmx_asm_do_vmentry(void);
@@ -96,6 +101,7 @@ void vmx_realmode(struct cpu_user_regs *regs);
 #define EXIT_REASON_INVALID_GUEST_STATE 33
 #define EXIT_REASON_MSR_LOADING         34
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
+#define EXIT_REASON_MONITOR_TRAP_FLAG   37
 #define EXIT_REASON_MONITOR_INSTRUCTION 39
 #define EXIT_REASON_PAUSE_INSTRUCTION   40
 #define EXIT_REASON_MACHINE_CHECK       41
@@ -351,9 +357,9 @@ static inline int __vmxon(u64 addr)
     return rc;
 }
 
-void vmx_inject_hw_exception(struct vcpu *v, int trap, int error_code);
-void vmx_inject_extint(struct vcpu *v, int trap);
-void vmx_inject_nmi(struct vcpu *v);
+void vmx_inject_hw_exception(int trap, int error_code);
+void vmx_inject_extint(int trap);
+void vmx_inject_nmi(void);
 
 void ept_p2m_init(struct domain *d);
 
index c92f6548b9605128e1e58033102570e856dc82b0..9598b7d7948d32eab78a7f55717514a57f4a9010 100644 (file)
@@ -67,7 +67,7 @@ struct vpmu_struct {
 #define VPMU_CONTEXT_ALLOCATED              0x1
 #define VPMU_CONTEXT_LOADED                 0x2
 #define VPMU_RUNNING                        0x4
-
+#define PASSIVE_DOMAIN_ALLOCATED           0x8
 int vpmu_do_wrmsr(struct cpu_user_regs *regs);
 int vpmu_do_rdmsr(struct cpu_user_regs *regs);
 int vpmu_do_interrupt(struct cpu_user_regs *regs);
index 131675862b9eaa62d970a02414e6e50d7c822301..fc115a97b22d5c3b5e63276bbf6efdbc40aaf965 100644 (file)
 #ifndef __ASM_X86_HVM_VPMU_CORE_H_
 #define __ASM_X86_HVM_VPMU_CORE_H_
 
-/* Core 2 Non-architectual Performance Counter MSRs. */
-u32 core2_counters_msr[] =   {
-    MSR_CORE_PERF_FIXED_CTR0,
-    MSR_CORE_PERF_FIXED_CTR1,
-    MSR_CORE_PERF_FIXED_CTR2};
-
-/* Core 2 Non-architectual Performance Control MSRs. */
-u32 core2_ctrls_msr[] = {
-    MSR_CORE_PERF_FIXED_CTR_CTRL,
-    MSR_IA32_PEBS_ENABLE,
-    MSR_IA32_DS_AREA};
-
-struct pmumsr core2_counters = {
-    3,
-    core2_counters_msr
-};
-
-struct pmumsr core2_ctrls = {
-    3,
-    core2_ctrls_msr
-};
-
 struct arch_msr_pair {
     u64 counter;
     u64 control;
index b43f5aea2e8dfd1b20496fced607b7a624eab1a8..a3384aee9de751be46828e62e57f7e3292455b5d 100644 (file)
 #include <asm/hvm/irq.h>
 #include <public/hvm/save.h>
 
-struct HPETState;
-struct HPET_timer_fn_info {
-    struct HPETState *hs;
-    unsigned int tn;
-};
-
-struct hpet_registers {
-    /* Memory-mapped, software visible registers */
-    uint64_t capability;        /* capabilities */
-    uint64_t config;            /* configuration */
-    uint64_t isr;               /* interrupt status reg */
-    uint64_t mc64;              /* main counter */
-    struct {                    /* timers */
-        uint64_t config;        /* configuration/cap */
-        uint64_t cmp;           /* comparator */
-        uint64_t fsb;           /* FSB route, not supported now */
-    } timers[HPET_TIMER_NUM];
-
-    /* Hidden register state */
-    uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */
-};
-
-typedef struct HPETState {
-    struct hpet_registers hpet;
-    struct vcpu *vcpu;
-    uint64_t stime_freq;
-    uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */
-    uint64_t hpet_to_ns_limit; /* max hpet ticks convertable to ns      */
-    uint64_t mc_offset;
-    struct timer timers[HPET_TIMER_NUM];
-    struct HPET_timer_fn_info timer_fn_info[HPET_TIMER_NUM]; 
-    spinlock_t lock;
-} HPETState;
-
-
 /*
  * Abstract layer of periodic time, one short time.
  */
@@ -108,6 +73,34 @@ typedef struct PITState {
     spinlock_t lock;
 } PITState;
 
+struct hpet_registers {
+    /* Memory-mapped, software visible registers */
+    uint64_t capability;        /* capabilities */
+    uint64_t config;            /* configuration */
+    uint64_t isr;               /* interrupt status reg */
+    uint64_t mc64;              /* main counter */
+    struct {                    /* timers */
+        uint64_t config;        /* configuration/cap */
+        uint64_t cmp;           /* comparator */
+        uint64_t fsb;           /* FSB route, not supported now */
+    } timers[HPET_TIMER_NUM];
+
+    /* Hidden register state */
+    uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */
+    uint64_t comparator64[HPET_TIMER_NUM]; /* 64 bit running comparator */
+};
+
+typedef struct HPETState {
+    struct hpet_registers hpet;
+    struct vcpu *vcpu;
+    uint64_t stime_freq;
+    uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */
+    uint64_t hpet_to_ns_limit; /* max hpet ticks convertible to ns      */
+    uint64_t mc_offset;
+    struct periodic_time pt[HPET_TIMER_NUM];
+    spinlock_t lock;
+} HPETState;
+
 typedef struct RTCState {
     /* Hardware state */
     struct hvm_hw_rtc hw;
@@ -160,13 +153,13 @@ void pt_migrate(struct vcpu *v);
  * The given periodic timer structure must be initialised with zero bytes,
  * except for the 'source' field which must be initialised with the
  * correct PTSRC_ value. The initialised timer structure can then be passed
- * to {create,destroy}_periodic_time() and number of times and in any order.
+ * to {create,destroy}_periodic_time() any number of times and in any order.
  * Note that, for a given periodic timer, invocations of these functions MUST
  * be serialised.
  */
 void create_periodic_time(
-    struct vcpu *v, struct periodic_time *pt, uint64_t period,
-    uint8_t irq, char one_shot, time_cb *cb, void *data);
+    struct vcpu *v, struct periodic_time *pt, uint64_t delta,
+    uint64_t period, uint8_t irq, time_cb *cb, void *data);
 void destroy_periodic_time(struct periodic_time *pt);
 
 int pv_pit_handler(int port, int data, int write);
@@ -185,7 +178,6 @@ void pmtimer_init(struct vcpu *v);
 void pmtimer_deinit(struct domain *d);
 void pmtimer_reset(struct domain *d);
 
-void hpet_migrate_timers(struct vcpu *v);
 void hpet_init(struct vcpu *v);
 void hpet_deinit(struct domain *d);
 void hpet_reset(struct domain *d);
index 6124d0fae06f18b4f610f62deaa7cfc0bde4609c..eaa77a8d7901acbeb011e03b3025936970989f2d 100644 (file)
@@ -20,6 +20,8 @@
                ((volatile int *)(__fix_to_virt(FIX_IO_APIC_BASE_0 + idx) \
                + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK)))
 
+#define IO_APIC_ID(idx) (mp_ioapics[idx].mpc_apicid)
+
 /*
  * The structure of the IO-APIC:
  */
@@ -180,13 +182,13 @@ extern int timer_uses_ioapic_pin_0;
 #endif /*CONFIG_ACPI_BOOT*/
 
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
-extern int ioapic_suspend(void);
-extern int ioapic_resume(void);
+extern void ioapic_suspend(void);
+extern void ioapic_resume(void);
 
 #else  /* !CONFIG_X86_IO_APIC */
 #define io_apic_assign_pci_irqs 0
-static inline int ioapic_suspend(void) {return 0};
-static inline int ioapic_resume(void) {return 0};
+static inline void ioapic_suspend(void) {}
+static inline void ioapic_resume(void) {}
 #endif
 
 extern int assign_irq_vector(int irq);
index c7463cb6f280fa3becc263ae3c3b1289380c1798..eee47228d4a583faabf24751e498b352020ed0cd 100644 (file)
@@ -14,7 +14,8 @@
 #define ioports_access_permitted(d, s, e)               \
     rangeset_contains_range((d)->arch.ioport_caps, s, e)
 
-#define cache_flush_permitted(d)                       \
-    (!rangeset_is_empty((d)->iomem_caps))
+#define cache_flush_permitted(d)                        \
+    (!rangeset_is_empty((d)->iomem_caps) ||             \
+     !rangeset_is_empty((d)->arch.ioport_caps))
 
 #endif /* __X86_IOCAP_H__ */
index 982f99f3c4e8d487623e78232e81e36e4c281909..108b065d17262782f440f821d7504d69d6064f0a 100644 (file)
 #define vector_to_irq(vec)  (vector_irq[vec])
 
 extern int vector_irq[NR_VECTORS];
-extern u8 irq_vector[NR_IRQ_VECTORS];
-#define AUTO_ASSIGN    -1
-#define NEVER_ASSIGN   -2
-#define FREE_TO_ASSIGN -3
+extern u8 irq_vector[NR_IRQS];
 
 #define platform_legacy_irq(irq)       ((irq) < 16)
 
@@ -33,6 +30,7 @@ fastcall void error_interrupt(void);
 fastcall void pmu_apic_interrupt(void);
 fastcall void spurious_interrupt(void);
 fastcall void thermal_interrupt(void);
+fastcall void cmci_interrupt(void);
 
 void disable_8259A_irq(unsigned int irq);
 void enable_8259A_irq(unsigned int irq);
@@ -51,7 +49,6 @@ extern unsigned long io_apic_irqs;
 extern atomic_t irq_err_count;
 extern atomic_t irq_mis_count;
 
-int pirq_acktype(struct domain *d, int irq);
 int pirq_shared(struct domain *d , int irq);
 
 int map_domain_pirq(struct domain *d, int pirq, int vector, int type,
@@ -60,8 +57,12 @@ int unmap_domain_pirq(struct domain *d, int pirq);
 int get_free_pirq(struct domain *d, int type, int index);
 void free_domain_pirqs(struct domain *d);
 
-#define domain_irq_to_vector(d, irq) ((d)->arch.pirq_vector[(irq)])
-#define domain_vector_to_irq(d, vec) ((d)->arch.vector_pirq[(vec)])
+#define domain_irq_to_vector(d, irq) ((d)->arch.pirq_vector[irq] ?: \
+                                      IO_APIC_IRQ(irq) ? 0 : LEGACY_VECTOR(irq))
+#define domain_vector_to_irq(d, vec) ((d)->arch.vector_pirq[vec] ?: \
+                                      ((vec) < FIRST_LEGACY_VECTOR || \
+                                       (vec) > LAST_LEGACY_VECTOR) ? \
+                                      0 : LEGACY_IRQ_FROM_VECTOR(vec))
 
 int pirq_guest_force_unbind(struct domain *d, int irq);
 
index 90b4e1ef0e9ab7f18c244c6f1d6482148732c6ed..30c3f89daf17d4a6c34409e6f9634e515ea6a0c0 100644 (file)
 #define THERMAL_APIC_VECTOR    0xfa
 #define LOCAL_TIMER_VECTOR     0xf9
 #define PMU_APIC_VECTOR        0xf8
-
+#define CMCI_APIC_VECTOR       0xf7
 /*
  * High-priority dynamically-allocated vectors. For interrupts that
  * must be higher priority than any guest-bound interrupt.
  */
 #define FIRST_HIPRIORITY_VECTOR        0xf0
-#define LAST_HIPRIORITY_VECTOR  0xf7
+#define LAST_HIPRIORITY_VECTOR  0xf6
 
 /* Legacy PIC uses vectors 0xe0-0xef. */
 #define FIRST_LEGACY_VECTOR    0xe0
@@ -30,8 +30,4 @@
 
 #define NR_VECTORS 256
 
-/* Limited by number of trap vectors. */
-#define NR_IRQS        NR_VECTORS
-#define NR_IRQ_VECTORS NR_IRQS
-
 #endif /* _ASM_IRQ_VECTORS_H */
index 1ead2a7c37d68ad6b86c23ec2d79cdf1ae1a75dc..980da227dcd8d31d0a24e656092c25dd50f1bb47 100644 (file)
@@ -2,11 +2,12 @@
 #define ASM_X86__MICROCODE_H
 
 struct cpu_signature;
+struct ucode_cpu_info;
 
 struct microcode_ops {
-    int (*get_matching_microcode)(void *mc, int cpu);
+    int (*microcode_resume_match)(int cpu, struct cpu_signature *nsig);
     int (*cpu_request_microcode)(int cpu, const void *buf, size_t size);
-    int (*collect_cpu_info)(int cpu_num, struct cpu_signature *csig);
+    int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
     int (*apply_microcode)(int cpu);
 };
 
@@ -43,28 +44,29 @@ struct extended_sigtable {
 };
 
 struct equiv_cpu_entry {
-    unsigned int installed_cpu;
-    unsigned int fixed_errata_mask;
-    unsigned int fixed_errata_compare;
-    unsigned int equiv_cpu;
-};
+    uint32_t installed_cpu;
+    uint32_t fixed_errata_mask;
+    uint32_t fixed_errata_compare;
+    uint16_t equiv_cpu;
+    uint16_t reserved;
+} __attribute__((packed));
 
 struct microcode_header_amd {
-    unsigned int  data_code;
-    unsigned int  patch_id;
-    unsigned char mc_patch_data_id[2];
-    unsigned char mc_patch_data_len;
-    unsigned char init_flag;
-    unsigned int  mc_patch_data_checksum;
-    unsigned int  nb_dev_id;
-    unsigned int  sb_dev_id;
-    unsigned char processor_rev_id[2];
-    unsigned char nb_rev_id;
-    unsigned char sb_rev_id;
-    unsigned char bios_api_rev;
-    unsigned char reserved1[3];
-    unsigned int  match_reg[8];
-};
+    uint32_t data_code;
+    uint32_t patch_id;
+    uint8_t  mc_patch_data_id[2];
+    uint8_t  mc_patch_data_len;
+    uint8_t  init_flag;
+    uint32_t mc_patch_data_checksum;
+    uint32_t nb_dev_id;
+    uint32_t sb_dev_id;
+    uint16_t processor_rev_id;
+    uint8_t  nb_rev_id;
+    uint8_t  sb_rev_id;
+    uint8_t  bios_api_rev;
+    uint8_t  reserved1[3];
+    uint32_t match_reg[8];
+} __attribute__((packed));
 
 struct microcode_amd {
     struct microcode_header_amd hdr;
@@ -79,11 +81,10 @@ struct cpu_signature {
 
 struct ucode_cpu_info {
     struct cpu_signature cpu_sig;
-    int valid;
     union {
         struct microcode_intel *mc_intel;
         struct microcode_amd *mc_amd;
-        void *valid_mc;
+        void *mc_valid;
     } mc;
 };
 
index abec46fbb3e4518e6bf691d6775a122f59de2e89..6772b4050575dec0732e5ec309025400bf8c9393 100644 (file)
  * Per-page-frame information.
  * 
  * Every architecture must ensure the following:
- *  1. 'struct page_info' contains a 'struct list_head list'.
+ *  1. 'struct page_info' contains a 'struct page_list_entry list'.
  *  2. Provide a PFN_ORDER() macro for accessing the order of a free page.
  */
-#define PFN_ORDER(_pfn) ((_pfn)->u.free.order)
+#define PFN_ORDER(_pfn) ((_pfn)->v.free.order)
+
+/*
+ * This definition is solely for the use in struct page_info (and
+ * struct page_list_head), intended to allow easy adjustment once x86-64
+ * wants to support more than 16TB.
+ * 'unsigned long' should be used for MFNs everywhere else.
+ */
+#define __mfn_t unsigned int
+#define PRpgmfn "08x"
+
+#undef page_list_entry
+struct page_list_entry
+{
+    __mfn_t next, prev;
+};
 
 struct page_info
 {
-    /* Each frame can be threaded onto a doubly-linked list. */
-    struct list_head list;
+    union {
+        /* Each frame can be threaded onto a doubly-linked list.
+         *
+         * For unused shadow pages, a list of pages of this order; for
+         * pinnable shadows, if pinned, a list of other pinned shadows
+         * (see sh_type_is_pinnable() below for the definition of
+         * "pinnable" shadow types).
+         */
+        struct page_list_entry list;
+        /* For non-pinnable shadows, a higher entry that points at us. */
+        paddr_t up;
+    };
 
     /* Reference count and various PGC_xxx flags and fields. */
-    u32 count_info;
+    unsigned long count_info;
 
     /* Context-dependent fields follow... */
     union {
 
         /* Page is in use: ((count_info & PGC_count_mask) != 0). */
         struct {
-            /* Owner of this page (NULL if page is anonymous). */
-            u32 _domain; /* pickled format */
             /* Type reference count and various PGT_xxx flags and fields. */
             unsigned long type_info;
-        } __attribute__ ((packed)) inuse;
+        } inuse;
+
+        /* Page is in use as a shadow: count_info == 0. */
+        struct {
+            unsigned long type:5;   /* What kind of shadow is this? */
+            unsigned long pinned:1; /* Is the shadow pinned? */
+            unsigned long count:26; /* Reference count */
+        } sh;
 
         /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
         struct {
-            /* Order-size of the free chunk this page is the head of. */
-            u32 order;
-            /* Mask of possibly-tainted TLBs. */
-            cpumask_t cpumask;
-        } __attribute__ ((packed)) free;
+            /* Do TLBs need flushing for safety before next page use? */
+            bool_t need_tlbflush;
+        } free;
 
     } u;
 
-#if defined(__x86_64__)
-    spinlock_t lock;
-#endif
+    union {
+
+        /* Page is in use, but not as a shadow. */
+        struct {
+            /* Owner of this page (NULL if page is anonymous). */
+            u32 _domain; /* pickled format */
+        } inuse;
+
+        /* Page is in use as a shadow. */
+        struct {
+            /* GMFN of guest page we're a shadow of. */
+            __mfn_t back;
+        } sh;
+
+        /* Page is on a free list (including shadow code free lists). */
+        struct {
+            /* Order-size of the free chunk this page is the head of. */
+            unsigned int order;
+        } free;
+
+    } v;
 
     union {
         /*
@@ -61,12 +107,36 @@ struct page_info
         /*
          * When PGT_partial is true then this field is valid and indicates
          * that PTEs in the range [0, @nr_validated_ptes) have been validated.
-         * If @partial_pte is true then PTE at @nr_validated_ptes+1 has been
-         * partially validated.
+         * An extra page reference must be acquired (or not dropped) whenever
+         * PGT_partial gets set, and it must be dropped when the flag gets
+         * cleared. This is so that a get() leaving a page in partially
+         * validated state (where the caller would drop the reference acquired
+         * due to the getting of the type [apparently] failing [-EAGAIN])
+         * would not accidentally result in a page left with zero general
+         * reference count, but non-zero type reference count (possible when
+         * the partial get() is followed immediately by domain destruction).
+         * Likewise, the ownership of the single type reference for partially
+         * (in-)validated pages is tied to this flag, i.e. the instance
+         * setting the flag must not drop that reference, whereas the instance
+         * clearing it will have to.
+         *
+         * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
+         * been partially validated. This implies that the general reference
+         * to the page (acquired from get_page_from_lNe()) would be dropped
+         * (again due to the apparent failure) and hence must be re-acquired
+         * when resuming the validation, but must not be dropped when picking
+         * up the page for invalidation.
+         *
+         * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
+         * been partially invalidated. This is basically the opposite case of
+         * above, i.e. the general reference to the page was not dropped in
+         * put_page_from_lNe() (due to the apparent failure), and hence it
+         * must be dropped when the put operation is resumed (and completes),
+         * but it must not be acquired if picking up the page for validation.
          */
         struct {
             u16 nr_validated_ptes;
-            bool_t partial_pte;
+            s8 partial_pte;
         };
 
         /*
@@ -75,69 +145,98 @@ struct page_info
          * tracked for TLB-flush avoidance when a guest runs in shadow mode.
          */
         u32 shadow_flags;
+
+        /* When in use as a shadow, next shadow in this hash chain. */
+        __mfn_t next_shadow;
     };
 };
 
+#undef __mfn_t
+
+#define PG_shift(idx)   (BITS_PER_LONG - (idx))
+#define PG_mask(x, idx) (x ## UL << PG_shift(idx))
+
  /* The following page types are MUTUALLY EXCLUSIVE. */
-#define PGT_none            (0U<<29) /* no special uses of this page */
-#define PGT_l1_page_table   (1U<<29) /* using this page as an L1 page table? */
-#define PGT_l2_page_table   (2U<<29) /* using this page as an L2 page table? */
-#define PGT_l3_page_table   (3U<<29) /* using this page as an L3 page table? */
-#define PGT_l4_page_table   (4U<<29) /* using this page as an L4 page table? */
-#define PGT_seg_desc_page   (5U<<29) /* using this page in a GDT/LDT? */
-#define PGT_writable_page   (7U<<29) /* has writable mappings of this page? */
-#define PGT_type_mask       (7U<<29) /* Bits 29-31. */
+#define PGT_none          PG_mask(0, 3) /* no special uses of this page */
+#define PGT_l1_page_table PG_mask(1, 3) /* using as an L1 page table? */
+#define PGT_l2_page_table PG_mask(2, 3) /* using as an L2 page table? */
+#define PGT_l3_page_table PG_mask(3, 3) /* using as an L3 page table? */
+#define PGT_l4_page_table PG_mask(4, 3) /* using as an L4 page table? */
+#define PGT_seg_desc_page PG_mask(5, 3) /* using this page in a GDT/LDT? */
+#define PGT_writable_page PG_mask(7, 3) /* has writable mappings? */
+#define PGT_type_mask     PG_mask(7, 3) /* Bits 29-31. */
 
  /* Owning guest has pinned this page to its current type? */
-#define _PGT_pinned         28
-#define PGT_pinned          (1U<<_PGT_pinned)
+#define _PGT_pinned       PG_shift(4)
+#define PGT_pinned        PG_mask(1, 4)
  /* Has this page been validated for use as its current type? */
-#define _PGT_validated      27
-#define PGT_validated       (1U<<_PGT_validated)
+#define _PGT_validated    PG_shift(5)
+#define PGT_validated     PG_mask(1, 5)
  /* PAE only: is this an L2 page directory containing Xen-private mappings? */
-#define _PGT_pae_xen_l2     26
-#define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)
+#define _PGT_pae_xen_l2   PG_shift(6)
+#define PGT_pae_xen_l2    PG_mask(1, 6)
 /* Has this page been *partially* validated for use as its current type? */
-#define _PGT_partial        25
-#define PGT_partial         (1U<<_PGT_partial)
+#define _PGT_partial      PG_shift(7)
+#define PGT_partial       PG_mask(1, 7)
+ /* Page is locked? */
+#define _PGT_locked       PG_shift(8)
+#define PGT_locked        PG_mask(1, 8)
 
- /* 25-bit count of uses of this frame as its current type. */
-#define PGT_count_mask      ((1U<<25)-1)
+ /* Count of uses of this frame as its current type. */
+#define PGT_count_width   PG_shift(8)
+#define PGT_count_mask    ((1UL<<PGT_count_width)-1)
 
  /* Cleared when the owning guest 'frees' this page. */
-#define _PGC_allocated      31
-#define PGC_allocated       (1U<<_PGC_allocated)
-#if defined(__i386__)
- /* Page is locked? */
-# define _PGC_locked        30
-# define PGC_locked         (1U<<_PGC_out_of_sync)
-#endif
+#define _PGC_allocated    PG_shift(1)
+#define PGC_allocated     PG_mask(1, 1)
+ /* Page is Xen heap? */
+#define _PGC_xen_heap     PG_shift(2)
+#define PGC_xen_heap      PG_mask(1, 2)
  /* Set when is using a page as a page table */
-#define _PGC_page_table     29
-#define PGC_page_table      (1U<<_PGC_page_table)
+#define _PGC_page_table   PG_shift(3)
+#define PGC_page_table    PG_mask(1, 3)
  /* 3-bit PAT/PCD/PWT cache-attribute hint. */
-#define PGC_cacheattr_base  26
-#define PGC_cacheattr_mask  (7U<<PGC_cacheattr_base)
- /* 26-bit count of references to this frame. */
-#define PGC_count_mask      ((1U<<26)-1)
+#define PGC_cacheattr_base PG_shift(6)
+#define PGC_cacheattr_mask PG_mask(7, 6)
+ /* Page is broken? */
+#define _PGC_broken         PG_shift(7)
+#define PGC_broken          PG_mask(1, 7)
+ /* Page is offline pending ? */
+#define _PGC_offlining      PG_shift(8)
+#define PGC_offlining       PG_mask(1, 8)
+ /* Page is offlined */
+#define _PGC_offlined       PG_shift(9)
+#define PGC_offlined        PG_mask(1, 9)
+#define PGC_offlined_broken (PGC_offlined | PGC_broken)
+
+ /* Count of references to this frame. */
+#define PGC_count_width   PG_shift(9)
+#define PGC_count_mask    ((1UL<<PGC_count_width)-1)
+
+#define is_page_offlining(page)  ((page)->count_info & PGC_offlining)
+#define is_page_offlined(page)   ((page)->count_info & PGC_offlined)
+#define is_page_broken(page)     ((page)->count_info & PGC_broken)
+#define is_page_online(page)     (!is_page_offlined(page))
 
+#if defined(__i386__)
 #define is_xen_heap_page(page) is_xen_heap_mfn(page_to_mfn(page))
 #define is_xen_heap_mfn(mfn) ({                         \
     unsigned long _mfn = (mfn);                         \
-    ((_mfn >= paddr_to_pfn(xenheap_phys_start)) &&      \
-     (_mfn < paddr_to_pfn(xenheap_phys_end)));          \
+    (_mfn < paddr_to_pfn(xenheap_phys_end));            \
 })
+#else
+extern unsigned long allocator_bitmap_end;
+#define is_xen_heap_page(page) ((page)->count_info & PGC_xen_heap)
+#define is_xen_heap_mfn(mfn) \
+    (__mfn_valid(mfn) && is_xen_heap_page(__mfn_to_page(mfn)))
+#define is_xen_fixed_mfn(mfn) \
+    ( (mfn << PAGE_SHIFT) >= __pa(&_start) &&    \
+          (mfn << PAGE_SHIFT) <= allocator_bitmap_end )
+#endif
 
 #if defined(__i386__)
-#define pickle_domptr(_d)   ((u32)(unsigned long)(_d))
-static inline struct domain *unpickle_domptr(u32 _domain)
-{ return (_domain & 1) ? NULL : (void *)_domain; }
 #define PRtype_info "08lx" /* should only be used for printk's */
 #elif defined(__x86_64__)
-static inline struct domain *unpickle_domptr(u32 _domain)
-{ return ((_domain == 0) || (_domain & 1)) ? NULL : __va(_domain); }
-static inline u32 pickle_domptr(struct domain *domain)
-{ return (domain == NULL) ? 0 : (u32)__pa(domain); }
 #define PRtype_info "016lx"/* should only be used for printk's */
 #endif
 
@@ -150,8 +249,11 @@ static inline u32 pickle_domptr(struct domain *domain)
 /* OOS fixup entries */
 #define SHADOW_OOS_FIXUPS 2
 
-#define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
-#define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
+#define page_get_owner(_p)                                              \
+    ((struct domain *)((_p)->v.inuse._domain ?                          \
+                       mfn_to_virt((_p)->v.inuse._domain) : NULL))
+#define page_set_owner(_p,_d)                                           \
+    ((_p)->v.inuse._domain = (_d) ? virt_to_mfn(_d) : 0)
 
 #define maddr_get_owner(ma)   (page_get_owner(maddr_to_page((ma))))
 #define vaddr_get_owner(va)   (page_get_owner(virt_to_page((va))))
@@ -176,6 +278,7 @@ void cleanup_page_cacheattr(struct page_info *page);
 
 int is_iomem_page(unsigned long mfn);
 
+struct domain *page_get_owner_and_reference(struct page_info *page);
 void put_page(struct page_info *page);
 int  get_page(struct page_info *page, struct domain *domain);
 void put_page_type(struct page_info *page);
@@ -239,6 +342,7 @@ pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab);
 
 int check_descriptor(const struct domain *, struct desc_struct *d);
 
+extern int opt_allow_hugepage;
 
 /******************************************************************************
  * With shadow pagetables, the different kinds of address start 
index c72f9d69c5aa6243f74226fc690e214edb5c1e32..56387c6004e79ac2f89bdab91f2a979c400b110c 100644 (file)
@@ -49,9 +49,9 @@
 
 /* MAX fixed pages reserved for mapping MSIX tables. */
 #if defined(__x86_64__)
-#define MAX_MSIX_PAGES              512
+#define FIX_MSIX_MAX_PAGES              512
 #else
-#define MAX_MSIX_PAGES              32
+#define FIX_MSIX_MAX_PAGES              32
 #endif
 
 struct msi_info {
@@ -68,13 +68,20 @@ struct msi_msg {
        u32     data;           /* 16 bits of msi message data */
 };
 
+struct msi_desc;
 /* Helper functions */
-extern void mask_msi_irq(unsigned int irq);
-extern void unmask_msi_irq(unsigned int irq);
-extern void set_msi_irq_affinity(unsigned int irq, cpumask_t mask);
-extern int pci_enable_msi(struct msi_info *msi);
-extern void pci_disable_msi(int vector);
+extern void mask_msi_vector(unsigned int vector);
+extern void unmask_msi_vector(unsigned int vector);
+extern void set_msi_affinity(unsigned int vector, cpumask_t mask);
+extern int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc);
+extern void pci_disable_msi(struct msi_desc *desc);
 extern void pci_cleanup_msi(struct pci_dev *pdev);
+extern int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc);
+extern void teardown_msi_vector(int vector);
+extern int msi_free_vector(struct msi_desc *entry);
+extern int pci_restore_msi_state(struct pci_dev *pdev);
+
+extern unsigned int pci_msix_get_table_len(struct pci_dev *pdev);
 
 struct msi_desc {
        struct {
@@ -88,7 +95,7 @@ struct msi_desc {
 
        struct list_head list;
 
-       void __iomem *mask_base;
+       void __iomem *mask_base;        /* va for the entry in mask table */
        struct pci_dev *dev;
        int vector;
 
@@ -97,6 +104,8 @@ struct msi_desc {
        int remap_index;                /* index in interrupt remapping table */
 };
 
+int msi_maskable_irq(const struct msi_desc *);
+
 /*
  * Assume the maximum number of hot plug slots supported by the system is about
  * ten. The worstcase is that each of these slots is hot-added with a device,
index d3fc446271d90987cbe27f962f9313486e9e3150..f6b3fd1ed1e0b5ca8350d62e38184f2ead6d3ddb 100644 (file)
 #define MSR_IA32_MC0_STATUS            0x00000401
 #define MSR_IA32_MC0_ADDR              0x00000402
 #define MSR_IA32_MC0_MISC              0x00000403
+#define MSR_IA32_MC0_CTL2              0x00000280
+#define CMCI_EN                        (1UL<<30)
+#define CMCI_THRESHOLD_MASK            0x7FFF
 
 #define MSR_IA32_MC1_CTL               0x00000404
+#define MSR_IA32_MC1_CTL2              0x00000281
 #define MSR_IA32_MC1_STATUS            0x00000405
 #define MSR_IA32_MC1_ADDR              0x00000406
 #define MSR_IA32_MC1_MISC              0x00000407
 
 #define MSR_IA32_MC2_CTL               0x00000408
+#define MSR_IA32_MC2_CTL2              0x00000282
 #define MSR_IA32_MC2_STATUS            0x00000409
 #define MSR_IA32_MC2_ADDR              0x0000040A
 #define MSR_IA32_MC2_MISC              0x0000040B
 
+#define MSR_IA32_MC3_CTL2              0x00000283
 #define MSR_IA32_MC3_CTL               0x0000040C
 #define MSR_IA32_MC3_STATUS            0x0000040D
 #define MSR_IA32_MC3_ADDR              0x0000040E
 #define MSR_IA32_MC3_MISC              0x0000040F
 
+#define MSR_IA32_MC4_CTL2              0x00000284
 #define MSR_IA32_MC4_CTL               0x00000410
 #define MSR_IA32_MC4_STATUS            0x00000411
 #define MSR_IA32_MC4_ADDR              0x00000412
 #define MSR_IA32_MC4_MISC              0x00000413
 
+#define MSR_IA32_MC5_CTL2              0x00000285
 #define MSR_IA32_MC5_CTL               0x00000414
 #define MSR_IA32_MC5_STATUS            0x00000415
 #define MSR_IA32_MC5_ADDR              0x00000416
 #define MSR_IA32_MC5_MISC              0x00000417
 
+#define MSR_IA32_MC6_CTL2              0x00000286
+#define MSR_IA32_MC6_CTL               0x00000418
+#define MSR_IA32_MC6_STATUS            0x00000419
+#define MSR_IA32_MC6_ADDR              0x0000041A
+#define MSR_IA32_MC6_MISC              0x0000041B
+
+#define MSR_IA32_MC7_CTL2              0x00000287
+#define MSR_IA32_MC7_CTL               0x0000041C
+#define MSR_IA32_MC7_STATUS            0x0000041D
+#define MSR_IA32_MC7_ADDR              0x0000041E
+#define MSR_IA32_MC7_MISC              0x0000041F
+
+#define MSR_IA32_MC8_CTL2              0x00000288
+#define MSR_IA32_MC8_CTL               0x00000420
+#define MSR_IA32_MC8_STATUS            0x00000421
+#define MSR_IA32_MC8_ADDR              0x00000422
+#define MSR_IA32_MC8_MISC              0x00000423
+
 #define MSR_P6_PERFCTR0                        0x000000c1
 #define MSR_P6_PERFCTR1                        0x000000c2
 #define MSR_P6_EVNTSEL0                        0x00000186
index 625a06bdeb1382dd0ce26628ca74625ffde3273d..03285db5bda71b2f91c3509a2e3e32952de8b8cc 100644 (file)
@@ -11,6 +11,7 @@
 #define MTRR_TYPE_WRBACK     6
 #define MTRR_NUM_TYPES       7
 #define MEMORY_NUM_TYPES     MTRR_NUM_TYPES
+#define NO_HARDCODE_MEM_TYPE    MTRR_NUM_TYPES
 
 #define NORMAL_CACHE_MODE          0
 #define NO_FILL_CACHE_MODE         2
@@ -63,10 +64,12 @@ extern int mtrr_del(int reg, unsigned long base, unsigned long size);
 extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
 extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
 extern u32 get_pat_flags(struct vcpu *v, u32 gl1e_flags, paddr_t gpaddr,
-                  paddr_t spaddr);
-extern uint8_t epte_get_entry_emt(struct domain *d, unsigned long gfn, unsigned long mfn);
-extern void ept_change_entry_emt_with_range(struct domain *d, unsigned long start_gfn,
-                 unsigned long end_gfn);
+                  paddr_t spaddr, uint8_t gmtrr_mtype);
+extern uint8_t epte_get_entry_emt(
+    struct domain *d, unsigned long gfn, unsigned long mfn,
+    uint8_t *igmt, int direct_mmio);
+extern void ept_change_entry_emt_with_range(
+    struct domain *d, unsigned long start_gfn, unsigned long end_gfn);
 extern unsigned char pat_type_2_pte_flags(unsigned char pat_type);
 
 #endif /* __ASM_X86_MTRR_H__ */
index 4909c065910596160b4a014ebaeb2f9350282503..303191ee482d5874716d10d9faac5c46647cd8be 100644 (file)
@@ -64,8 +64,15 @@ typedef enum {
     p2m_ram_ro = 3,             /* Read-only; writes are silently dropped */
     p2m_mmio_dm = 4,            /* Reads and write go to the device model */
     p2m_mmio_direct = 5,        /* Read/write mapping of genuine MMIO area */
+    p2m_populate_on_demand = 6, /* Place-holder for empty memory */
 } p2m_type_t;
 
+typedef enum {
+    p2m_query = 0,              /* Do not populate a PoD entries      */
+    p2m_alloc = 1,              /* Automatically populate PoD entries */
+    p2m_guest = 2,              /* Guest demand-fault; implies alloc  */
+} p2m_query_t;
+
 /* We use bitmaps and maks to handle groups of types */
 #define p2m_to_mask(_t) (1UL << (_t))
 
@@ -82,12 +89,20 @@ typedef enum {
 #define P2M_RO_TYPES (p2m_to_mask(p2m_ram_logdirty)     \
                       | p2m_to_mask(p2m_ram_ro))
 
+#define P2M_MAGIC_TYPES (p2m_to_mask(p2m_populate_on_demand))
+
 /* Useful predicates */
 #define p2m_is_ram(_t) (p2m_to_mask(_t) & P2M_RAM_TYPES)
 #define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES)
 #define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
+#define p2m_is_magic(_t) (p2m_to_mask(_t) & P2M_MAGIC_TYPES)
 #define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES))
 
+/* Populate-on-demand */
+#define POPULATE_ON_DEMAND_MFN  (1<<9)
+#define POD_PAGE_ORDER 9
+
+
 struct p2m_domain {
     /* Lock that protects updates to the p2m */
     spinlock_t         lock;
@@ -95,7 +110,7 @@ struct p2m_domain {
     const char        *locker_function; /* Func that took it */
 
     /* Pages used to construct the p2m */
-    struct list_head   pages;
+    struct page_list_head pages;
 
     /* Functions to call to get or free pages for the p2m */
     struct page_info * (*alloc_page  )(struct domain *d);
@@ -105,15 +120,42 @@ struct p2m_domain {
                                        mfn_t mfn, unsigned int page_order,
                                        p2m_type_t p2mt);
     mfn_t              (*get_entry   )(struct domain *d, unsigned long gfn,
-                                       p2m_type_t *p2mt);
+                                       p2m_type_t *p2mt,
+                                       p2m_query_t q);
     mfn_t              (*get_entry_current)(unsigned long gfn,
-                                            p2m_type_t *p2mt);
+                                            p2m_type_t *p2mt,
+                                            p2m_query_t q);
     void               (*change_entry_type_global)(struct domain *d,
                                                    p2m_type_t ot,
                                                    p2m_type_t nt);
 
     /* Highest guest frame that's ever been mapped in the p2m */
     unsigned long max_mapped_pfn;
+
+    /* Populate-on-demand variables
+     * NB on locking.  {super,single,count} are
+     * covered by d->page_alloc_lock, since they're almost always used in
+     * conjunction with that functionality.  {entry_count} is covered by
+     * the domain p2m lock, since it's almost always used in conjunction
+     * with changing the p2m tables.
+     *
+     * At this point, both locks are held in two places.  In both,
+     * the order is [p2m,page_alloc]:
+     * + p2m_pod_decrease_reservation() calls p2m_pod_cache_add(),
+     *   which grabs page_alloc
+     * + p2m_pod_demand_populate() grabs both; the p2m lock to avoid
+     *   double-demand-populating of pages, the page_alloc lock to
+     *   protect moving stuff from the PoD cache to the domain page list.
+     */
+    struct {
+        struct page_list_head super,   /* List of superpages                */
+                         single;       /* Non-super lists                   */
+        int              count,        /* # of pages in cache lists         */
+                         entry_count;  /* # of pages in p2m marked pod      */
+        unsigned         reclaim_super; /* Last gpfn of a scan */
+        unsigned         reclaim_single; /* Last gpfn of a scan */
+        unsigned         max_guest;    /* gpfn of max guest demand-populate */
+    } pod;
 };
 
 /* Extract the type from the PTE flags that store it */
@@ -123,23 +165,26 @@ static inline p2m_type_t p2m_flags_to_type(unsigned long flags)
     return (flags >> 9) & 0x7;
 }
 
-/* Read the current domain's p2m table. */
-static inline mfn_t gfn_to_mfn_current(unsigned long gfn, p2m_type_t *t)
+/* Read the current domain's p2m table.  Do not populate PoD pages. */
+static inline mfn_t gfn_to_mfn_type_current(unsigned long gfn, p2m_type_t *t,
+                                            p2m_query_t q)
 {
-    return current->domain->arch.p2m->get_entry_current(gfn, t);
+    return current->domain->arch.p2m->get_entry_current(gfn, t, q);
 }
 
-/* Read another domain's P2M table, mapping pages as we go */
+/* Read another domain's P2M table, mapping pages as we go.
+ * Do not populate PoD pages. */
 static inline
-mfn_t gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t)
+mfn_t gfn_to_mfn_type_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t,
+                              p2m_query_t q)
 {
-    return d->arch.p2m->get_entry(d, gfn, t);
+    return d->arch.p2m->get_entry(d, gfn, t, q);
 }
 
 /* General conversion function from gfn to mfn */
-#define gfn_to_mfn(d, g, t) _gfn_to_mfn((d), (g), (t))
-static inline mfn_t _gfn_to_mfn(struct domain *d,
-                                unsigned long gfn, p2m_type_t *t)
+static inline mfn_t _gfn_to_mfn_type(struct domain *d,
+                                     unsigned long gfn, p2m_type_t *t,
+                                     p2m_query_t q)
 {
     if ( !paging_mode_translate(d) )
     {
@@ -149,11 +194,18 @@ static inline mfn_t _gfn_to_mfn(struct domain *d,
         return _mfn(gfn);
     }
     if ( likely(current->domain == d) )
-        return gfn_to_mfn_current(gfn, t);
+        return gfn_to_mfn_type_current(gfn, t, q);
     else
-        return gfn_to_mfn_foreign(d, gfn, t);
+        return gfn_to_mfn_type_foreign(d, gfn, t, q);
 }
 
+#define gfn_to_mfn(d, g, t) _gfn_to_mfn_type((d), (g), (t), p2m_alloc)
+#define gfn_to_mfn_query(d, g, t) _gfn_to_mfn_type((d), (g), (t), p2m_query)
+#define gfn_to_mfn_guest(d, g, t) _gfn_to_mfn_type((d), (g), (t), p2m_guest)
+
+#define gfn_to_mfn_current(g, t) gfn_to_mfn_type_current((g), (t), p2m_alloc)
+#define gfn_to_mfn_foreign(d, g, t) gfn_to_mfn_type_foreign((d), (g), (t), p2m_alloc)
+
 /* Compatibility function exporting the old untyped interface */
 static inline unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn)
 {
@@ -202,11 +254,33 @@ int p2m_alloc_table(struct domain *d,
 void p2m_teardown(struct domain *d);
 void p2m_final_teardown(struct domain *d);
 
+/* Dump PoD information about the domain */
+void p2m_pod_dump_data(struct domain *d);
+
+/* Move all pages from the populate-on-demand cache to the domain page_list
+ * (usually in preparation for domain destruction) */
+void p2m_pod_empty_cache(struct domain *d);
+
+/* Set populate-on-demand cache size so that the total memory allocated to a
+ * domain matches target */
+int p2m_pod_set_mem_target(struct domain *d, unsigned long target);
+
+/* Call when decreasing memory reservation to handle PoD entries properly.
+ * Will return '1' if all entries were handled and nothing more need be done.*/
+int
+p2m_pod_decrease_reservation(struct domain *d,
+                             xen_pfn_t gpfn,
+                             unsigned int order);
+
 /* Add a page to a domain's p2m table */
 int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
                             unsigned long mfn, unsigned int page_order, 
                             p2m_type_t t);
 
+/* Set a p2m range as populate-on-demand */
+int guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
+                                          unsigned int order);
+
 /* Untyped version for RAM only, for compatibility 
  *
  * Return 0 for success
index c6b2e53d21d7a569e48438d555936b2931be1346..64b2246995f6ab1f850cb322e318700d4f0fb600 100644 (file)
@@ -215,33 +215,52 @@ void clear_page_sse2(void *);
 #define clear_page(_p)      (cpu_has_xmm2 ?                             \
                              clear_page_sse2((void *)(_p)) :            \
                              (void)memset((void *)(_p), 0, PAGE_SIZE))
-#define copy_page(_t,_f)    memcpy((void *)(_t), (void *)(_f), PAGE_SIZE)
+void copy_page_sse2(void *, const void *);
+#define copy_page(_t,_f)    (cpu_has_xmm2 ?                             \
+                             copy_page_sse2(_t, _f) :                   \
+                             (void)memcpy(_t, _f, PAGE_SIZE))
 
-#define mfn_valid(mfn)      ((mfn) < max_page)
+#define __mfn_valid(mfn)    ((mfn) < max_page)
 
 /* Convert between Xen-heap virtual addresses and machine addresses. */
 #define __pa(x)             (virt_to_maddr(x))
 #define __va(x)             (maddr_to_virt(x))
 
 /* Convert between Xen-heap virtual addresses and machine frame numbers. */
-#define virt_to_mfn(va)     (virt_to_maddr(va) >> PAGE_SHIFT)
-#define mfn_to_virt(mfn)    (maddr_to_virt(mfn << PAGE_SHIFT))
+#define __virt_to_mfn(va)   (virt_to_maddr(va) >> PAGE_SHIFT)
+#define __mfn_to_virt(mfn)  (maddr_to_virt((paddr_t)(mfn) << PAGE_SHIFT))
 
 /* Convert between machine frame numbers and page-info structures. */
-#define mfn_to_page(mfn)    (frame_table + (mfn))
-#define page_to_mfn(pg)     ((unsigned long)((pg) - frame_table))
+#define __mfn_to_page(mfn)  (frame_table + (mfn))
+#define __page_to_mfn(pg)   ((unsigned long)((pg) - frame_table))
 
 /* Convert between machine addresses and page-info structures. */
-#define maddr_to_page(ma)   (frame_table + ((ma) >> PAGE_SHIFT))
-#define page_to_maddr(pg)   ((paddr_t)((pg) - frame_table) << PAGE_SHIFT)
+#define __maddr_to_page(ma) (frame_table + ((ma) >> PAGE_SHIFT))
+#define __page_to_maddr(pg) ((paddr_t)((pg) - frame_table) << PAGE_SHIFT)
 
 /* Convert between Xen-heap virtual addresses and page-info structures. */
-#define virt_to_page(va)    (frame_table + (__pa(va) >> PAGE_SHIFT))
-#define page_to_virt(pg)    (maddr_to_virt(page_to_maddr(pg)))
+#define __virt_to_page(va)  (frame_table + (__pa(va) >> PAGE_SHIFT))
+#define __page_to_virt(pg)  (maddr_to_virt(page_to_maddr(pg)))
 
 /* Convert between frame number and address formats.  */
-#define pfn_to_paddr(pfn)   ((paddr_t)(pfn) << PAGE_SHIFT)
-#define paddr_to_pfn(pa)    ((unsigned long)((pa) >> PAGE_SHIFT))
+#define __pfn_to_paddr(pfn) ((paddr_t)(pfn) << PAGE_SHIFT)
+#define __paddr_to_pfn(pa)  ((unsigned long)((pa) >> PAGE_SHIFT))
+
+/*
+ * We define non-underscored wrappers for above conversion functions. These are
+ * overridden in various source files while underscored versions remain intact.
+ */
+#define mfn_valid(mfn)      __mfn_valid(mfn)
+#define virt_to_mfn(va)     __virt_to_mfn(va)
+#define mfn_to_virt(mfn)    __mfn_to_virt(mfn)
+#define mfn_to_page(mfn)    __mfn_to_page(mfn)
+#define page_to_mfn(pg)     __page_to_mfn(pg)
+#define maddr_to_page(ma)   __maddr_to_page(ma)
+#define page_to_maddr(pg)   __page_to_maddr(pg)
+#define virt_to_page(va)    __virt_to_page(va)
+#define page_to_virt(pg)    __page_to_virt(pg)
+#define pfn_to_paddr(pfn)   __pfn_to_paddr(pfn)
+#define paddr_to_pfn(pa)    __paddr_to_pfn(pa)
 
 #endif /* !defined(__ASSEMBLY__) */
 
@@ -278,7 +297,6 @@ extern unsigned int   m2p_compat_vstart;
 #endif
 void paging_init(void);
 void setup_idle_pagetable(void);
-unsigned long clone_idle_pagetable(struct vcpu *);
 #endif /* !defined(__ASSEMBLY__) */
 
 #define _PAGE_PRESENT  0x001U
@@ -314,6 +332,9 @@ unsigned long clone_idle_pagetable(struct vcpu *);
 #define __PAGE_HYPERVISOR_NOCACHE \
     (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED)
 
+#define GRANT_PTE_FLAGS \
+    (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX | _PAGE_GNTTAB)
+
 #ifndef __ASSEMBLY__
 
 static inline int get_order_from_bytes(paddr_t size)
index d3970f1f825e01f676b60f058d1c7aa3d8a17fa1..11e96ee957370e4edbac830bb6133edfe5d941ec 100644 (file)
@@ -336,7 +336,7 @@ void paging_dump_vcpu_info(struct vcpu *v);
  * Access to the guest pagetables */
 
 /* Get a mapping of a PV guest's l1e for this virtual address. */
-static inline void *
+static inline l1_pgentry_t *
 guest_map_l1e(struct vcpu *v, unsigned long addr, unsigned long *gl1mfn)
 {
     l2_pgentry_t l2e;
@@ -354,15 +354,14 @@ guest_map_l1e(struct vcpu *v, unsigned long addr, unsigned long *gl1mfn)
          != _PAGE_PRESENT )
         return NULL;
     *gl1mfn = l2e_get_pfn(l2e);
-    return &__linear_l1_table[l1_linear_offset(addr)];
+    return (l1_pgentry_t *)map_domain_page(*gl1mfn) + l1_table_offset(addr);
 }
 
 /* Pull down the mapping we got from guest_map_l1e() */
 static inline void
 guest_unmap_l1e(struct vcpu *v, void *p)
 {
-    if ( unlikely(paging_mode_translate(v->domain)) )
-        unmap_domain_page(p);
+    unmap_domain_page(p);
 }
 
 /* Read the guest's l1e that maps this address. */
index 5d71b2aabf9a66a9eba521e717203af0fdd2acb8..31bcdfbbd0f39996e045a3d1404cfa049887b7ff 100644 (file)
@@ -1,6 +1,5 @@
 #ifndef __ASM_PERFC_H__
 #define __ASM_PERFC_H__
-#include <asm/mm.h>
 
 static inline void arch_perfc_printall(void)
 {
index 784aa9eb5a21ccc987fc2e9455c8d73fe89a8553..99a95cdb51067e9873ee9dcc046b4b673ba3156c 100644 (file)
@@ -33,6 +33,7 @@ PERFCOUNTER(ptwr_emulations,        "writable pt emulations")
 
 PERFCOUNTER(exception_fixed,        "pre-exception fixed")
 
+PERFCOUNTER(guest_walk,            "guest pagetable walks")
 
 /* Shadow counters */
 PERFCOUNTER(shadow_alloc,          "calls to shadow_alloc")
@@ -92,7 +93,6 @@ PERFCOUNTER(shadow_unshadow,       "shadow unshadows a page")
 PERFCOUNTER(shadow_up_pointer,     "shadow unshadow by up-pointer")
 PERFCOUNTER(shadow_unshadow_bf,    "shadow unshadow brute-force")
 PERFCOUNTER(shadow_get_page_fail,  "shadow_get_page_from_l1e failed")
-PERFCOUNTER(shadow_guest_walk,     "shadow walks guest tables")
 PERFCOUNTER(shadow_check_gwalk,    "shadow checks gwalk")
 PERFCOUNTER(shadow_inconsistent_gwalk, "shadow check inconsistent gwalk")
 PERFCOUNTER(shadow_rm_write_flush_tlb,
@@ -127,4 +127,7 @@ PERFCOUNTER(mshv_wrmsr_icr,             "MS Hv wrmsr icr")
 PERFCOUNTER(mshv_wrmsr_tpr,             "MS Hv wrmsr tpr")
 PERFCOUNTER(mshv_wrmsr_eoi,             "MS Hv wrmsr eoi")
 
+PERFCOUNTER(realmode_emulations, "realmode instructions emulated")
+PERFCOUNTER(realmode_exits,      "vmexits from realmode")
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
index a4a64a7399c44b3c4f8b96e43788c94432155bd0..b55bc0234a10b638849cd12990cbbe48c35bf15c 100644 (file)
@@ -169,6 +169,7 @@ struct cpuinfo_x86 {
     int  x86_power;
     __u32 x86_max_cores; /* cpuid returned max cores value */
     __u32 booted_cores;  /* number of cores as seen by OS */
+    __u32 x86_num_siblings; /* cpuid logical cpus per chip value */
     __u32 apicid;
     unsigned short x86_clflush_size;
 } __cacheline_aligned;
@@ -187,10 +188,12 @@ extern struct cpuinfo_x86 cpu_data[];
 #define current_cpu_data boot_cpu_data
 #endif
 
+extern u64 host_pat;
 extern int phys_proc_id[NR_CPUS];
 extern int cpu_core_id[NR_CPUS];
 
 extern void identify_cpu(struct cpuinfo_x86 *);
+extern void setup_clear_cpu_cap(unsigned int);
 extern void print_cpu_info(struct cpuinfo_x86 *);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
 extern void dodgy_tsc(void);
@@ -201,6 +204,9 @@ extern void detect_ht(struct cpuinfo_x86 *c);
 static always_inline void detect_ht(struct cpuinfo_x86 *c) {}
 #endif
 
+#define cpu_to_core(_cpu)   (cpu_core_id[_cpu])
+#define cpu_to_socket(_cpu) (phys_proc_id[_cpu])
+
 /*
  * Generic CPUID function
  * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
@@ -540,6 +546,8 @@ extern void mtrr_bp_init(void);
 
 void mcheck_init(struct cpuinfo_x86 *c);
 asmlinkage void do_machine_check(struct cpu_user_regs *regs);
+void cpu_mcheck_distribute_cmci(void);
+void cpu_mcheck_disable(void);
 
 int cpuid_hypervisor_leaves(
     uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
@@ -549,6 +557,7 @@ int wrmsr_hypervisor_regs(
     uint32_t idx, uint32_t eax, uint32_t edx);
 
 int microcode_update(XEN_GUEST_HANDLE(const_void), unsigned long len);
+int microcode_resume_cpu(int cpu);
 
 #endif /* !__ASSEMBLY__ */
 
index 2078d441ec62f766ebaa90015e3e7f82c691fe12..c62c53fce6c4f2354bbd171b72908cd05a94571c 100644 (file)
@@ -32,7 +32,6 @@
  
 extern void smp_alloc_memory(void);
 extern int pic_mode;
-extern int smp_num_siblings;
 extern cpumask_t cpu_sibling_map[];
 extern cpumask_t cpu_core_map[];
 
index 149dea1543a65d0558394ce6bafa27175e25f3ee..43878039100f93c9d2a8ff3fd5e9434eef05ef91 100644 (file)
@@ -3,7 +3,9 @@
 
 #define NMI_MCE_SOFTIRQ        (NR_COMMON_SOFTIRQS + 0)
 #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
+#define VCPU_KICK_SOFTIRQ      (NR_COMMON_SOFTIRQS + 2)
 
-#define NR_ARCH_SOFTIRQS       2
+#define MACHINE_CHECK_SOFTIRQ  (NR_COMMON_SOFTIRQS + 3)
+#define NR_ARCH_SOFTIRQS       4
 
 #endif /* __ASM_SOFTIRQ_H__ */
index 550edcb4e5ac027e91d0ad397c401e10dac84885..66c4d51435ff5c9728c462511a07bd42b3d93aaa 100644 (file)
 #include <xen/config.h>
 #include <xen/lib.h>
 #include <asm/atomic.h>
-#include <asm/rwlock.h>
 
 typedef struct {
     volatile s16 lock;
-    s8 recurse_cpu;
-    u8 recurse_cnt;
-} spinlock_t;
+} raw_spinlock_t;
 
-#define SPIN_LOCK_UNLOCKED /*(spinlock_t)*/ { 1, -1, 0 }
+#define _RAW_SPIN_LOCK_UNLOCKED /*(raw_spinlock_t)*/ { 1 }
 
-#define spin_lock_init(x)      do { *(x) = (spinlock_t) SPIN_LOCK_UNLOCKED; } while(0)
-#define spin_is_locked(x)      (*(volatile char *)(&(x)->lock) <= 0)
+#define _raw_spin_is_locked(x) ((x)->lock <= 0)
 
-static inline void _raw_spin_lock(spinlock_t *lock)
+static always_inline void _raw_spin_lock(raw_spinlock_t *lock)
 {
-    __asm__ __volatile__ (
-        "1:  lock; decb %0         \n"
-        "    js 2f                 \n"
-        ".section .text.lock,\"ax\"\n"
+    asm volatile (
+        "1:  lock; decw %0         \n"
+        "    jns 3f                \n"
         "2:  rep; nop              \n"
-        "    cmpb $0,%0            \n"
+        "    cmpw $0,%0            \n"
         "    jle 2b                \n"
         "    jmp 1b                \n"
-        ".previous"
+        "3:"
         : "=m" (lock->lock) : : "memory" );
 }
 
-static inline void _raw_spin_unlock(spinlock_t *lock)
+static always_inline void _raw_spin_unlock(raw_spinlock_t *lock)
 {
-    ASSERT(spin_is_locked(lock));
-    __asm__ __volatile__ (
-       "movb $1,%0" 
+    ASSERT(_raw_spin_is_locked(lock));
+    asm volatile (
+        "movw $1,%0" 
         : "=m" (lock->lock) : : "memory" );
 }
 
-static inline int _raw_spin_trylock(spinlock_t *lock)
+static always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
 {
-    char oldval;
-    __asm__ __volatile__(
-        "xchgb %b0,%1"
-        :"=q" (oldval), "=m" (lock->lock)
-        :"0" (0) : "memory");
-    return oldval > 0;
+    s16 oldval;
+    asm volatile (
+        "xchgw %w0,%1"
+        :"=r" (oldval), "=m" (lock->lock)
+        :"0" (0) : "memory" );
+    return (oldval > 0);
 }
 
-/*
- * spin_[un]lock_recursive(): Use these forms when the lock can (safely!) be
- * reentered recursively on the same CPU. All critical regions that may form
- * part of a recursively-nested set must be protected by these forms. If there
- * are any critical regions that cannot form part of such a set, they can use
- * standard spin_[un]lock().
- */
-#define _raw_spin_lock_recursive(_lock)            \
-    do {                                           \
-        int cpu = smp_processor_id();              \
-        if ( likely((_lock)->recurse_cpu != cpu) ) \
-        {                                          \
-            spin_lock(_lock);                      \
-            (_lock)->recurse_cpu = cpu;            \
-        }                                          \
-        (_lock)->recurse_cnt++;                    \
-    } while ( 0 )
-
-#define _raw_spin_unlock_recursive(_lock)          \
-    do {                                           \
-        if ( likely(--(_lock)->recurse_cnt == 0) ) \
-        {                                          \
-            (_lock)->recurse_cpu = -1;             \
-            spin_unlock(_lock);                    \
-        }                                          \
-    } while ( 0 )
-
-
 typedef struct {
     volatile unsigned int lock;
-} rwlock_t;
+} raw_rwlock_t;
 
-#define RW_LOCK_UNLOCKED /*(rwlock_t)*/ { RW_LOCK_BIAS }
+#define RW_LOCK_BIAS            0x01000000
+#define _RAW_RW_LOCK_UNLOCKED /*(raw_rwlock_t)*/ { RW_LOCK_BIAS }
+
+static always_inline void _raw_read_lock(raw_rwlock_t *rw)
+{
+    asm volatile (
+        "1:  lock; decl %0         \n"
+        "    jns 3f                \n"
+        "    lock; incl %0         \n"
+        "2:  rep; nop              \n"
+        "    cmpl $1,%0            \n"
+        "    js 2b                 \n"
+        "    jmp 1b                \n"
+        "3:"
+        : "=m" (rw->lock) : : "memory" );
+}
 
-#define rwlock_init(x) do { *(x) = (rwlock_t) RW_LOCK_UNLOCKED; } while(0)
+static always_inline void _raw_write_lock(raw_rwlock_t *rw)
+{
+    asm volatile (
+        "1:  lock; subl %1,%0      \n"
+        "    jz 3f                 \n"
+        "    lock; addl %1,%0      \n"
+        "2:  rep; nop              \n"
+        "    cmpl %1,%0            \n"
+        "    jne 2b                \n"
+        "    jmp 1b                \n"
+        "3:"
+        : "=m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory" );
+}
 
-/*
- * On x86, we implement read-write locks as a 32-bit counter
- * with the high bit (sign) being the "contended" bit.
- */
-static inline void _raw_read_lock(rwlock_t *rw)
+static always_inline void _raw_read_unlock(raw_rwlock_t *rw)
 {
-    __build_read_lock(rw, "__read_lock_failed");
+    asm volatile (
+        "lock ; incl %0"
+        : "=m" ((rw)->lock) : : "memory" );
 }
 
-static inline void _raw_write_lock(rwlock_t *rw)
+static always_inline void _raw_write_unlock(raw_rwlock_t *rw)
 {
-    __build_write_lock(rw, "__write_lock_failed");
+    asm volatile (
+        "lock ; addl %1,%0"
+        : "=m" ((rw)->lock) : "i" (RW_LOCK_BIAS) : "memory" );
 }
 
-#define _raw_read_unlock(rw)                       \
-    __asm__ __volatile__ (                         \
-        "lock ; incl %0" :                         \
-        "=m" ((rw)->lock) : : "memory" )
-#define _raw_write_unlock(rw)                      \
-    __asm__ __volatile__ (                         \
-        "lock ; addl $" RW_LOCK_BIAS_STR ",%0" :   \
-        "=m" ((rw)->lock) : : "memory" )
+#define _raw_rw_is_locked(x) ((x)->lock < RW_LOCK_BIAS)
 
 #endif /* __ASM_SPINLOCK_H */
index c257513dadca89bdd09e17907d32ff329451936d..ced68d063323d7e867c63f2d0b2ece430f2a5e03 100644 (file)
@@ -1,8 +1,7 @@
 #ifndef __ASM_SYSTEM_H
 #define __ASM_SYSTEM_H
 
-#include <xen/config.h>
-#include <xen/types.h>
+#include <xen/lib.h>
 #include <asm/bitops.h>
 
 #define read_segment_register(name)                             \
@@ -171,10 +170,27 @@ static always_inline unsigned long __cmpxchg(
 /* used when interrupts are already enabled or to shutdown the processor */
 #define halt()          asm volatile ( "hlt" : : : "memory" )
 
+#define local_save_flags(x)                                      \
+({                                                               \
+    BUILD_BUG_ON(sizeof(x) != sizeof(long));                     \
+    asm volatile ( "pushf" __OS " ; pop" __OS " %0" : "=g" (x)); \
+})
+#define local_irq_save(x)                                        \
+({                                                               \
+    local_save_flags(x);                                         \
+    local_irq_disable();                                         \
+})
+#define local_irq_restore(x)                                     \
+({                                                               \
+    BUILD_BUG_ON(sizeof(x) != sizeof(long));                     \
+    asm volatile ( "push" __OS " %0 ; popf" __OS                 \
+                   : : "g" (x) : "memory", "cc" );               \
+})
+
 static inline int local_irq_is_enabled(void)
 {
     unsigned long flags;
-    __save_flags(flags);
+    local_save_flags(flags);
     return !!(flags & (1<<9)); /* EFLAGS_IF */
 }
 
index 64883ceb7f3ce9c8424a51f168bf6ec916e58299..0474da0f7b9148b1585e7201585fcb8fc8d939ed 100644 (file)
 #ifndef __TBOOT_H__
 #define __TBOOT_H__
 
-typedef struct __attribute__ ((__packed__)) {
+#include <xen/acpi.h>
+
+#ifndef __packed
+#define __packed   __attribute__ ((packed))
+#endif
+
+typedef struct __packed {
   uint32_t    data1;
   uint16_t    data2;
   uint16_t    data3;
@@ -47,31 +53,52 @@ typedef struct __attribute__ ((__packed__)) {
 
 /* used to communicate between tboot and the launched kernel (i.e. Xen) */
 
-typedef struct __attribute__ ((__packed__)) {
-    uint16_t pm1a_cnt;
-    uint16_t pm1b_cnt;
-    uint16_t pm1a_evt;
-    uint16_t pm1b_evt;
+#define TB_KEY_SIZE             64   /* 512 bits */
+
+#define MAX_TB_MAC_REGIONS      32
+typedef struct __packed {
+    uint64_t  start;         /* must be 64 byte -aligned */
+    uint32_t  size;          /* must be 64 byte -granular */
+} tboot_mac_region_t;
+
+/* GAS - Generic Address Structure (ACPI 2.0+) */
+typedef struct __packed {
+       uint8_t  space_id;
+       uint8_t  bit_width;
+       uint8_t  bit_offset;
+       uint8_t  access_width;
+       uint64_t address;
+} tboot_acpi_generic_address_t;
+
+typedef struct __packed {
+    tboot_acpi_generic_address_t pm1a_cnt_blk;
+    tboot_acpi_generic_address_t pm1b_cnt_blk;
+    tboot_acpi_generic_address_t pm1a_evt_blk;
+    tboot_acpi_generic_address_t pm1b_evt_blk;
     uint16_t pm1a_cnt_val;
     uint16_t pm1b_cnt_val;
-} tboot_acpi_sleep_info;
+    uint64_t wakeup_vector;
+    uint32_t vector_width;
+    uint64_t kernel_s3_resume_vector;
+} tboot_acpi_sleep_info_t;
 
-typedef struct __attribute__ ((__packed__)) {
-    /* version 0x01+ fields: */
+typedef struct __packed {
+    /* version 3+ fields: */
     uuid_t    uuid;              /* {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} */
-    uint32_t  version;           /* Version number: 0x01, 0x02, ... */
+    uint32_t  version;           /* Version number; currently supports 0.4 */
     uint32_t  log_addr;          /* physical addr of tb_log_t log */
-    uint32_t  shutdown_entry32;  /* entry point for tboot shutdown from 32b */
-    uint32_t  shutdown_entry64;  /* entry point for tboot shutdown from 64b */
+    uint32_t  shutdown_entry;    /* entry point for tboot shutdown */
     uint32_t  shutdown_type;     /* type of shutdown (TB_SHUTDOWN_*) */
-    uint32_t  s3_tb_wakeup_entry;/* entry point for tboot s3 wake up */
-    uint32_t  s3_k_wakeup_entry; /* entry point for xen s3 wake up */
-    tboot_acpi_sleep_info
+    tboot_acpi_sleep_info_t
               acpi_sinfo;        /* where kernel put acpi sleep info in Sx */
-    uint8_t   reserved[52];      /* this pad is for compat with old field */
-    /* version 0x02+ fields: */
     uint32_t  tboot_base;        /* starting addr for tboot */
     uint32_t  tboot_size;        /* size of tboot */
+    uint8_t   num_mac_regions;   /* number mem regions to MAC on S3 */
+                                 /* contig regions memory to MAC on S3 */
+    tboot_mac_region_t mac_regions[MAX_TB_MAC_REGIONS];
+    /* version 4+ fields: */
+                                 /* populated by tboot; will be encrypted */
+    uint8_t   s3_key[TB_KEY_SIZE];
 } tboot_shared_t;
 
 #define TB_SHUTDOWN_REBOOT      0
@@ -89,6 +116,9 @@ extern tboot_shared_t *g_tboot_shared;
 void tboot_probe(void);
 void tboot_shutdown(uint32_t shutdown_type);
 int tboot_in_measured_env(void);
+int tboot_protect_mem_regions(void);
+int tboot_parse_dmar_table(acpi_table_handler dmar_handler);
+int tboot_s3_resume(void);
 
 #endif /* __TBOOT_H__ */
 
index 0477f2b2b7d1b3fbefc51e159d52aba04cb0276e..1d687a8060a8dcd5867ca31efa279e19255b7bce 100644 (file)
@@ -38,4 +38,7 @@ void pit_broadcast_enter(void);
 void pit_broadcast_exit(void);
 int pit_broadcast_is_available(void);
 
+uint64_t acpi_pm_tick_to_ns(uint64_t ticks);
+uint64_t ns_to_acpi_pm_tick(uint64_t ns);
+
 #endif /* __X86_TIME_H__ */
index 2d055301f2aa9afa50e73edbe842709ca267040c..85a422363fbf54d10a46471737c04acca367f458 100644 (file)
@@ -28,7 +28,7 @@ struct softirq_trap {
 
 struct cpu_user_regs;
 
-extern void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code);
+extern void machine_check_vector(struct cpu_user_regs *regs, long error_code);
  
 /**
  * guest_has_trap_callback
index 6f32f99cda50977d20ce9f305f59b1c187356074..aef51f51af1d2f26726207c30ec143175fc8058b 100644 (file)
@@ -105,9 +105,6 @@ extern unsigned int PAGE_HYPERVISOR_NOCACHE;
 #define get_pte_flags(x) (((int)((x) >> 32) & ~0xFFF) | ((int)(x) & 0xFFF))
 #define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 32) | ((x) & 0xFFF))
 
-#define GRANT_PTE_FLAGS \
-    (_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_GNTTAB)
-
 /*
  * Disallow unused flag bits plus PAT/PSE, PCD, PWT and GLOBAL.
  * Permit the NX bit if the hardware supports it.
@@ -115,7 +112,7 @@ extern unsigned int PAGE_HYPERVISOR_NOCACHE;
 #define BASE_DISALLOW_MASK (0xFFFFF198U & ~_PAGE_NX)
 
 #define L1_DISALLOW_MASK (BASE_DISALLOW_MASK | _PAGE_GNTTAB)
-#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK)
+#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_PSE)
 #define L3_DISALLOW_MASK 0xFFFFF1FEU /* must-be-zero */
 
 #endif /* __X86_32_PAGE_H__ */
index 5707af8e8643f6b4ec40598b2358641eeeede84b..56ef751ec7fbe06e15b962e358fc7c377d67b0cc 100644 (file)
@@ -101,14 +101,4 @@ static inline void atomic_write64(uint64_t *p, uint64_t v)
 #define mb()                    \
     asm volatile ( "lock; addl $0,0(%%esp)" : : : "memory" )
 
-#define __save_flags(x)         \
-    asm volatile ( "pushfl ; popl %0" : "=g" (x) : )
-#define __restore_flags(x)      \
-    asm volatile ( "pushl %0 ; popfl" : : "g" (x) : "memory", "cc" )
-
-#define local_irq_save(x)       \
-    asm volatile ( "pushfl ; popl %0 ; cli" : "=g" (x) : : "memory" )
-#define local_irq_restore(x)    \
-    __restore_flags(x)
-
 #endif /* __X86_32_SYSTEM_H__ */
index f20f053dd51f4630b981be67702c7711efc486ac..8899fe777c935b1692dd22ad536a7a78e21c3757 100644 (file)
@@ -40,7 +40,7 @@ static inline unsigned long __virt_to_maddr(unsigned long va)
     ASSERT(va >= XEN_VIRT_START);
     ASSERT(va < DIRECTMAP_VIRT_END);
     ASSERT((va < XEN_VIRT_END) || (va >= DIRECTMAP_VIRT_START));
-    if ( va > DIRECTMAP_VIRT_START )
+    if ( va >= DIRECTMAP_VIRT_START )
         return va - DIRECTMAP_VIRT_START;
     return va - XEN_VIRT_START + xen_phys_start;
 }
@@ -115,18 +115,15 @@ typedef l4_pgentry_t root_pgentry_t;
 #define BASE_DISALLOW_MASK (0xFF800198U & ~_PAGE_NX)
 
 #define L1_DISALLOW_MASK (BASE_DISALLOW_MASK | _PAGE_GNTTAB)
-#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK)
+#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_PSE)
 #define L3_DISALLOW_MASK (BASE_DISALLOW_MASK)
 #define L4_DISALLOW_MASK (BASE_DISALLOW_MASK)
 
-#define COMPAT_L3_DISALLOW_MASK 0xFFFFF1FEU
+#define COMPAT_L3_DISALLOW_MASK 0xFFFFF198U
 
 #define PAGE_HYPERVISOR         (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
 #define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
 
-#define GRANT_PTE_FLAGS \
-    (_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_GNTTAB|_PAGE_USER)
-
 #define USER_MAPPINGS_ARE_GLOBAL
 #ifdef USER_MAPPINGS_ARE_GLOBAL
 /*
index 229fc15292ab175b954f3e3475033b44f9f87183..fa9b3118b063b4a78e086c40d70b7c286915f25d 100644 (file)
@@ -55,14 +55,4 @@ static inline void atomic_write64(uint64_t *p, uint64_t v)
 #define mb()                    \
     asm volatile ( "mfence" : : : "memory" )
 
-#define __save_flags(x)         \
-    asm volatile ( "pushfq ; popq %q0" : "=g" (x) : :"memory" )
-#define __restore_flags(x)      \
-    asm volatile ( "pushq %0 ; popfq" : : "g" (x) : "memory", "cc" )
-
-#define local_irq_save(x)       \
-    asm volatile ( "pushfq ; popq %0 ; cli" : "=g" (x) : : "memory" )
-#define local_irq_restore(x)    \
-    __restore_flags(x)
-
 #endif /* __X86_64_SYSTEM_H__ */
index 3e318eba69098862fb38c309d07bc58fb6e1f4aa..77086658f5bf55f614123e5a45778c3751b19d23 100644 (file)
@@ -64,6 +64,9 @@ void xenoprof_backtrace(
                  "xenoprof/x86 with autotranslated mode enabled"    \
                  "isn't supported yet\n");                          \
     } while (0)
+int passive_domain_do_rdmsr(struct cpu_user_regs *regs);
+int passive_domain_do_wrmsr(struct cpu_user_regs *regs);
+void passive_domain_destroy(struct vcpu *v);
 
 #endif /* __ASM_X86_XENOPROF_H__ */
 
diff --git a/xen/include/crypto/rijndael.h b/xen/include/crypto/rijndael.h
new file mode 100644 (file)
index 0000000..2974602
--- /dev/null
@@ -0,0 +1,58 @@
+/*     $OpenBSD: rijndael.h,v 1.13 2008/06/09 07:49:45 djm Exp $ */
+
+/**
+ * rijndael-alg-fst.h
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __RIJNDAEL_H
+#define __RIJNDAEL_H
+
+#define AES_MAXKEYBITS (256)
+#define AES_MAXKEYBYTES        (AES_MAXKEYBITS/8)
+/* for 256-bit keys, fewer for less */
+#define AES_MAXROUNDS  14
+
+//typedef unsigned char        u8;
+//typedef unsigned short       u16;
+//typedef unsigned int u32;
+
+/*  The structure for key information */
+typedef struct {
+       int     enc_only;               /* context contains only encrypt schedule */
+       int     Nr;                     /* key-length-dependent number of rounds */
+       u32     ek[4*(AES_MAXROUNDS + 1)];      /* encrypt key schedule */
+       u32     dk[4*(AES_MAXROUNDS + 1)];      /* decrypt key schedule */
+} rijndael_ctx;
+
+int     rijndael_set_key(rijndael_ctx *, const u_char *, int);
+int     rijndael_set_key_enc_only(rijndael_ctx *, const u_char *, int);
+void    rijndael_decrypt(rijndael_ctx *, const u_char *, u_char *);
+void    rijndael_encrypt(rijndael_ctx *, const u_char *, u_char *);
+
+int    rijndaelKeySetupEnc(unsigned int [], const unsigned char [], int);
+int    rijndaelKeySetupDec(unsigned int [], const unsigned char [], int);
+void   rijndaelEncrypt(const unsigned int [], int, const unsigned char [],
+           unsigned char []);
+
+#endif /* __RIJNDAEL_H */
diff --git a/xen/include/crypto/vmac.h b/xen/include/crypto/vmac.h
new file mode 100644 (file)
index 0000000..9e92b3e
--- /dev/null
@@ -0,0 +1,178 @@
+#ifndef HEADER_VMAC_H
+#define HEADER_VMAC_H
+
+/* --------------------------------------------------------------------------
+ * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai.
+ * This implementation is herby placed in the public domain.
+ * The authors offers no warranty. Use at your own risk.
+ * Please send bug reports to the authors.
+ * Last modified: 17 APR 08, 1700 PDT
+ * ----------------------------------------------------------------------- */
+
+/* --------------------------------------------------------------------------
+ * User definable settings.
+ * ----------------------------------------------------------------------- */
+#define VMAC_TAG_LEN   64 /* Must be 64 or 128 - 64 sufficient for most    */
+#define VMAC_KEY_LEN  128 /* Must be 128, 192 or 256                       */
+#define VMAC_NHBYTES  128 /* Must 2^i for any 3 < i < 13. Standard = 128   */
+#define VMAC_PREFER_BIG_ENDIAN  0  /* Prefer non-x86 */
+
+#define VMAC_USE_OPENSSL  0 /* Set to non-zero to use OpenSSL's AES        */
+#define VMAC_CACHE_NONCES 1 /* Set to non-zero to cause caching            */
+                            /* of consecutive nonces on 64-bit tags        */
+
+#define VMAC_RUN_TESTS 0  /* Set to non-zero to check vectors and speed    */
+#define VMAC_HZ (448e6)  /* Set to hz of host machine to get speed        */
+#define VMAC_HASH_ONLY 0  /* Set to non-zero to time hash only (not-mac)   */
+/* Speeds of cpus I have access to
+#define hz (2400e6)  glyme Core 2 "Conroe"
+#define hz (2000e6)  jupiter G5
+#define hz (1592e6)  titan
+#define hz (2793e6)  athena/gaia
+#define hz (1250e6)  isis G4
+#define hz (2160e6)  imac Core 2 "Merom"
+#define hz (266e6)   ppc/arm
+#define hz (400e6)   mips
+*/
+
+/* --------------------------------------------------------------------------
+ * This implementation uses uint32_t and uint64_t as names for unsigned 32-
+ * and 64-bit integer types. These are defined in C99 stdint.h. The
+ * following may need adaptation if you are not running a C99 or
+ * Microsoft C environment.
+ * ----------------------------------------------------------------------- */
+#define VMAC_USE_STDINT 1  /* Set to zero if system has no stdint.h        */
+#if VMAC_USE_STDINT && !_MSC_VER /* Try stdint.h if non-Microsoft          */
+#ifdef  __cplusplus
+#define __STDC_CONSTANT_MACROS
+#endif
+//#include <stdint.h>
+#elif (_MSC_VER)                  /* Microsoft C does not have stdint.h    */
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int64 uint64_t;
+#define UINT64_C(v) v ## UI64
+#else                             /* Guess sensibly - may need adaptation  */
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+#define UINT64_C(v) v ## ULL
+#endif
+
+/* --------------------------------------------------------------------------
+ * This implementation supports two free AES implementations: OpenSSL's and
+ * Paulo Barreto's. To use OpenSSL's, you will need to include the OpenSSL
+ * crypto library (eg, gcc -lcrypto foo.c). For Barreto's, you will need
+ * to compile rijndael-alg-fst.c, last seen at http://www.iaik.tu-graz.ac.at/
+ * research/krypto/AES/old/~rijmen/rijndael/rijndael-fst-3.0.zip and
+ * http://homes.esat.kuleuven.be/~rijmen/rijndael/rijndael-fst-3.0.zip.
+ * To use a different implementation, use these definitions as a model.
+ * ----------------------------------------------------------------------- */
+#if VMAC_USE_OPENSSL
+
+#include <openssl/aes.h>
+typedef AES_KEY aes_int_key;
+
+#define aes_encryption(in,out,int_key)                  \
+               AES_encrypt((unsigned char *)(in),(unsigned char *)(out),(int_key))
+#define aes_key_setup(key,int_key)                      \
+               AES_set_encrypt_key((key),VMAC_KEY_LEN,(int_key))
+
+#else
+
+//#include "rijndael-alg-fst.h"
+typedef uint64_t  vmac_t;
+#include "rijndael.h"
+typedef u32 aes_int_key[4*(VMAC_KEY_LEN/32+7)];
+
+#define aes_encryption(in,out,int_key)                  \
+               rijndaelEncrypt((u32 *)(int_key),           \
+                               ((VMAC_KEY_LEN/32)+6),      \
+                                           (u8 *)(in), (u8 *)(out))
+#define aes_key_setup(user_key,int_key)                 \
+               rijndaelKeySetupEnc((u32 *)(int_key),       \
+                                   (u8 *)(user_key), \
+                                   VMAC_KEY_LEN)
+#endif
+
+/* --------------------------------------------------------------------- */
+
+typedef struct {
+       uint64_t nhkey  [(VMAC_NHBYTES/8)+2*(VMAC_TAG_LEN/64-1)];
+       uint64_t polykey[2*VMAC_TAG_LEN/64];
+       uint64_t l3key  [2*VMAC_TAG_LEN/64];
+       uint64_t polytmp[2*VMAC_TAG_LEN/64];
+       aes_int_key cipher_key;
+       #if (VMAC_TAG_LEN == 64) && (VMAC_CACHE_NONCES)
+       uint64_t cached_nonce[2];
+       uint64_t cached_aes[2];
+       #endif
+       int first_block_processed;
+} vmac_ctx_t;
+
+/* --------------------------------------------------------------------- */
+#ifdef  __cplusplus
+extern "C" {
+#endif
+/* --------------------------------------------------------------------------
+ *                        <<<<< USAGE NOTES >>>>>
+ *
+ * Given msg m (mbytes in length) and nonce buffer n
+ * this function returns a tag as its output. The tag is returned as
+ * a number. When VMAC_TAG_LEN == 64, the 'return'ed integer is the tag,
+ * and *tagl is meaningless. When VMAC_TAG_LEN == 128 the tag is the
+ * number y * 2^64 + *tagl where y is the function's return value.
+ * If you want to consider tags to be strings, then you must do so with
+ * an agreed upon endian orientation for interoperability, and convert
+ * the results appropriately. VHASH hashes m without creating any tag.
+ * Consecutive substrings forming a prefix of a message may be passed
+ * to vhash_update, with vhash or vmac being called with the remainder
+ * to produce the output.
+ *
+ * Requirements:
+ * - On 32-bit architectures with SSE2 instructions, ctx and m MUST be
+ *   begin on 16-byte memory boundaries.
+ * - m MUST be your message followed by zeroes to the nearest 16-byte
+ *   boundary. If m is a length multiple of 16 bytes, then it is already
+ *   at a 16-byte boundary and needs no padding. mbytes should be your
+ *   message length without any padding. 
+ * - The first bit of the nonce buffer n must be 0. An i byte nonce, is made
+ *   as the first 16-i bytes of n being zero, and the final i the nonce.
+ * - vhash_update MUST have mbytes be a positive multiple of VMAC_NHBYTES
+ * ----------------------------------------------------------------------- */
+
+#define vmac_update vhash_update
+
+void vhash_update(unsigned char m[],
+          unsigned int mbytes,
+          vmac_ctx_t *ctx);
+
+uint64_t vmac(unsigned char m[],
+         unsigned int mbytes,
+         unsigned char n[16],
+         uint64_t *tagl,
+         vmac_ctx_t *ctx);
+
+uint64_t vhash(unsigned char m[],
+          unsigned int mbytes,
+          uint64_t *tagl,
+          vmac_ctx_t *ctx);
+
+/* --------------------------------------------------------------------------
+ * When passed a VMAC_KEY_LEN bit user_key, this function initialazies ctx.
+ * ----------------------------------------------------------------------- */
+
+void vmac_set_key(unsigned char user_key[], vmac_ctx_t *ctx);
+
+/* --------------------------------------------------------------------------
+ * This function aborts current hash and resets ctx, ready for a new message.
+ * ----------------------------------------------------------------------- */
+
+void vhash_abort(vmac_ctx_t *ctx);
+
+/* --------------------------------------------------------------------- */
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif /* HEADER_AES_H */
index c88783ad03369a53532ff2e87986240e2634e5a2..f058df104b2e0bcbfd489095c618e29e9152beee 100644 (file)
@@ -198,6 +198,15 @@ struct mapped_regs {
             unsigned long rrs[8]; // region registers
             unsigned long krs[8]; // kernel registers
             unsigned long tmp[16]; // temp registers (e.g. for hyperprivops)
+
+            /* itc paravirtualization
+             * vAR.ITC = mAR.ITC + itc_offset
+             * itc_last is one which was lastly passed to
+             * the guest OS in order to prevent it from
+             * going backwords.
+             */
+            unsigned long itc_offset;
+            unsigned long itc_last;
         };
     };
 };
@@ -392,6 +401,7 @@ struct vcpu_guest_context {
 #define VGCF_EXTRA_REGS (1UL << 1)     /* Set extra regs.  */
 #define VGCF_SET_CR_IRR (1UL << 2)     /* Set cr_irr[0:3]. */
 #define VGCF_online     (1UL << 3)  /* make this vcpu online */
+#define VGCF_SET_AR_ITC (1UL << 4)  /* set pv ar.itc. itc_offset, itc_last */
     unsigned long flags;       /* VGCF_* flags */
 
     struct vcpu_guest_context_regs regs;
index a2650b591692184d7c48fc8f7be67038fe0929c3..d422752137f88b19897ec132face6b48ebf77320 100644 (file)
@@ -23,8 +23,8 @@
 #ifndef __XEN_PUBLIC_HVM_SAVE_IA64_H__
 #define __XEN_PUBLIC_HVM_SAVE_IA64_H__
 
-#include <public/hvm/save.h>
-#include <public/arch-ia64.h>
+#include "../../hvm/save.h"
+#include "../../arch-ia64.h"
 
 /* 
  * Save/restore header: general info about the save file. 
@@ -106,7 +106,11 @@ DECLARE_HVM_SAVE_TYPE(VTIME, 5, struct hvm_hw_ia64_vtime);
  */
 #define VIOSAPIC_NUM_PINS     48
 
-union viosapic_rte
+/* To share VT-d code which uses vioapic_redir_entry.
+ * Although on ia64 this is for vsapic, but we have to vioapic_redir_entry
+ * instead of viosapic_redir_entry.
+ */
+union vioapic_redir_entry
 {
     uint64_t bits;
     struct {
@@ -124,7 +128,7 @@ union viosapic_rte
 
         uint8_t reserved[3];
         uint16_t dest_id;
-    }
+    } fields;
 };
 
 struct hvm_hw_ia64_viosapic {
@@ -134,7 +138,7 @@ struct hvm_hw_ia64_viosapic {
     uint32_t    pad;
     uint64_t    lowest_vcpu_id;
     uint64_t    base_address;
-    union viosapic_rte  redirtbl[VIOSAPIC_NUM_PINS];
+    union vioapic_redir_entry  redirtbl[VIOSAPIC_NUM_PINS];
 };
 DECLARE_HVM_SAVE_TYPE(VIOSAPIC, 6, struct hvm_hw_ia64_viosapic);
   
index 9b78787b44857e164dace59a30c35fafd7b5eeda..bfdc7267c95ea29f488b01011bb446258ad1578a 100644 (file)
@@ -287,7 +287,7 @@ struct hvm_hw_pci_irqs {
      * Indexed by: device*4 + INTx#.
      */
     union {
-        DECLARE_BITMAP(i, 32*4);
+        unsigned long i[16 / sizeof (unsigned long)]; /* DECLARE_BITMAP(i, 32*4); */
         uint64_t pad[2];
     };
 };
@@ -300,7 +300,7 @@ struct hvm_hw_isa_irqs {
      * Indexed by ISA IRQ (assumes no ISA-device IRQ sharing).
      */
     union {
-        DECLARE_BITMAP(i, 16);
+        unsigned long i[1];  /* DECLARE_BITMAP(i, 16); */
         uint64_t pad[1];
     };
 };
index 103d41fd3d32ba778838be9d5beb259e71037e3a..b02ebf0e07bc5121d26d9c8ae47a8d20323c812e 100644 (file)
 /* Hypercall */
 #define __HYPERVISOR_mca __HYPERVISOR_arch_0
 
-#define XEN_MCA_INTERFACE_VERSION 0x03000001
+/*
+ * The xen-unstable repo has interface version 0x03000001; out interface
+ * is incompatible with that and any future minor revisions, so we
+ * choose a different version number range that is numerically less
+ * than that used in xen-unstable.
+ */
+#define XEN_MCA_INTERFACE_VERSION 0x01ecc002
 
-/* IN: Dom0 calls hypercall from MC event handler. */
-#define XEN_MC_CORRECTABLE  0x0
-/* IN: Dom0/DomU calls hypercall from MC trap handler. */
-#define XEN_MC_TRAP         0x1
-/* XEN_MC_CORRECTABLE and XEN_MC_TRAP are mutually exclusive. */
+/* IN: Dom0 calls hypercall to retrieve nonurgent telemetry */
+#define XEN_MC_NONURGENT  0x0001
+/* IN: Dom0/DomU calls hypercall to retrieve urgent telemetry */
+#define XEN_MC_URGENT     0x0002
+/* IN: Dom0 acknowledges previosly-fetched telemetry */
+#define XEN_MC_ACK        0x0004
 
 /* OUT: All is ok */
 #define XEN_MC_OK           0x0
 #define MC_TYPE_GLOBAL          0
 #define MC_TYPE_BANK            1
 #define MC_TYPE_EXTENDED        2
+#define MC_TYPE_RECOVERY        3
 
 struct mcinfo_common {
     uint16_t type;      /* structure type */
@@ -106,7 +114,11 @@ struct mcinfo_common {
 
 #define MC_FLAG_CORRECTABLE     (1 << 0)
 #define MC_FLAG_UNCORRECTABLE   (1 << 1)
-
+#define MC_FLAG_RECOVERABLE    (1 << 2)
+#define MC_FLAG_POLLED         (1 << 3)
+#define MC_FLAG_RESET          (1 << 4)
+#define MC_FLAG_CMCI           (1 << 5)
+#define MC_FLAG_MCE            (1 << 6)
 /* contains global x86 mc information */
 struct mcinfo_global {
     struct mcinfo_common common;
@@ -115,6 +127,7 @@ struct mcinfo_global {
     uint16_t mc_domid;
     uint32_t mc_socketid; /* physical socket of the physical core */
     uint16_t mc_coreid; /* physical impacted core */
+    uint32_t mc_apicid;
     uint16_t mc_core_threadid; /* core thread of physical core */
     uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */
     uint64_t mc_gstatus; /* global status */
@@ -132,6 +145,8 @@ struct mcinfo_bank {
     uint64_t mc_addr;   /* bank address, only valid
                          * if addr bit is set in mc_status */
     uint64_t mc_misc;
+    uint64_t mc_ctrl2;
+    uint64_t mc_tsc;
 };
 
 
@@ -150,9 +165,76 @@ struct mcinfo_extended {
      * multiple times. */
 
     uint32_t mc_msrs; /* Number of msr with valid values. */
-    struct mcinfo_msr mc_msr[5];
+    /*
+     * Currently Intel extended MSR (32/64) including all gp registers
+     * and E(R)DI, E(R)BP, E(R)SP, E(R)FLAGS, E(R)IP, E(R)MISC, only 10
+     * of them might be useful. So expand this array to 10.
+    */
+    struct mcinfo_msr mc_msr[10];
+};
+
+/* Recovery Action flags. Giving recovery result information to DOM0 */
+
+/* Xen takes successful recovery action, the error is recovered */
+#define REC_ACTION_RECOVERED (0x1 << 0)
+/* No action is performed by XEN */
+#define REC_ACTION_NONE (0x1 << 1)
+/* It's possible DOM0 might take action ownership in some case */
+#define REC_ACTION_NEED_RESET (0x1 << 2)
+
+/* Different Recovery Action types, if the action is performed successfully,
+ * REC_ACTION_RECOVERED flag will be returned.
+ */
+
+/* Page Offline Action */
+#define MC_ACTION_PAGE_OFFLINE (0x1 << 0)
+/* CPU offline Action */
+#define MC_ACTION_CPU_OFFLINE (0x1 << 1)
+/* L3 cache disable Action */
+#define MC_ACTION_CACHE_SHRINK (0x1 << 2)
+
+/* Below interface is used between XEN/DOM0 for passing XEN's recovery action
+ * information to DOM0. 
+ * usage scenario: After offlining a broken page, XEN might pass its page offline
+ * recovery action result to DOM0. DOM0 will save the information in 
+ * non-volatile memory for further proactive actions, such as offlining the
+ * easily broken page earlier when doing the next reboot.
+*/
+struct page_offline_action
+{
+    /* Params for passing the offlined page number to DOM0 */
+    uint64_t mfn;
+    uint64_t status;
+};
+
+struct cpu_offline_action
+{
+    /* Params for passing the identity of the offlined CPU to DOM0 */
+    uint32_t mc_socketid;
+    uint16_t mc_coreid;
+    uint16_t mc_core_threadid;
+};
+
+#define MAX_UNION_SIZE 16
+struct mc_recovery
+{
+    uint16_t mc_bank; /* bank nr */
+    uint8_t action_flags;
+    uint8_t action_types;
+    union {
+        struct page_offline_action page_retire;
+        struct cpu_offline_action cpu_offline;
+        uint8_t pad[MAX_UNION_SIZE];
+    } action_info;
 };
 
+struct mcinfo_recovery
+{
+    struct mcinfo_common common;
+    struct mc_recovery mc_action;
+};
+
+
 #define MCINFO_HYPERCALLSIZE   1024
 #define MCINFO_MAXSIZE         768
 
@@ -163,7 +245,43 @@ struct mc_info {
     uint8_t mi_data[MCINFO_MAXSIZE - sizeof(uint32_t)];
 };
 typedef struct mc_info mc_info_t;
-
+DEFINE_XEN_GUEST_HANDLE(mc_info_t);
+
+#define __MC_MSR_ARRAYSIZE 8
+#define __MC_NMSRS 1
+#define MC_NCAPS       7       /* 7 CPU feature flag words */
+#define MC_CAPS_STD_EDX        0       /* cpuid level 0x00000001 (%edx) */
+#define MC_CAPS_AMD_EDX        1       /* cpuid level 0x80000001 (%edx) */
+#define MC_CAPS_TM     2       /* cpuid level 0x80860001 (TransMeta) */
+#define MC_CAPS_LINUX  3       /* Linux-defined */
+#define MC_CAPS_STD_ECX        4       /* cpuid level 0x00000001 (%ecx) */
+#define MC_CAPS_VIA    5       /* cpuid level 0xc0000001 */
+#define MC_CAPS_AMD_ECX        6       /* cpuid level 0x80000001 (%ecx) */
+
+typedef struct mcinfo_logical_cpu {
+    uint32_t mc_cpunr;          
+    uint32_t mc_chipid; 
+    uint16_t mc_coreid;
+    uint16_t mc_threadid;
+    uint32_t mc_apicid;
+    uint32_t mc_clusterid;
+    uint32_t mc_ncores;
+    uint32_t mc_ncores_active;
+    uint32_t mc_nthreads;
+    int32_t mc_cpuid_level;
+    uint32_t mc_family;
+    uint32_t mc_vendor;
+    uint32_t mc_model;
+    uint32_t mc_step;
+    char mc_vendorid[16];
+    char mc_brandid[64];
+    uint32_t mc_cpu_caps[MC_NCAPS];
+    uint32_t mc_cache_size;
+    uint32_t mc_cache_alignment;
+    int32_t mc_nmsrvals;
+    struct mcinfo_msr mc_msrvalues[__MC_MSR_ARRAYSIZE];
+} xen_mc_logical_cpu_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_logical_cpu_t);
 
 
 /* 
@@ -228,14 +346,14 @@ typedef struct mc_info mc_info_t;
 #define XEN_MC_fetch            1
 struct xen_mc_fetch {
     /* IN/OUT variables. */
-    uint32_t flags;
-
-/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
-/* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA, XEN_MC_NOMATCH */
+    uint32_t flags;    /* IN: XEN_MC_NONURGENT, XEN_MC_URGENT,
+                           XEN_MC_ACK if ack'ing an earlier fetch */
+                       /* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED,
+                          XEN_MC_NODATA, XEN_MC_NOMATCH */
+    uint64_t fetch_id; /* OUT: id for ack, IN: id we are ack'ing */
 
     /* OUT variables. */
-    uint32_t fetch_idx;  /* only useful for Dom0 for the notify hypercall */
-    struct mc_info mc_info;
+    XEN_GUEST_HANDLE(mc_info_t) data;
 };
 typedef struct xen_mc_fetch xen_mc_fetch_t;
 DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t);
@@ -250,7 +368,6 @@ struct xen_mc_notifydomain {
     uint16_t mc_domid;    /* The unprivileged domain to notify. */
     uint16_t mc_vcpuid;   /* The vcpu in mc_domid to notify.
                            * Usually echo'd value from the fetch hypercall. */
-    uint32_t fetch_idx;   /* echo'd value from the fetch hypercall. */
 
     /* IN/OUT variables. */
     uint32_t flags;
@@ -261,15 +378,46 @@ struct xen_mc_notifydomain {
 typedef struct xen_mc_notifydomain xen_mc_notifydomain_t;
 DEFINE_XEN_GUEST_HANDLE(xen_mc_notifydomain_t);
 
+#define XEN_MC_physcpuinfo 3
+struct xen_mc_physcpuinfo {
+       /* IN/OUT */
+       uint32_t ncpus;
+       uint32_t pad0;
+       /* OUT */
+       XEN_GUEST_HANDLE(xen_mc_logical_cpu_t) info;
+};
+
+#define XEN_MC_msrinject    4
+#define MC_MSRINJ_MAXMSRS       8
+struct xen_mc_msrinject {
+       /* IN */
+       unsigned int mcinj_cpunr;       /* target processor id */
+       uint32_t mcinj_flags;           /* see MC_MSRINJ_F_* below */
+       uint32_t mcinj_count;           /* 0 .. count-1 in array are valid */
+       uint32_t mcinj_pad0;
+       struct mcinfo_msr mcinj_msr[MC_MSRINJ_MAXMSRS];
+};
+
+/* Flags for mcinj_flags above; bits 16-31 are reserved */
+#define MC_MSRINJ_F_INTERPOSE   0x1
+
+#define XEN_MC_mceinject    5
+struct xen_mc_mceinject {
+       unsigned int mceinj_cpunr;      /* target processor id */
+};
+
+typedef union {
+    struct xen_mc_fetch        mc_fetch;
+    struct xen_mc_notifydomain mc_notifydomain;
+    struct xen_mc_physcpuinfo  mc_physcpuinfo;
+    struct xen_mc_msrinject    mc_msrinject;
+    struct xen_mc_mceinject    mc_mceinject;
+} xen_mc_arg_t;
 
 struct xen_mc {
     uint32_t cmd;
     uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
-    union {
-        struct xen_mc_fetch        mc_fetch;
-        struct xen_mc_notifydomain mc_notifydomain;
-        uint8_t pad[MCINFO_HYPERCALLSIZE];
-    } u;
+    xen_mc_arg_t u;
 };
 typedef struct xen_mc xen_mc_t;
 DEFINE_XEN_GUEST_HANDLE(xen_mc_t);
index b7075ac447e54372b745729aaf344b7a388b4048..8574302bae992691c7346a3ecabb26b3824bcd8f 100644 (file)
@@ -51,11 +51,14 @@ struct xen_domctl_createdomain {
     uint32_t ssidref;
     xen_domain_handle_t handle;
  /* Is this an HVM guest (as opposed to a PV guest)? */
-#define _XEN_DOMCTL_CDF_hvm_guest 0
-#define XEN_DOMCTL_CDF_hvm_guest  (1U<<_XEN_DOMCTL_CDF_hvm_guest)
+#define _XEN_DOMCTL_CDF_hvm_guest     0
+#define XEN_DOMCTL_CDF_hvm_guest      (1U<<_XEN_DOMCTL_CDF_hvm_guest)
  /* Use hardware-assisted paging if available? */
-#define _XEN_DOMCTL_CDF_hap       1
-#define XEN_DOMCTL_CDF_hap        (1U<<_XEN_DOMCTL_CDF_hap)
+#define _XEN_DOMCTL_CDF_hap           1
+#define XEN_DOMCTL_CDF_hap            (1U<<_XEN_DOMCTL_CDF_hap)
+ /* Should domain memory integrity be verified by tboot during Sx? */
+#define _XEN_DOMCTL_CDF_s3_integrity  2
+#define XEN_DOMCTL_CDF_s3_integrity   (1U<<_XEN_DOMCTL_CDF_s3_integrity)
     uint32_t flags;
 };
 typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
@@ -466,6 +469,7 @@ typedef enum pt_irq_type_e {
     PT_IRQ_TYPE_PCI,
     PT_IRQ_TYPE_ISA,
     PT_IRQ_TYPE_MSI,
+    PT_IRQ_TYPE_MSI_TRANSLATE,
 } pt_irq_type_t;
 struct xen_domctl_bind_pt_irq {
     uint32_t machine_irq;
@@ -484,6 +488,7 @@ struct xen_domctl_bind_pt_irq {
         struct {
             uint8_t gvec;
             uint32_t gflags;
+            uint64_aligned_t gtable;
         } msi;
     } u;
 };
@@ -619,6 +624,28 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_subscribe_t);
  */
 #define XEN_DOMCTL_suppress_spurious_page_faults 53
 
+#define XEN_DOMCTL_debug_op    54
+#define XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF         0
+#define XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON          1
+struct xen_domctl_debug_op {
+    uint32_t op;   /* IN */
+    uint32_t vcpu; /* IN */
+};
+typedef struct xen_domctl_debug_op xen_domctl_debug_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_debug_op_t);
+
+/*
+ * Request a particular record from the HVM context
+ */
+#define XEN_DOMCTL_gethvmcontext_partial   55
+typedef struct xen_domctl_hvmcontext_partial {
+    uint32_t type;                      /* IN: Type of record required */
+    uint32_t instance;                  /* IN: Instance of that type */
+    XEN_GUEST_HANDLE_64(uint8) buffer;  /* OUT: buffer to write record into */
+} xen_domctl_hvmcontext_partial_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_partial_t);
+
+
 struct xen_domctl {
     uint32_t cmd;
     uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
@@ -646,6 +673,7 @@ struct xen_domctl {
         struct xen_domctl_settimeoffset     settimeoffset;
         struct xen_domctl_real_mode_area    real_mode_area;
         struct xen_domctl_hvmcontext        hvmcontext;
+        struct xen_domctl_hvmcontext_partial hvmcontext_partial;
         struct xen_domctl_address_size      address_size;
         struct xen_domctl_sendtrigger       sendtrigger;
         struct xen_domctl_get_device_group  get_device_group;
@@ -658,6 +686,7 @@ struct xen_domctl {
         struct xen_domctl_set_opt_feature   set_opt_feature;
         struct xen_domctl_set_target        set_target;
         struct xen_domctl_subscribe         subscribe;
+        struct xen_domctl_debug_op          debug_op;
 #if defined(__i386__) || defined(__x86_64__)
         struct xen_domctl_cpuid             cpuid;
 #endif
index 77be41bb4b31aa4fc9b48e21c02367480dd9e245..3888cc0c836c7ae64a3b77dad078a94eb8399b92 100644 (file)
  */
 #define XEN_ELFNOTE_SUSPEND_CANCEL 14
 
+/*
+ * The (non-default) location the initial phys-to-machine map should be
+ * placed at by the hypervisor (Dom0) or the tools (DomU).
+ * The kernel must be prepared for this mapping to be established using
+ * large pages, despite such otherwise not being available to guests.
+ * The kernel must also be able to handle the page table pages used for
+ * this mapping not being accessible through the initial mapping.
+ * (Only x86-64 supports this at present.)
+ */
+#define XEN_ELFNOTE_INIT_P2M      15
+
 /*
  * The number of the highest elfnote defined.
  */
-#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUSPEND_CANCEL
+#define XEN_ELFNOTE_MAX XEN_ELFNOTE_INIT_P2M
 
 /*
  * System information exported through crash notes.
index 05fc5dc4644273c74aae21aac68a86e0bdefd399..879131cda12a53c25199c3f3e9575d266b13d76b 100644 (file)
 /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */
 #define XENFEAT_mmu_pt_update_preserve_ad  5
 
+/* x86: Does this Xen host support the MMU_{CLEAR,COPY}_PAGE hypercall? */
+#define XENFEAT_highmem_assist             6
+
+/*
+ * If set, GNTTABOP_map_grant_ref honors flags to be placed into guest kernel
+ * available pte bits.
+ */
+#define XENFEAT_gnttab_map_avail_bits      7
+
 #define XENFEAT_NR_SUBMAPS 1
 
 #endif /* __XEN_PUBLIC_FEATURES_H__ */
index 26f2c35b1813d38e99b0086161055ed46c3f2bac..ad116e71e18e0ff1dca14745a4b782ad05000cea 100644 (file)
@@ -360,7 +360,7 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_and_replace_t);
 
 
 /*
- * Bitfield values for update_pin_status.flags.
+ * Bitfield values for gnttab_map_grant_ref.flags.
  */
  /* Map the grant entry for access by I/O devices. */
 #define _GNTMAP_device_map      (0)
@@ -387,6 +387,13 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_and_replace_t);
 #define _GNTMAP_contains_pte    (4)
 #define GNTMAP_contains_pte     (1<<_GNTMAP_contains_pte)
 
+/*
+ * Bits to be placed in guest kernel available PTE bits (architecture
+ * dependent; only supported when XENFEAT_gnttab_map_avail_bits is set).
+ */
+#define _GNTMAP_guest_avail0    (16)
+#define GNTMAP_guest_avail_mask ((uint32_t)~0 << _GNTMAP_guest_avail0)
+
 /*
  * Values for error status returns. All errors are -ve.
  */
index dfe34db1e5ca351b14cf290ad2b6d5b5758c8c3d..b89845515db672670d295fad2f5ab60f8170f535 100644 (file)
@@ -33,9 +33,37 @@ struct hvm_info_table {
     char        signature[8]; /* "HVM INFO" */
     uint32_t    length;
     uint8_t     checksum;
+
+    /* Should firmware build ACPI tables? */
     uint8_t     acpi_enabled;
+
+    /* Should firmware build APIC descriptors (APIC MADT / MP BIOS)? */
     uint8_t     apic_mode;
+
+    /* How many CPUs does this domain have? */
     uint32_t    nr_vcpus;
+
+    /*
+     * MEMORY MAP provided by HVM domain builder.
+     * Notes:
+     *  1. page_to_phys(x) = x << 12
+     *  2. If a field is zero, the corresponding range does not exist.
+     */
+    /*
+     *  0x0 to page_to_phys(low_mem_pgend)-1:
+     *    RAM below 4GB (except for VGA hole 0xA0000-0xBFFFF)
+     */
+    uint32_t    low_mem_pgend;
+    /*
+     *  page_to_phys(reserved_mem_pgstart) to 0xFFFFFFFF:
+     *    Reserved for special memory mappings
+     */
+    uint32_t    reserved_mem_pgstart;
+    /*
+     *  0x100000000 to page_to_phys(high_mem_pgend)-1:
+     *    RAM above 4GB
+     */
+    uint32_t    high_mem_pgend;
 };
 
 #endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
index 62b0d858a8179183b1f4148cd36e1311b3cfbc1e..15d828fe14dc1363101732cad034e27d1101d3b4 100644 (file)
 /* ACPI S state: currently support S0 and S3 on x86. */
 #define HVM_PARAM_ACPI_S_STATE 14
 
-#define HVM_NR_PARAMS          15
+/* TSS used on Intel when CR0.PE=0. */
+#define HVM_PARAM_VM86_TSS     15
+
+/* Boolean: Enable aligning all periodic vpts to reduce interrupts */
+#define HVM_PARAM_VPT_ALIGN    16
+
+#define HVM_NR_PARAMS          17
 
 #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
index 04ef928464a465185eea9df8f80378dbf4579d20..260aac78597252f40073a14bdb4aa52fed2f9679 100644 (file)
@@ -185,7 +185,8 @@ DEFINE_RING_TYPES(fsif, struct fsif_request, struct fsif_response);
 
 #define STATE_INITIALISED     "init"
 #define STATE_READY           "ready"
-
+#define STATE_CLOSING         "closing"
+#define STATE_CLOSED          "closed"
 
 
 #endif
index 0a0ffcc6e2c4f3ba2e8e37de323b73baf1213380..c4177f3949c10fc5088df729985067a8421c45a8 100644 (file)
 
 /* xen_pci_sharedinfo flags */
 #define _XEN_PCIF_active     (0)
-#define XEN_PCIF_active      (1<<_XEN_PCI_active)
+#define XEN_PCIF_active      (1<<_XEN_PCIF_active)
+#define _XEN_PCIB_AERHANDLER (1)
+#define XEN_PCIB_AERHANDLER  (1<<_XEN_PCIB_AERHANDLER)
+#define _XEN_PCIB_active     (2)
+#define XEN_PCIB_active      (1<<_XEN_PCIB_active)
 
 /* xen_pci_op commands */
-#define XEN_PCI_OP_conf_read    (0)
-#define XEN_PCI_OP_conf_write   (1)
-#define XEN_PCI_OP_enable_msi   (2)
-#define XEN_PCI_OP_disable_msi  (3)
-#define XEN_PCI_OP_enable_msix  (4)
-#define XEN_PCI_OP_disable_msix (5)
+#define XEN_PCI_OP_conf_read           (0)
+#define XEN_PCI_OP_conf_write          (1)
+#define XEN_PCI_OP_enable_msi          (2)
+#define XEN_PCI_OP_disable_msi         (3)
+#define XEN_PCI_OP_enable_msix         (4)
+#define XEN_PCI_OP_disable_msix        (5)
+#define XEN_PCI_OP_aer_detected        (6)
+#define XEN_PCI_OP_aer_resume          (7)
+#define XEN_PCI_OP_aer_mmio            (8)
+#define XEN_PCI_OP_aer_slotreset       (9)
 
 /* xen_pci_op error numbers */
 #define XEN_PCI_ERR_success          (0)
@@ -82,10 +90,25 @@ struct xen_pci_op {
     struct xen_msix_entry msix_entries[SH_INFO_MAX_VEC];
 };
 
+/*used for pcie aer handling*/
+struct xen_pcie_aer_op
+{
+
+    /* IN: what action to perform: XEN_PCI_OP_* */
+    uint32_t cmd;
+    /*IN/OUT: return aer_op result or carry error_detected state as input*/
+    int32_t err;
+
+    /* IN: which device to touch */
+    uint32_t domain; /* PCI Domain/Segment*/
+    uint32_t bus;
+    uint32_t devfn;
+};
 struct xen_pci_sharedinfo {
     /* flags - XEN_PCIF_* */
     uint32_t flags;
     struct xen_pci_op op;
+    struct xen_pcie_aer_op aer_op;
 };
 
 #endif /* __XEN_PCI_COMMON_H__ */
diff --git a/xen/include/public/io/usbif.h b/xen/include/public/io/usbif.h
new file mode 100644 (file)
index 0000000..511b368
--- /dev/null
@@ -0,0 +1,121 @@
+/*
+ * usbif.h
+ *
+ * USB I/O interface for Xen guest OSes.
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_IO_USBIF_H__
+#define __XEN_PUBLIC_IO_USBIF_H__
+
+#include "ring.h"
+#include "../grant_table.h"
+
+/*
+ *  USB pipe in usbif_request
+ *
+ *  bits 0-5 are specific bits for virtual USB driver.
+ *  bits 7-31 are standard urb pipe.
+ *
+ *  - port number(NEW):        bits 0-4
+ *                             (USB_MAXCHILDREN is 31)
+ *
+ *  - operation flag(NEW):     bit 5
+ *                             (0 = submit urb,
+ *                              1 = unlink urb)
+ *
+ *  - direction:               bit 7
+ *                             (0 = Host-to-Device [Out]
+ *                           1 = Device-to-Host [In])
+ *
+ *  - device address:  bits 8-14
+ *
+ *  - endpoint:                bits 15-18
+ *
+ *  - pipe type:               bits 30-31
+ *                             (00 = isochronous, 01 = interrupt,
+ *                           10 = control, 11 = bulk)
+ */
+#define usbif_pipeportnum(pipe) ((pipe) & 0x1f)
+#define usbif_setportnum_pipe(pipe,portnum) \
+       ((pipe)|(portnum))
+#define usbif_pipeunlink(pipe) ((pipe) & 0x20)
+#define usbif_setunlink_pipe(pipe) ((pipe)|(0x20))
+
+#define USBIF_BACK_MAX_PENDING_REQS (128)
+#define USBIF_MAX_SEGMENTS_PER_REQUEST (10)
+
+struct usbif_request_segment {
+       grant_ref_t gref;
+       uint16_t offset;
+       uint16_t length;
+};
+
+struct usbif_request {
+       uint16_t id; /* request id */
+       uint16_t nr_buffer_segs; /* number of urb->transfer_buffer segments */
+
+       /* basic urb parameter */
+       uint32_t pipe;
+       uint16_t transfer_flags;
+       uint16_t buffer_length;
+       union {
+               uint8_t ctrl[8]; /* setup_packet (Ctrl) */
+
+               struct {
+                       uint16_t interval; /* maximum (1024*8) in usb core */
+                       uint16_t start_frame; /* start frame */
+                       uint16_t number_of_packets; /* number of ISO packet */
+                       uint16_t nr_frame_desc_segs; /* number of iso_frame_desc segments */
+               } isoc;
+
+               struct {
+                       uint16_t interval; /* maximum (1024*8) in usb core */
+                       uint16_t pad[3];
+               } intr;
+
+               struct {
+                       uint16_t unlink_id; /* unlink request id */
+                       uint16_t pad[3];
+               } unlink;
+
+       } u;
+
+       /* urb data segments */
+       struct usbif_request_segment seg[USBIF_MAX_SEGMENTS_PER_REQUEST];
+};
+typedef struct usbif_request usbif_request_t;
+
+struct usbif_response {
+       uint16_t id; /* request id */
+       uint16_t start_frame;  /* start frame (ISO) */
+       int32_t status; /* status (non-ISO) */
+       int32_t actual_length; /* actual transfer length */
+       int32_t error_count; /* number of ISO errors */
+};
+typedef struct usbif_response usbif_response_t;
+
+DEFINE_RING_TYPES(usbif, struct usbif_request, struct usbif_response);
+#define USB_RING_SIZE __RING_SIZE((struct usbif_sring *)0, PAGE_SIZE)
+
+#endif /* __XEN_PUBLIC_IO_USBIF_H__ */
diff --git a/xen/include/public/io/vscsiif.h b/xen/include/public/io/vscsiif.h
new file mode 100644 (file)
index 0000000..3ce2914
--- /dev/null
@@ -0,0 +1,105 @@
+/******************************************************************************
+ * vscsiif.h
+ * 
+ * Based on the blkif.h code.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright(c) FUJITSU Limited 2008.
+ */
+
+#ifndef __XEN__PUBLIC_IO_SCSI_H__
+#define __XEN__PUBLIC_IO_SCSI_H__
+
+#include "ring.h"
+#include "../grant_table.h"
+
+/* command between backend and frontend */
+#define VSCSIIF_ACT_SCSI_CDB         1    /* SCSI CDB command */
+#define VSCSIIF_ACT_SCSI_ABORT       2    /* SCSI Device(Lun) Abort*/
+#define VSCSIIF_ACT_SCSI_RESET       3    /* SCSI Device(Lun) Reset*/
+
+
+#define VSCSIIF_BACK_MAX_PENDING_REQS    128
+
+/*
+ * Maximum scatter/gather segments per request.
+ *
+ * Considering balance between allocating at least 16 "vscsiif_request"
+ * structures on one page (4096bytes) and number of scatter gather 
+ * needed, we decided to use 26 as a magic number.
+ */
+#define VSCSIIF_SG_TABLESIZE             26
+
+/*
+ * based on Linux kernel 2.6.18
+ */
+#define VSCSIIF_MAX_COMMAND_SIZE         16
+#define VSCSIIF_SENSE_BUFFERSIZE         96
+
+
+struct vscsiif_request {
+    uint16_t rqid;          /* private guest value, echoed in resp  */
+    uint8_t act;            /* command between backend and frontend */
+    uint8_t cmd_len;
+
+    uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE];
+    uint16_t timeout_per_command;     /* The command is issued by twice 
+                                         the value in Backend. */
+    uint16_t channel, id, lun;
+    uint16_t padding;
+    uint8_t sc_data_direction;        /* for DMA_TO_DEVICE(1)
+                                         DMA_FROM_DEVICE(2)
+                                         DMA_NONE(3) requests  */
+    uint8_t nr_segments;              /* Number of pieces of scatter-gather */
+
+    struct scsiif_request_segment {
+        grant_ref_t gref;
+        uint16_t offset;
+        uint16_t length;
+    } seg[VSCSIIF_SG_TABLESIZE];
+    uint32_t reserved[3];
+};
+typedef struct vscsiif_request vscsiif_request_t;
+
+struct vscsiif_response {
+    uint16_t rqid;
+    uint8_t padding;
+    uint8_t sense_len;
+    uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE];
+    int32_t rslt;
+    uint32_t residual_len;     /* request bufflen - 
+                                  return the value from physical device */
+    uint32_t reserved[36];
+};
+typedef struct vscsiif_response vscsiif_response_t;
+
+DEFINE_RING_TYPES(vscsiif, struct vscsiif_request, struct vscsiif_response);
+
+
+#endif  /*__XEN__PUBLIC_IO_SCSI_H__*/
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
index fc19f2fe50847f0c4e1d179816016c12fd878c65..04252226a195c87504b85c42a3c7e077a3ef490f 100644 (file)
@@ -155,27 +155,6 @@ typedef struct xen_kexec_range {
     unsigned long start;
 } xen_kexec_range_t;
 
-/* vmcoreinfo stuff */
-#define VMCOREINFO_BYTES           (4096)
-#define VMCOREINFO_NOTE_NAME       "VMCOREINFO_XEN"
-void arch_crash_save_vmcoreinfo(void);
-void vmcoreinfo_append_str(const char *fmt, ...)
-       __attribute__ ((format (printf, 1, 2)));
-#define VMCOREINFO_PAGESIZE(value) \
-       vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
-#define VMCOREINFO_SYMBOL(name) \
-       vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
-#define VMCOREINFO_SYMBOL_ALIAS(alias, name) \
-       vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #alias, (unsigned long)&name)
-#define VMCOREINFO_STRUCT_SIZE(name) \
-       vmcoreinfo_append_str("SIZE(%s)=%zu\n", #name, sizeof(struct name))
-#define VMCOREINFO_OFFSET(name, field) \
-       vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
-                             (unsigned long)offsetof(struct name, field))
-#define VMCOREINFO_OFFSET_ALIAS(name, field, alias) \
-       vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #alias, \
-                             (unsigned long)offsetof(struct name, field))
-
 #endif /* _XEN_PUBLIC_KEXEC_H */
 
 /*
index d7b9fff97216346a200e6455baa7d6c6e2304165..ba4051e95fe05d019bf34b6db8ed315f4a11b183 100644 (file)
@@ -48,6 +48,8 @@
 /* NUMA node to allocate from. */
 #define XENMEMF_node(x)     (((x) + 1) << 8)
 #define XENMEMF_get_node(x) ((((x) >> 8) - 1) & 0xffu)
+/* Flag to populate physmap with populate-on-demand entries */
+#define XENMEMF_populate_on_demand (1<<16)
 #endif
 
 struct xen_memory_reservation {
@@ -204,7 +206,7 @@ struct xen_add_to_physmap {
     /* Source mapping space. */
 #define XENMAPSPACE_shared_info 0 /* shared info page */
 #define XENMAPSPACE_grant_table 1 /* grant table page */
-#define XENMAPSPACE_mfn         2 /* usual MFN */
+#define XENMAPSPACE_gmfn        2 /* GMFN */
     unsigned int space;
 
     /* Index into source mapping space. */
@@ -216,45 +218,8 @@ struct xen_add_to_physmap {
 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
 
-/*
- * Unmaps the page appearing at a particular GPFN from the specified guest's
- * pseudophysical address space.
- * arg == addr of xen_remove_from_physmap_t.
- */
-#define XENMEM_remove_from_physmap      15
-struct xen_remove_from_physmap {
-    /* Which domain to change the mapping for. */
-    domid_t domid;
-
-    /* GPFN of the current mapping of the page. */
-    xen_pfn_t     gpfn;
-};
-typedef struct xen_remove_from_physmap xen_remove_from_physmap_t;
-DEFINE_XEN_GUEST_HANDLE(xen_remove_from_physmap_t);
-
-/*
- * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
- * code on failure. This call only works for auto-translated guests.
- */
-#define XENMEM_translate_gpfn_list  8
-struct xen_translate_gpfn_list {
-    /* Which domain to translate for? */
-    domid_t domid;
-
-    /* Length of list. */
-    xen_ulong_t nr_gpfns;
-
-    /* List of GPFNs to translate. */
-    XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
-
-    /*
-     * Output list to contain MFN translations. May be the same as the input
-     * list (in which case each input GPFN is overwritten with the output MFN).
-     */
-    XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
-};
-typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
-DEFINE_XEN_GUEST_HANDLE(xen_translate_gpfn_list_t);
+/*** REMOVED ***/
+/*#define XENMEM_translate_gpfn_list  8*/
 
 /*
  * Returns the pseudo-physical memory map as it was when the domain
@@ -299,6 +264,19 @@ struct xen_foreign_memory_map {
 typedef struct xen_foreign_memory_map xen_foreign_memory_map_t;
 DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t);
 
+#define XENMEM_set_pod_target       16
+#define XENMEM_get_pod_target       17
+struct xen_pod_target {
+    /* IN */
+    uint64_t target_pages;
+    /* OUT */
+    uint64_t tot_pages;
+    uint64_t pod_cache_pages;
+    uint64_t pod_entries;
+    /* IN */
+    domid_t domid;
+};
+typedef struct xen_pod_target xen_pod_target_t;
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
 
 /*
index 8057277baab52e2f7fc87ed85e44e3b563e0f962..cb7e4d469bc992c6b0d0f4191f88752e69956b43 100644 (file)
@@ -40,6 +40,21 @@ struct physdev_eoi {
 typedef struct physdev_eoi physdev_eoi_t;
 DEFINE_XEN_GUEST_HANDLE(physdev_eoi_t);
 
+/*
+ * Register a shared page for the hypervisor to indicate whether the guest
+ * must issue PHYSDEVOP_eoi. The semantics of PHYSDEVOP_eoi change slightly
+ * once the guest used this function in that the associated event channel
+ * will automatically get unmasked. The page registered is used as a bit
+ * array indexed by Xen's PIRQ value.
+ */
+#define PHYSDEVOP_pirq_eoi_gmfn         17
+struct physdev_pirq_eoi_gmfn {
+    /* IN */
+    xen_pfn_t gmfn;
+};
+typedef struct physdev_pirq_eoi_gmfn physdev_pirq_eoi_gmfn_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_pirq_eoi_gmfn_t);
+
 /*
  * Query the status of an IRQ line.
  * @arg == pointer to physdev_irq_status_query structure.
@@ -168,6 +183,31 @@ struct physdev_manage_pci {
 typedef struct physdev_manage_pci physdev_manage_pci_t;
 DEFINE_XEN_GUEST_HANDLE(physdev_manage_pci_t);
 
+#define PHYSDEVOP_restore_msi            19
+struct physdev_restore_msi {
+    /* IN */
+    uint8_t bus;
+    uint8_t devfn;
+};
+typedef struct physdev_restore_msi physdev_restore_msi_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_restore_msi_t);
+
+#define PHYSDEVOP_manage_pci_add_ext     20
+struct physdev_manage_pci_ext {
+    /* IN */
+    uint8_t bus;
+    uint8_t devfn;
+    unsigned is_extfn;
+    unsigned is_virtfn;
+    struct {
+        uint8_t bus;
+        uint8_t devfn;
+    } physfn;
+};
+
+typedef struct physdev_manage_pci_ext physdev_manage_pci_ext_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_manage_pci_ext_t);
+
 /*
  * Argument to physdev_op_compat() hypercall. Superceded by new physdev_op()
  * hypercall since 0x00030202.
index 6b10954a53e1d5fa7b5ebbd90fcdb92228af82c4..48d327c06775eaea875bbc78e972be3cbf6c107c 100644 (file)
@@ -262,17 +262,185 @@ struct xen_sysctl_get_pmstat {
 typedef struct xen_sysctl_get_pmstat xen_sysctl_get_pmstat_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_get_pmstat_t);
 
+/*
+ * Status codes. Must be greater than 0 to avoid confusing
+ * sysctl callers that see 0 as a plain successful return.
+ */
+#define XEN_CPU_HOTPLUG_STATUS_OFFLINE 1
+#define XEN_CPU_HOTPLUG_STATUS_ONLINE  2
+#define XEN_CPU_HOTPLUG_STATUS_NEW     3
+
 #define XEN_SYSCTL_cpu_hotplug       11
 struct xen_sysctl_cpu_hotplug {
     /* IN variables */
     uint32_t cpu;   /* Physical cpu. */
 #define XEN_SYSCTL_CPU_HOTPLUG_ONLINE  0
 #define XEN_SYSCTL_CPU_HOTPLUG_OFFLINE 1
+#define XEN_SYSCTL_CPU_HOTPLUG_STATUS 2
     uint32_t op;    /* hotplug opcode */
 };
 typedef struct xen_sysctl_cpu_hotplug xen_sysctl_cpu_hotplug_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpu_hotplug_t);
 
+/*
+ * Get/set xen power management, including
+ * 1. cpufreq governors and related parameters
+ */
+#define XEN_SYSCTL_pm_op        12
+struct xen_userspace {
+    uint32_t scaling_setspeed;
+};
+typedef struct xen_userspace xen_userspace_t;
+
+struct xen_ondemand {
+    uint32_t sampling_rate_max;
+    uint32_t sampling_rate_min;
+
+    uint32_t sampling_rate;
+    uint32_t up_threshold;
+};
+typedef struct xen_ondemand xen_ondemand_t;
+
+/*
+ * The cpufreq parameter names in this structure match the
+ * corresponding sysfs file names used by native Linux.
+ */
+#define CPUFREQ_NAME_LEN 16
+struct xen_get_cpufreq_para {
+    /* IN/OUT variable */
+    uint32_t cpu_num;
+    uint32_t freq_num;
+    uint32_t gov_num;
+
+    /* for all governors */
+    /* OUT variable */
+    XEN_GUEST_HANDLE_64(uint32) affected_cpus;
+    XEN_GUEST_HANDLE_64(uint32) scaling_available_frequencies;
+    XEN_GUEST_HANDLE_64(char)   scaling_available_governors;
+    char scaling_driver[CPUFREQ_NAME_LEN];
+
+    uint32_t cpuinfo_cur_freq;
+    uint32_t cpuinfo_max_freq;
+    uint32_t cpuinfo_min_freq;
+    uint32_t scaling_cur_freq;
+
+    char scaling_governor[CPUFREQ_NAME_LEN];
+    uint32_t scaling_max_freq;
+    uint32_t scaling_min_freq;
+
+    /* for specific governor */
+    union {
+        struct  xen_userspace userspace;
+        struct  xen_ondemand ondemand;
+    } u;
+};
+
+struct xen_set_cpufreq_gov {
+    char scaling_governor[CPUFREQ_NAME_LEN];
+};
+
+struct xen_set_cpufreq_para {
+    #define SCALING_MAX_FREQ           1
+    #define SCALING_MIN_FREQ           2
+    #define SCALING_SETSPEED           3
+    #define SAMPLING_RATE              4
+    #define UP_THRESHOLD               5
+
+    uint32_t ctrl_type;
+    uint32_t ctrl_value;
+};
+
+/* Get physical CPU topology information. */
+#define INVALID_TOPOLOGY_ID  (~0U)
+struct xen_get_cputopo {
+     /* IN: maximum addressable entry in
+      * the caller-provided cpu_to_core/socket.
+      */
+    uint32_t max_cpus;
+    XEN_GUEST_HANDLE_64(uint32) cpu_to_core;
+    XEN_GUEST_HANDLE_64(uint32) cpu_to_socket;
+
+    /* OUT: number of cpus returned
+     * If OUT is greater than IN then the cpu_to_core/socket is truncated!
+     */
+    uint32_t nr_cpus;
+};
+
+struct xen_sysctl_pm_op {
+    #define PM_PARA_CATEGORY_MASK      0xf0
+    #define CPUFREQ_PARA               0x10
+
+    /* cpufreq command type */
+    #define GET_CPUFREQ_PARA           (CPUFREQ_PARA | 0x01)
+    #define SET_CPUFREQ_GOV            (CPUFREQ_PARA | 0x02)
+    #define SET_CPUFREQ_PARA           (CPUFREQ_PARA | 0x03)
+    #define GET_CPUFREQ_AVGFREQ        (CPUFREQ_PARA | 0x04)
+
+    /* get CPU topology */
+    #define XEN_SYSCTL_pm_op_get_cputopo  0x20
+
+    /* set/reset scheduler power saving option */
+    #define XEN_SYSCTL_pm_op_set_sched_opt_smt    0x21
+
+    uint32_t cmd;
+    uint32_t cpuid;
+    union {
+        struct xen_get_cpufreq_para get_para;
+        struct xen_set_cpufreq_gov  set_gov;
+        struct xen_set_cpufreq_para set_para;
+        uint64_t get_avgfreq;
+        struct xen_get_cputopo      get_topo;
+        uint32_t                    set_sched_opt_smt;
+    };
+};
+
+#define XEN_SYSCTL_page_offline_op        14
+struct xen_sysctl_page_offline_op {
+    /* IN: range of page to be offlined */
+#define sysctl_page_offline     1
+#define sysctl_page_online      2
+#define sysctl_query_page_offline  3
+    uint32_t cmd;
+    uint32_t start;
+    uint32_t end;
+    /* OUT: result of page offline request */
+    /*
+     * bit 0~15: result flags
+     * bit 16~31: owner
+     */
+    XEN_GUEST_HANDLE(uint32) status;
+};
+
+#define PG_OFFLINE_STATUS_MASK    (0xFFUL)
+
+/* The result is invalid, i.e. HV does not handle it */
+#define PG_OFFLINE_INVALID   (0x1UL << 0)
+
+#define PG_OFFLINE_OFFLINED  (0x1UL << 1)
+#define PG_OFFLINE_PENDING   (0x1UL << 2)
+#define PG_OFFLINE_FAILED    (0x1UL << 3)
+
+#define PG_ONLINE_FAILED     PG_OFFLINE_FAILED
+#define PG_ONLINE_ONLINED    PG_OFFLINE_OFFLINED
+
+#define PG_OFFLINE_STATUS_OFFLINED              (0x1UL << 1)
+#define PG_OFFLINE_STATUS_ONLINE                (0x1UL << 2)
+#define PG_OFFLINE_STATUS_OFFLINE_PENDING       (0x1UL << 3)
+#define PG_OFFLINE_STATUS_BROKEN                (0x1UL << 4)
+
+#define PG_OFFLINE_MISC_MASK    (0xFFUL << 4)
+
+/* only valid when PG_OFFLINE_FAILED */
+#define PG_OFFLINE_XENPAGE   (0x1UL << 8)
+#define PG_OFFLINE_DOM0PAGE  (0x1UL << 9)
+#define PG_OFFLINE_ANONYMOUS (0x1UL << 10)
+#define PG_OFFLINE_NOT_CONV_RAM   (0x1UL << 11)
+#define PG_OFFLINE_OWNED     (0x1UL << 12)
+
+#define PG_OFFLINE_BROKEN    (0x1UL << 13)
+#define PG_ONLINE_BROKEN     PG_OFFLINE_BROKEN
+
+#define PG_OFFLINE_OWNER_SHIFT 16
 
 struct xen_sysctl {
     uint32_t cmd;
@@ -289,6 +457,8 @@ struct xen_sysctl {
         struct xen_sysctl_availheap         availheap;
         struct xen_sysctl_get_pmstat        get_pmstat;
         struct xen_sysctl_cpu_hotplug       cpu_hotplug;
+        struct xen_sysctl_pm_op             pm_op;
+        struct xen_sysctl_page_offline_op   page_offline;
         uint8_t                             pad[128];
     } u;
 };
index 5cce0287ebf75c2a2a2615cc5efb8451054afbee..83e09f3b593f21b50481e3c20fa8dc6f482ed520 100644 (file)
@@ -60,7 +60,8 @@
 #define TRC_TRACE_WRAP_BUFFER  (TRC_GEN + 2)
 #define TRC_TRACE_CPU_CHANGE    (TRC_GEN + 3)
 
-#define TRC_SCHED_RUNSTATE_CHANGE (TRC_SCHED_MIN + 1)
+#define TRC_SCHED_RUNSTATE_CHANGE   (TRC_SCHED_MIN + 1)
+#define TRC_SCHED_CONTINUE_RUNNING  (TRC_SCHED_MIN + 2)
 #define TRC_SCHED_DOM_ADD        (TRC_SCHED_VERBOSE +  1)
 #define TRC_SCHED_DOM_REM        (TRC_SCHED_VERBOSE +  2)
 #define TRC_SCHED_SLEEP          (TRC_SCHED_VERBOSE +  3)
 #define TRC_HVM_INVLPG64        (TRC_HVM_HANDLER + TRC_64_FLAG + 0x14)
 #define TRC_HVM_MCE             (TRC_HVM_HANDLER + 0x15)
 #define TRC_HVM_IO_ASSIST       (TRC_HVM_HANDLER + 0x16)
+#define TRC_HVM_IO_ASSIST64     (TRC_HVM_HANDLER + TRC_64_FLAG + 0x16)
 #define TRC_HVM_MMIO_ASSIST     (TRC_HVM_HANDLER + 0x17)
+#define TRC_HVM_MMIO_ASSIST64   (TRC_HVM_HANDLER + TRC_64_FLAG + 0x17)
 #define TRC_HVM_CLTS            (TRC_HVM_HANDLER + 0x18)
 #define TRC_HVM_LMSW            (TRC_HVM_HANDLER + 0x19)
 #define TRC_HVM_LMSW64          (TRC_HVM_HANDLER + TRC_64_FLAG + 0x19)
+#define TRC_HVM_INTR_WINDOW     (TRC_HVM_HANDLER + 0X20)
 
 /* trace subclasses for power management */
 #define TRC_PM_FREQ     0x00801000      /* xen cpu freq events */
index 4b444b4c71d51bfc97dce5c33a2133d15f43ae8b..fb03d0c22ca7cfe572aff1a476c226e117c665e9 100644 (file)
@@ -231,6 +231,13 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
  * cmd: MMUEXT_SET_LDT
  * linear_addr: Linear address of LDT base (NB. must be page-aligned).
  * nr_ents: Number of entries in LDT.
+ *
+ * cmd: MMUEXT_CLEAR_PAGE
+ * mfn: Machine frame number to be cleared.
+ *
+ * cmd: MMUEXT_COPY_PAGE
+ * mfn: Machine frame number of the destination page.
+ * src_mfn: Machine frame number of the source page.
  */
 #define MMUEXT_PIN_L1_TABLE      0
 #define MMUEXT_PIN_L2_TABLE      1
@@ -247,12 +254,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 #define MMUEXT_FLUSH_CACHE      12
 #define MMUEXT_SET_LDT          13
 #define MMUEXT_NEW_USER_BASEPTR 15
+#define MMUEXT_CLEAR_PAGE       16
+#define MMUEXT_COPY_PAGE        17
 
 #ifndef __ASSEMBLY__
 struct mmuext_op {
     unsigned int cmd;
     union {
-        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
+        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR
+         * CLEAR_PAGE, COPY_PAGE */
         xen_pfn_t     mfn;
         /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
         unsigned long linear_addr;
@@ -266,6 +276,8 @@ struct mmuext_op {
 #else
         void *vcpumask;
 #endif
+        /* COPY_PAGE */
+        xen_pfn_t src_mfn;
     } arg2;
 };
 typedef struct mmuext_op mmuext_op_t;
@@ -342,6 +354,9 @@ typedef uint16_t domid_t;
  */
 #define DOMID_XEN  (0x7FF2U)
 
+/* DOMID_INVALID is used to identify an invalid domid */
+#define DOMID_INVALID (0x7FFFU)
+
 /*
  * Send an array of these to HYPERVISOR_mmu_update().
  * NB. The fields are natural pointer/address size for this architecture.
@@ -501,6 +516,7 @@ typedef struct shared_info shared_info_t;
  *      a. relocated kernel image
  *      b. initial ram disk              [mod_start, mod_len]
  *      c. list of allocated page frames [mfn_list, nr_pages]
+ *         (unless relocated due to XEN_ELFNOTE_INIT_P2M)
  *      d. start_info_t structure        [register ESI (x86)]
  *      e. bootstrap page tables         [pt_base, CR3 (x86)]
  *      f. bootstrap stack               [register ESP (x86)]
@@ -542,6 +558,9 @@ struct start_info {
     unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
     unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
     int8_t cmd_line[MAX_GUEST_CMDLINE];
+    /* The pfn range here covers both page table and p->m table frames.   */
+    unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
+    unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */
 };
 typedef struct start_info start_info_t;
 
index e08a1a561226aca0ce834530c2c513015abaa99a..cbf795aeaf179de595c422dac9bd49b86fe3fef5 100644 (file)
@@ -282,7 +282,6 @@ typedef int (*acpi_table_entry_handler) (struct acpi_subtable_header *header, co
 
 unsigned int acpi_get_processor_id (unsigned int cpu);
 char * __acpi_map_table (unsigned long phys_addr, unsigned long size);
-unsigned long acpi_find_rsdp (void);
 int acpi_boot_init (void);
 int acpi_boot_table_init (void);
 int acpi_numa_init (void);
index 86395d134e8915fe7941c5afe38be99ae6fcd124..63edd3e6606ba1657cff06def5e409ac73c76faf 100644 (file)
@@ -178,15 +178,10 @@ void xlat_vcpu_runstate_info(struct vcpu_runstate_info *);
 int switch_compat(struct domain *);
 int switch_native(struct domain *);
 
-#define BITS_PER_GUEST_LONG(d) \
-    (!IS_COMPAT(d) ? BITS_PER_LONG : COMPAT_BITS_PER_LONG)
-
 #else
 
 #define compat_handle_is_null(hnd) 0
 
-#define BITS_PER_GUEST_LONG(d) BITS_PER_LONG
-
 #endif
 
 #endif /* __XEN_COMPAT_H__ */
index 32d71d15f30b01d8351b0fc754b0cdab0038fbc1..559db769dcd234244a968d3fd11cbf39905330b7 100644 (file)
 #define ACPI_PROCESSOR_MAX_POWER        8
 #define CPUIDLE_NAME_LEN                16
 
+#define ACPI_CSTATE_EM_NONE     0
+#define ACPI_CSTATE_EM_SYSIO    1
+#define ACPI_CSTATE_EM_FFH      2
+#define ACPI_CSTATE_EM_HALT     3
+
 struct acpi_processor_cx
 {
+    u8 idx;
     u8 valid;
     u8 type;
     u32 address;
-    u8 space_id;
+    u8 entry_method; /* ACPI_CSTATE_EM_xxx */
     u32 latency;
     u32 latency_ticks;
     u32 power;
index a6f8e259422284ef9ea6c6d09bec02a1967cfc95..23dfd09a141f7eceb3f4ae6ce2f429bd2366c163 100644 (file)
@@ -38,6 +38,8 @@
  *
  * int first_cpu(mask)                 Number lowest set bit, or NR_CPUS
  * int next_cpu(cpu, mask)             Next cpu past 'cpu', or NR_CPUS
+ * int last_cpu(mask)                  Number highest set bit, or NR_CPUS
+ * int cycle_cpu(cpu, mask)            Next cpu cycling from 'cpu', or NR_CPUS
  *
  * cpumask_t cpumask_of_cpu(cpu)       Return cpumask with bit 'cpu' set
  * CPU_MASK_ALL                                Initializer - all bits set
@@ -225,12 +227,23 @@ static inline int __next_cpu(int n, const cpumask_t *srcp, int nbits)
 #define last_cpu(src) __last_cpu(&(src), NR_CPUS)
 static inline int __last_cpu(const cpumask_t *srcp, int nbits)
 {
-       int cpu, pcpu = NR_CPUS;
-       for (cpu = first_cpu(*srcp); cpu < NR_CPUS; cpu = next_cpu(cpu, *srcp))
+       int cpu, pcpu = nbits;
+       for (cpu = __first_cpu(srcp, nbits);
+            cpu < nbits;
+            cpu = __next_cpu(cpu, srcp, nbits))
                pcpu = cpu;
        return pcpu;
 }
 
+#define cycle_cpu(n, src) __cycle_cpu((n), &(src), NR_CPUS)
+static inline int __cycle_cpu(int n, const cpumask_t *srcp, int nbits)
+{
+    int nxt = __next_cpu(n, srcp, nbits);
+    if (nxt == nbits)
+        nxt = __first_cpu(srcp, nbits);
+    return nxt;
+}
+
 #define cpumask_of_cpu(cpu)                                            \
 ({                                                                     \
        typeof(_unused_cpumask_arg_) m;                                 \
index cf82d07ea065aeeec40db3eff303273c28e737b6..65df554421e2a0d565a7c27af5bcb324276af9bd 100644 (file)
@@ -23,6 +23,10 @@ void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info);
  * Arch-specifics.
  */
 
+/* Allocate/free a domain structure. */
+struct domain *alloc_domain_struct(void);
+void free_domain_struct(struct domain *d);
+
 /* Allocate/free a VCPU structure. */
 struct vcpu *alloc_vcpu_struct(void);
 void free_vcpu_struct(struct vcpu *v);
index cd2226678b5777d904b69c4ef33e62c0d90ab2be..5240179cf0b3c97506964fb43d7558e641a60367 100644 (file)
@@ -24,7 +24,7 @@ void *map_domain_page(unsigned long mfn);
  * Pass a VA within a page previously mapped in the context of the
  * currently-executing VCPU via a call to map_domain_page().
  */
-void unmap_domain_page(void *va);
+void unmap_domain_page(const void *va);
 
 /*
  * Similar to the above calls, except the mapping is accessible in all
@@ -32,7 +32,7 @@ void unmap_domain_page(void *va);
  * mappings can also be unmapped from any context.
  */
 void *map_domain_page_global(unsigned long mfn);
-void unmap_domain_page_global(void *va);
+void unmap_domain_page_global(const void *va);
 
 #define DMCACHE_ENTRY_VALID 1U
 #define DMCACHE_ENTRY_HELD  2U
@@ -75,7 +75,7 @@ map_domain_page_with_cache(unsigned long mfn, struct domain_mmap_cache *cache)
 }
 
 static inline void
-unmap_domain_page_with_cache(void *va, struct domain_mmap_cache *cache)
+unmap_domain_page_with_cache(const void *va, struct domain_mmap_cache *cache)
 {
     ASSERT(cache != NULL);
     cache->flags &= ~DMCACHE_ENTRY_HELD;
index 209f4fa47e996b9ffce69f76b6f675249766d695..03faac7bf2bcaac611d6fc7e7a268031b789b4f0 100644 (file)
@@ -27,7 +27,7 @@
 #ifndef __XEN_ELF_H__
 #define __XEN_ELF_H__
 
-#include <public/elfstructs.h>
+#include <xen/elfstructs.h>
 
 #define ELFNOTE_ALIGN(_n_) (((_n_)+3)&~3)
 #define ELFNOTE_NAME(_n_) ((char*)(_n_) + sizeof(*(_n_)))
diff --git a/xen/include/xen/elfstructs.h b/xen/include/xen/elfstructs.h
new file mode 100644 (file)
index 0000000..62f9399
--- /dev/null
@@ -0,0 +1,527 @@
+#ifndef __XEN_ELFSTRUCTS_H__
+#define __XEN_ELFSTRUCTS_H__
+/*
+ * Copyright (c) 1995, 1996 Erik Theisen.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+typedef uint8_t                Elf_Byte;
+
+typedef uint32_t       Elf32_Addr;     /* Unsigned program address */
+typedef uint32_t       Elf32_Off;      /* Unsigned file offset */
+typedef int32_t                Elf32_Sword;    /* Signed large integer */
+typedef uint32_t       Elf32_Word;     /* Unsigned large integer */
+typedef uint16_t       Elf32_Half;     /* Unsigned medium integer */
+
+typedef uint64_t       Elf64_Addr;
+typedef uint64_t       Elf64_Off;
+typedef int32_t                Elf64_Shalf;
+
+typedef int32_t                Elf64_Sword;
+typedef uint32_t       Elf64_Word;
+
+typedef int64_t                Elf64_Sxword;
+typedef uint64_t       Elf64_Xword;
+
+typedef uint32_t       Elf64_Half;
+typedef uint16_t       Elf64_Quarter;
+
+/*
+ * e_ident[] identification indexes
+ * See http://www.caldera.com/developers/gabi/2000-07-17/ch4.eheader.html 
+ */
+#define EI_MAG0                0               /* file ID */
+#define EI_MAG1                1               /* file ID */
+#define EI_MAG2                2               /* file ID */
+#define EI_MAG3                3               /* file ID */
+#define EI_CLASS       4               /* file class */
+#define EI_DATA                5               /* data encoding */
+#define EI_VERSION     6               /* ELF header version */
+#define EI_OSABI       7               /* OS/ABI ID */
+#define EI_ABIVERSION  8               /* ABI version */
+#define EI_PAD         9               /* start of pad bytes */
+#define EI_NIDENT      16              /* Size of e_ident[] */
+
+/* e_ident[] magic number */
+#define        ELFMAG0         0x7f            /* e_ident[EI_MAG0] */
+#define        ELFMAG1         'E'             /* e_ident[EI_MAG1] */
+#define        ELFMAG2         'L'             /* e_ident[EI_MAG2] */
+#define        ELFMAG3         'F'             /* e_ident[EI_MAG3] */
+#define        ELFMAG          "\177ELF"       /* magic */
+#define        SELFMAG         4               /* size of magic */
+
+/* e_ident[] file class */
+#define        ELFCLASSNONE    0               /* invalid */
+#define        ELFCLASS32      1               /* 32-bit objs */
+#define        ELFCLASS64      2               /* 64-bit objs */
+#define        ELFCLASSNUM     3               /* number of classes */
+
+/* e_ident[] data encoding */
+#define ELFDATANONE    0               /* invalid */
+#define ELFDATA2LSB    1               /* Little-Endian */
+#define ELFDATA2MSB    2               /* Big-Endian */
+#define ELFDATANUM     3               /* number of data encode defines */
+
+/* e_ident[] Operating System/ABI */
+#define ELFOSABI_SYSV          0       /* UNIX System V ABI */
+#define ELFOSABI_HPUX          1       /* HP-UX operating system */
+#define ELFOSABI_NETBSD                2       /* NetBSD */
+#define ELFOSABI_LINUX         3       /* GNU/Linux */
+#define ELFOSABI_HURD          4       /* GNU/Hurd */
+#define ELFOSABI_86OPEN                5       /* 86Open common IA32 ABI */
+#define ELFOSABI_SOLARIS       6       /* Solaris */
+#define ELFOSABI_MONTEREY      7       /* Monterey */
+#define ELFOSABI_IRIX          8       /* IRIX */
+#define ELFOSABI_FREEBSD       9       /* FreeBSD */
+#define ELFOSABI_TRU64         10      /* TRU64 UNIX */
+#define ELFOSABI_MODESTO       11      /* Novell Modesto */
+#define ELFOSABI_OPENBSD       12      /* OpenBSD */
+#define ELFOSABI_ARM           97      /* ARM */
+#define ELFOSABI_STANDALONE    255     /* Standalone (embedded) application */
+
+/* e_ident */
+#define IS_ELF(ehdr) ((ehdr).e_ident[EI_MAG0] == ELFMAG0 && \
+                      (ehdr).e_ident[EI_MAG1] == ELFMAG1 && \
+                      (ehdr).e_ident[EI_MAG2] == ELFMAG2 && \
+                      (ehdr).e_ident[EI_MAG3] == ELFMAG3)
+
+/* ELF Header */
+typedef struct elfhdr {
+       unsigned char   e_ident[EI_NIDENT]; /* ELF Identification */
+       Elf32_Half      e_type;         /* object file type */
+       Elf32_Half      e_machine;      /* machine */
+       Elf32_Word      e_version;      /* object file version */
+       Elf32_Addr      e_entry;        /* virtual entry point */
+       Elf32_Off       e_phoff;        /* program header table offset */
+       Elf32_Off       e_shoff;        /* section header table offset */
+       Elf32_Word      e_flags;        /* processor-specific flags */
+       Elf32_Half      e_ehsize;       /* ELF header size */
+       Elf32_Half      e_phentsize;    /* program header entry size */
+       Elf32_Half      e_phnum;        /* number of program header entries */
+       Elf32_Half      e_shentsize;    /* section header entry size */
+       Elf32_Half      e_shnum;        /* number of section header entries */
+       Elf32_Half      e_shstrndx;     /* section header table's "section
+                                          header string table" entry offset */
+} Elf32_Ehdr;
+
+typedef struct {
+       unsigned char   e_ident[EI_NIDENT];     /* Id bytes */
+       Elf64_Quarter   e_type;                 /* file type */
+       Elf64_Quarter   e_machine;              /* machine type */
+       Elf64_Half      e_version;              /* version number */
+       Elf64_Addr      e_entry;                /* entry point */
+       Elf64_Off       e_phoff;                /* Program hdr offset */
+       Elf64_Off       e_shoff;                /* Section hdr offset */
+       Elf64_Half      e_flags;                /* Processor flags */
+       Elf64_Quarter   e_ehsize;               /* sizeof ehdr */
+       Elf64_Quarter   e_phentsize;            /* Program header entry size */
+       Elf64_Quarter   e_phnum;                /* Number of program headers */
+       Elf64_Quarter   e_shentsize;            /* Section header entry size */
+       Elf64_Quarter   e_shnum;                /* Number of section headers */
+       Elf64_Quarter   e_shstrndx;             /* String table index */
+} Elf64_Ehdr;
+
+/* e_type */
+#define ET_NONE                0               /* No file type */
+#define ET_REL         1               /* relocatable file */
+#define ET_EXEC                2               /* executable file */
+#define ET_DYN         3               /* shared object file */
+#define ET_CORE                4               /* core file */
+#define ET_NUM         5               /* number of types */
+#define ET_LOPROC      0xff00          /* reserved range for processor */
+#define ET_HIPROC      0xffff          /*  specific e_type */
+
+/* e_machine */
+#define EM_NONE                0               /* No Machine */
+#define EM_M32         1               /* AT&T WE 32100 */
+#define EM_SPARC       2               /* SPARC */
+#define EM_386         3               /* Intel 80386 */
+#define EM_68K         4               /* Motorola 68000 */
+#define EM_88K         5               /* Motorola 88000 */
+#define EM_486         6               /* Intel 80486 - unused? */
+#define EM_860         7               /* Intel 80860 */
+#define EM_MIPS                8               /* MIPS R3000 Big-Endian only */
+/*
+ * Don't know if EM_MIPS_RS4_BE,
+ * EM_SPARC64, EM_PARISC,
+ * or EM_PPC are ABI compliant
+ */
+#define EM_MIPS_RS4_BE 10              /* MIPS R4000 Big-Endian */
+#define EM_SPARC64     11              /* SPARC v9 64-bit unofficial */
+#define EM_PARISC      15              /* HPPA */
+#define EM_SPARC32PLUS 18              /* Enhanced instruction set SPARC */
+#define EM_PPC         20              /* PowerPC */
+#define EM_PPC64       21              /* PowerPC 64-bit */
+#define EM_ARM         40              /* Advanced RISC Machines ARM */
+#define EM_ALPHA       41              /* DEC ALPHA */
+#define EM_SPARCV9     43              /* SPARC version 9 */
+#define EM_ALPHA_EXP   0x9026          /* DEC ALPHA */
+#define EM_IA_64       50              /* Intel Merced */
+#define EM_X86_64      62              /* AMD x86-64 architecture */
+#define EM_VAX         75              /* DEC VAX */
+
+/* Version */
+#define EV_NONE                0               /* Invalid */
+#define EV_CURRENT     1               /* Current */
+#define EV_NUM         2               /* number of versions */
+
+/* Section Header */
+typedef struct {
+       Elf32_Word      sh_name;        /* name - index into section header
+                                          string table section */
+       Elf32_Word      sh_type;        /* type */
+       Elf32_Word      sh_flags;       /* flags */
+       Elf32_Addr      sh_addr;        /* address */
+       Elf32_Off       sh_offset;      /* file offset */
+       Elf32_Word      sh_size;        /* section size */
+       Elf32_Word      sh_link;        /* section header table index link */
+       Elf32_Word      sh_info;        /* extra information */
+       Elf32_Word      sh_addralign;   /* address alignment */
+       Elf32_Word      sh_entsize;     /* section entry size */
+} Elf32_Shdr;
+
+typedef struct {
+       Elf64_Half      sh_name;        /* section name */
+       Elf64_Half      sh_type;        /* section type */
+       Elf64_Xword     sh_flags;       /* section flags */
+       Elf64_Addr      sh_addr;        /* virtual address */
+       Elf64_Off       sh_offset;      /* file offset */
+       Elf64_Xword     sh_size;        /* section size */
+       Elf64_Half      sh_link;        /* link to another */
+       Elf64_Half      sh_info;        /* misc info */
+       Elf64_Xword     sh_addralign;   /* memory alignment */
+       Elf64_Xword     sh_entsize;     /* table entry size */
+} Elf64_Shdr;
+
+/* Special Section Indexes */
+#define SHN_UNDEF      0               /* undefined */
+#define SHN_LORESERVE  0xff00          /* lower bounds of reserved indexes */
+#define SHN_LOPROC     0xff00          /* reserved range for processor */
+#define SHN_HIPROC     0xff1f          /*   specific section indexes */
+#define SHN_ABS                0xfff1          /* absolute value */
+#define SHN_COMMON     0xfff2          /* common symbol */
+#define SHN_HIRESERVE  0xffff          /* upper bounds of reserved indexes */
+
+/* sh_type */
+#define SHT_NULL       0               /* inactive */
+#define SHT_PROGBITS   1               /* program defined information */
+#define SHT_SYMTAB     2               /* symbol table section */
+#define SHT_STRTAB     3               /* string table section */
+#define SHT_RELA       4               /* relocation section with addends*/
+#define SHT_HASH       5               /* symbol hash table section */
+#define SHT_DYNAMIC    6               /* dynamic section */
+#define SHT_NOTE       7               /* note section */
+#define SHT_NOBITS     8               /* no space section */
+#define SHT_REL                9               /* relation section without addends */
+#define SHT_SHLIB      10              /* reserved - purpose unknown */
+#define SHT_DYNSYM     11              /* dynamic symbol table section */
+#define SHT_NUM                12              /* number of section types */
+#define SHT_LOPROC     0x70000000      /* reserved range for processor */
+#define SHT_HIPROC     0x7fffffff      /*  specific section header types */
+#define SHT_LOUSER     0x80000000      /* reserved range for application */
+#define SHT_HIUSER     0xffffffff      /*  specific indexes */
+
+/* Section names */
+#define ELF_BSS         ".bss"         /* uninitialized data */
+#define ELF_DATA        ".data"                /* initialized data */
+#define ELF_DEBUG       ".debug"       /* debug */
+#define ELF_DYNAMIC     ".dynamic"     /* dynamic linking information */
+#define ELF_DYNSTR      ".dynstr"      /* dynamic string table */
+#define ELF_DYNSYM      ".dynsym"      /* dynamic symbol table */
+#define ELF_FINI        ".fini"                /* termination code */
+#define ELF_GOT         ".got"         /* global offset table */
+#define ELF_HASH        ".hash"                /* symbol hash table */
+#define ELF_INIT        ".init"                /* initialization code */
+#define ELF_REL_DATA    ".rel.data"    /* relocation data */
+#define ELF_REL_FINI    ".rel.fini"    /* relocation termination code */
+#define ELF_REL_INIT    ".rel.init"    /* relocation initialization code */
+#define ELF_REL_DYN     ".rel.dyn"     /* relocation dynamic link info */
+#define ELF_REL_RODATA  ".rel.rodata"  /* relocation read-only data */
+#define ELF_REL_TEXT    ".rel.text"    /* relocation code */
+#define ELF_RODATA      ".rodata"      /* read-only data */
+#define ELF_SHSTRTAB    ".shstrtab"    /* section header string table */
+#define ELF_STRTAB      ".strtab"      /* string table */
+#define ELF_SYMTAB      ".symtab"      /* symbol table */
+#define ELF_TEXT        ".text"                /* code */
+
+
+/* Section Attribute Flags - sh_flags */
+#define SHF_WRITE      0x1             /* Writable */
+#define SHF_ALLOC      0x2             /* occupies memory */
+#define SHF_EXECINSTR  0x4             /* executable */
+#define SHF_MASKPROC   0xf0000000      /* reserved bits for processor */
+                                       /*  specific section attributes */
+
+/* Symbol Table Entry */
+typedef struct elf32_sym {
+       Elf32_Word      st_name;        /* name - index into string table */
+       Elf32_Addr      st_value;       /* symbol value */
+       Elf32_Word      st_size;        /* symbol size */
+       unsigned char   st_info;        /* type and binding */
+       unsigned char   st_other;       /* 0 - no defined meaning */
+       Elf32_Half      st_shndx;       /* section header index */
+} Elf32_Sym;
+
+typedef struct {
+       Elf64_Half      st_name;        /* Symbol name index in str table */
+       Elf_Byte        st_info;        /* type / binding attrs */
+       Elf_Byte        st_other;       /* unused */
+       Elf64_Quarter   st_shndx;       /* section index of symbol */
+       Elf64_Xword     st_value;       /* value of symbol */
+       Elf64_Xword     st_size;        /* size of symbol */
+} Elf64_Sym;
+
+/* Symbol table index */
+#define STN_UNDEF      0               /* undefined */
+
+/* Extract symbol info - st_info */
+#define ELF32_ST_BIND(x)       ((x) >> 4)
+#define ELF32_ST_TYPE(x)       (((unsigned int) x) & 0xf)
+#define ELF32_ST_INFO(b,t)     (((b) << 4) + ((t) & 0xf))
+
+#define ELF64_ST_BIND(x)       ((x) >> 4)
+#define ELF64_ST_TYPE(x)       (((unsigned int) x) & 0xf)
+#define ELF64_ST_INFO(b,t)     (((b) << 4) + ((t) & 0xf))
+
+/* Symbol Binding - ELF32_ST_BIND - st_info */
+#define STB_LOCAL      0               /* Local symbol */
+#define STB_GLOBAL     1               /* Global symbol */
+#define STB_WEAK       2               /* like global - lower precedence */
+#define STB_NUM                3               /* number of symbol bindings */
+#define STB_LOPROC     13              /* reserved range for processor */
+#define STB_HIPROC     15              /*  specific symbol bindings */
+
+/* Symbol type - ELF32_ST_TYPE - st_info */
+#define STT_NOTYPE     0               /* not specified */
+#define STT_OBJECT     1               /* data object */
+#define STT_FUNC       2               /* function */
+#define STT_SECTION    3               /* section */
+#define STT_FILE       4               /* file */
+#define STT_NUM                5               /* number of symbol types */
+#define STT_LOPROC     13              /* reserved range for processor */
+#define STT_HIPROC     15              /*  specific symbol types */
+
+/* Relocation entry with implicit addend */
+typedef struct {
+       Elf32_Addr      r_offset;       /* offset of relocation */
+       Elf32_Word      r_info;         /* symbol table index and type */
+} Elf32_Rel;
+
+/* Relocation entry with explicit addend */
+typedef struct {
+       Elf32_Addr      r_offset;       /* offset of relocation */
+       Elf32_Word      r_info;         /* symbol table index and type */
+       Elf32_Sword     r_addend;
+} Elf32_Rela;
+
+/* Extract relocation info - r_info */
+#define ELF32_R_SYM(i)         ((i) >> 8)
+#define ELF32_R_TYPE(i)                ((unsigned char) (i))
+#define ELF32_R_INFO(s,t)      (((s) << 8) + (unsigned char)(t))
+
+typedef struct {
+       Elf64_Xword     r_offset;       /* where to do it */
+       Elf64_Xword     r_info;         /* index & type of relocation */
+} Elf64_Rel;
+
+typedef struct {
+       Elf64_Xword     r_offset;       /* where to do it */
+       Elf64_Xword     r_info;         /* index & type of relocation */
+       Elf64_Sxword    r_addend;       /* adjustment value */
+} Elf64_Rela;
+
+#define        ELF64_R_SYM(info)       ((info) >> 32)
+#define        ELF64_R_TYPE(info)      ((info) & 0xFFFFFFFF)
+#define ELF64_R_INFO(s,t)      (((s) << 32) + (u_int32_t)(t))
+
+/* Program Header */
+typedef struct {
+       Elf32_Word      p_type;         /* segment type */
+       Elf32_Off       p_offset;       /* segment offset */
+       Elf32_Addr      p_vaddr;        /* virtual address of segment */
+       Elf32_Addr      p_paddr;        /* physical address - ignored? */
+       Elf32_Word      p_filesz;       /* number of bytes in file for seg. */
+       Elf32_Word      p_memsz;        /* number of bytes in mem. for seg. */
+       Elf32_Word      p_flags;        /* flags */
+       Elf32_Word      p_align;        /* memory alignment */
+} Elf32_Phdr;
+
+typedef struct {
+       Elf64_Half      p_type;         /* entry type */
+       Elf64_Half      p_flags;        /* flags */
+       Elf64_Off       p_offset;       /* offset */
+       Elf64_Addr      p_vaddr;        /* virtual address */
+       Elf64_Addr      p_paddr;        /* physical address */
+       Elf64_Xword     p_filesz;       /* file size */
+       Elf64_Xword     p_memsz;        /* memory size */
+       Elf64_Xword     p_align;        /* memory & file alignment */
+} Elf64_Phdr;
+
+/* Segment types - p_type */
+#define PT_NULL                0               /* unused */
+#define PT_LOAD                1               /* loadable segment */
+#define PT_DYNAMIC     2               /* dynamic linking section */
+#define PT_INTERP      3               /* the RTLD */
+#define PT_NOTE                4               /* auxiliary information */
+#define PT_SHLIB       5               /* reserved - purpose undefined */
+#define PT_PHDR                6               /* program header */
+#define PT_NUM         7               /* Number of segment types */
+#define PT_LOPROC      0x70000000      /* reserved range for processor */
+#define PT_HIPROC      0x7fffffff      /*  specific segment types */
+
+/* Segment flags - p_flags */
+#define PF_X           0x1             /* Executable */
+#define PF_W           0x2             /* Writable */
+#define PF_R           0x4             /* Readable */
+#define PF_MASKPROC    0xf0000000      /* reserved bits for processor */
+                                       /*  specific segment flags */
+
+/* Dynamic structure */
+typedef struct {
+       Elf32_Sword     d_tag;          /* controls meaning of d_val */
+       union {
+               Elf32_Word      d_val;  /* Multiple meanings - see d_tag */
+               Elf32_Addr      d_ptr;  /* program virtual address */
+       } d_un;
+} Elf32_Dyn;
+
+typedef struct {
+       Elf64_Xword     d_tag;          /* controls meaning of d_val */
+       union {
+               Elf64_Addr      d_ptr;
+               Elf64_Xword     d_val;
+       } d_un;
+} Elf64_Dyn;
+
+/* Dynamic Array Tags - d_tag */
+#define DT_NULL                0               /* marks end of _DYNAMIC array */
+#define DT_NEEDED      1               /* string table offset of needed lib */
+#define DT_PLTRELSZ    2               /* size of relocation entries in PLT */
+#define DT_PLTGOT      3               /* address PLT/GOT */
+#define DT_HASH                4               /* address of symbol hash table */
+#define DT_STRTAB      5               /* address of string table */
+#define DT_SYMTAB      6               /* address of symbol table */
+#define DT_RELA                7               /* address of relocation table */
+#define DT_RELASZ      8               /* size of relocation table */
+#define DT_RELAENT     9               /* size of relocation entry */
+#define DT_STRSZ       10              /* size of string table */
+#define DT_SYMENT      11              /* size of symbol table entry */
+#define DT_INIT                12              /* address of initialization func. */
+#define DT_FINI                13              /* address of termination function */
+#define DT_SONAME      14              /* string table offset of shared obj */
+#define DT_RPATH       15              /* string table offset of library
+                                          search path */
+#define DT_SYMBOLIC    16              /* start sym search in shared obj. */
+#define DT_REL         17              /* address of rel. tbl. w addends */
+#define DT_RELSZ       18              /* size of DT_REL relocation table */
+#define DT_RELENT      19              /* size of DT_REL relocation entry */
+#define DT_PLTREL      20              /* PLT referenced relocation entry */
+#define DT_DEBUG       21              /* bugger */
+#define DT_TEXTREL     22              /* Allow rel. mod. to unwritable seg */
+#define DT_JMPREL      23              /* add. of PLT's relocation entries */
+#define DT_BIND_NOW    24              /* Bind now regardless of env setting */
+#define DT_NUM         25              /* Number used. */
+#define DT_LOPROC      0x70000000      /* reserved range for processor */
+#define DT_HIPROC      0x7fffffff      /*  specific dynamic array tags */
+
+/* Standard ELF hashing function */
+unsigned int elf_hash(const unsigned char *name);
+
+/*
+ * Note Definitions
+ */
+typedef struct {
+       Elf32_Word namesz;
+       Elf32_Word descsz;
+       Elf32_Word type;
+} Elf32_Note;
+
+typedef struct {
+       Elf64_Half namesz;
+       Elf64_Half descsz;
+       Elf64_Half type;
+} Elf64_Note;
+
+
+#if defined(ELFSIZE)
+#define CONCAT(x,y)    __CONCAT(x,y)
+#define ELFNAME(x)     CONCAT(elf,CONCAT(ELFSIZE,CONCAT(_,x)))
+#define ELFNAME2(x,y)  CONCAT(x,CONCAT(_elf,CONCAT(ELFSIZE,CONCAT(_,y))))
+#define ELFNAMEEND(x)  CONCAT(x,CONCAT(_elf,ELFSIZE))
+#define ELFDEFNNAME(x) CONCAT(ELF,CONCAT(ELFSIZE,CONCAT(_,x)))
+#endif
+
+#if defined(ELFSIZE) && (ELFSIZE == 32)
+#define Elf_Ehdr       Elf32_Ehdr
+#define Elf_Phdr       Elf32_Phdr
+#define Elf_Shdr       Elf32_Shdr
+#define Elf_Sym                Elf32_Sym
+#define Elf_Rel                Elf32_Rel
+#define Elf_RelA       Elf32_Rela
+#define Elf_Dyn                Elf32_Dyn
+#define Elf_Word       Elf32_Word
+#define Elf_Sword      Elf32_Sword
+#define Elf_Addr       Elf32_Addr
+#define Elf_Off                Elf32_Off
+#define Elf_Nhdr       Elf32_Nhdr
+#define Elf_Note       Elf32_Note
+
+#define ELF_R_SYM      ELF32_R_SYM
+#define ELF_R_TYPE     ELF32_R_TYPE
+#define ELF_R_INFO     ELF32_R_INFO
+#define ELFCLASS       ELFCLASS32
+
+#define ELF_ST_BIND    ELF32_ST_BIND
+#define ELF_ST_TYPE    ELF32_ST_TYPE
+#define ELF_ST_INFO    ELF32_ST_INFO
+
+#define AuxInfo                Aux32Info
+#elif defined(ELFSIZE) && (ELFSIZE == 64)
+#define Elf_Ehdr       Elf64_Ehdr
+#define Elf_Phdr       Elf64_Phdr
+#define Elf_Shdr       Elf64_Shdr
+#define Elf_Sym                Elf64_Sym
+#define Elf_Rel                Elf64_Rel
+#define Elf_RelA       Elf64_Rela
+#define Elf_Dyn                Elf64_Dyn
+#define Elf_Word       Elf64_Word
+#define Elf_Sword      Elf64_Sword
+#define Elf_Addr       Elf64_Addr
+#define Elf_Off                Elf64_Off
+#define Elf_Nhdr       Elf64_Nhdr
+#define Elf_Note       Elf64_Note
+
+#define ELF_R_SYM      ELF64_R_SYM
+#define ELF_R_TYPE     ELF64_R_TYPE
+#define ELF_R_INFO     ELF64_R_INFO
+#define ELFCLASS       ELFCLASS64
+
+#define ELF_ST_BIND    ELF64_ST_BIND
+#define ELF_ST_TYPE    ELF64_ST_TYPE
+#define ELF_ST_INFO    ELF64_ST_INFO
+
+#define AuxInfo                Aux64Info
+#endif
+
+#endif /* __XEN_ELFSTRUCTS_H__ */
index ed3fd0fd20675bee41a4a6bef390b18dc0a341da..b366e582a8eba95a028f87138ef1deb12001f8f5 100644 (file)
@@ -44,6 +44,9 @@ int evtchn_send(struct domain *d, unsigned int lport);
 /* Bind a local event-channel port to the specified VCPU. */
 long evtchn_bind_vcpu(unsigned int port, unsigned int vcpu_id);
 
+/* Unmask a local event-channel port. */
+int evtchn_unmask(unsigned int port);
+
 /* Allocate/free a Xen-attached event channel port. */
 int alloc_unbound_xen_event_channel(
     struct vcpu *local_vcpu, domid_t remote_domid);
index 0164668e2cc1f3f85316677e6c1e901258ee9edc..d0e8040a19063384aa721c5e5ccf42e881122a3e 100644 (file)
@@ -32,6 +32,7 @@
 struct active_grant_entry {
     u32           pin;    /* Reference count information.  */
     domid_t       domid;  /* Domain being granted access.  */
+    unsigned long gfn;    /* Guest's idea of the frame being granted. */
     unsigned long frame;  /* Frame being granted.          */
 };
 
index 2a66956c773ff4fe523414731f87563943163dea..8ed453004434fb3195c66a24ca15fd43b98860a7 100644 (file)
@@ -40,7 +40,7 @@ struct hvm_iommu {
     /* amd iommu support */
     int domain_id;
     int paging_mode;
-    void *root_table;
+    struct page_info *root_table;
     bool_t p2m_synchronized;
 
     /* iommu_ops */
diff --git a/xen/include/xen/hvm/irq.h b/xen/include/xen/hvm/irq.h
new file mode 100644 (file)
index 0000000..1a879d6
--- /dev/null
@@ -0,0 +1,108 @@
+/******************************************************************************
+ * irq.h
+ * 
+ * Interrupt distribution and delivery logic.
+ * 
+ * Copyright (c) 2006, K A Fraser, XenSource Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#ifndef __XEN_HVM_IRQ_H__
+#define __XEN_HVM_IRQ_H__
+
+#include <xen/types.h>
+#include <xen/spinlock.h>
+#include <asm/irq.h>
+#include <public/hvm/save.h>
+
+struct dev_intx_gsi_link {
+    struct list_head list;
+    uint8_t device;
+    uint8_t intx;
+    uint8_t gsi;
+    uint8_t link;
+};
+
+#define _HVM_IRQ_DPCI_MACH_PCI_SHIFT            0
+#define _HVM_IRQ_DPCI_MACH_MSI_SHIFT            1
+#define _HVM_IRQ_DPCI_GUEST_PCI_SHIFT           4
+#define _HVM_IRQ_DPCI_GUEST_MSI_SHIFT           5
+#define _HVM_IRQ_DPCI_TRANSLATE_SHIFT          15
+#define HVM_IRQ_DPCI_MACH_PCI        (1 << _HVM_IRQ_DPCI_MACH_PCI_SHIFT)
+#define HVM_IRQ_DPCI_MACH_MSI        (1 << _HVM_IRQ_DPCI_MACH_MSI_SHIFT)
+#define HVM_IRQ_DPCI_GUEST_PCI       (1 << _HVM_IRQ_DPCI_GUEST_PCI_SHIFT)
+#define HVM_IRQ_DPCI_GUEST_MSI       (1 << _HVM_IRQ_DPCI_GUEST_MSI_SHIFT)
+#define HVM_IRQ_DPCI_TRANSLATE       (1 << _HVM_IRQ_DPCI_TRANSLATE_SHIFT)
+
+struct hvm_gmsi_info {
+    uint32_t gvec;
+    uint32_t gflags;
+};
+
+struct hvm_mirq_dpci_mapping {
+    uint32_t flags;
+    int pending;
+    struct list_head digl_list;
+    struct domain *dom;
+    struct hvm_gmsi_info gmsi;
+};
+
+struct hvm_girq_dpci_mapping {
+    struct list_head list;
+    uint8_t device;
+    uint8_t intx;
+    uint8_t machine_gsi;
+};
+
+#define NR_ISAIRQS  16
+#define NR_LINK     4
+
+/* Protected by domain's event_lock */
+struct hvm_irq_dpci {
+    /* Machine IRQ to guest device/intx mapping. */
+    DECLARE_BITMAP(mapping, NR_IRQS);
+    struct hvm_mirq_dpci_mapping mirq[NR_IRQS];
+    /* Guest IRQ to guest device/intx mapping. */
+    struct list_head girq[NR_IRQS];
+    uint8_t msi_gvec_pirq[NR_VECTORS];
+    DECLARE_BITMAP(dirq_mask, NR_IRQS);
+    /* Record of mapped ISA IRQs */
+    DECLARE_BITMAP(isairq_map, NR_ISAIRQS);
+    /* Record of mapped Links */
+    uint8_t link_cnt[NR_LINK];
+    struct timer hvm_timer[NR_IRQS];
+};
+
+/* Modify state of a PCI INTx wire. */
+void hvm_pci_intx_assert(
+    struct domain *d, unsigned int device, unsigned int intx);
+void hvm_pci_intx_deassert(
+    struct domain *d, unsigned int device, unsigned int intx);
+
+/* Modify state of an ISA device's IRQ wire. */
+void hvm_isa_irq_assert(
+    struct domain *d, unsigned int isa_irq);
+void hvm_isa_irq_deassert(
+    struct domain *d, unsigned int isa_irq);
+
+void hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq);
+
+void hvm_maybe_deassert_evtchn_irq(void);
+void hvm_assert_evtchn_irq(struct vcpu *v);
+void hvm_set_callback_via(struct domain *d, uint64_t via);
+
+void hvm_dirq_assist(struct vcpu *v);
+
+#endif /* __XEN_HVM_IRQ_H__ */
index 723369e2488e1fa5c10b50e58032a3a19b9688c1..49e9113493b1c676d0bb28a027c3f1088bd19d46 100644 (file)
@@ -152,6 +152,8 @@ __initcall(__hvm_register_##_x##_save_and_restore);
 /* Entry points for saving and restoring HVM domain state */
 size_t hvm_save_size(struct domain *d);
 int hvm_save(struct domain *d, hvm_domain_context_t *h);
+int hvm_save_one(struct domain *d,  uint16_t typecode, uint16_t instance, 
+                 XEN_GUEST_HANDLE_64(uint8) handle);
 int hvm_load(struct domain *d, hvm_domain_context_t *h);
 
 /* Arch-specific definitions. */
index 7d58109ec2d4d9294b8323eac173268c04a9e945..99d2e0008af0d9217d8d6157ca010d3a0f6b2491 100644 (file)
@@ -48,7 +48,7 @@ do_platform_op(
  * at what point in the page list to resume. For this purpose I steal the
  * high-order bits of the @cmd parameter, which are otherwise unused and zero.
  */
-#define MEMOP_EXTENT_SHIFT 4 /* cmd[:4] == start_extent */
+#define MEMOP_EXTENT_SHIFT 6 /* cmd[:6] == start_extent */
 #define MEMOP_CMD_MASK     ((1 << MEMOP_EXTENT_SHIFT) - 1)
 
 extern long
@@ -124,6 +124,12 @@ compat_memory_op(
     unsigned int cmd,
     XEN_GUEST_HANDLE(void) arg);
 
+extern int
+compat_vcpu_op(
+    int cmd,
+    int vcpuid,
+    XEN_GUEST_HANDLE(void) arg);
+
 #endif
 
 #endif /* __XEN_HYPERCALL_H__ */
index db461b9dcb0fa97eb4c1478c83bd5472dbfbf45b..c32604d9c9800c6f1c5bef013907958322cee774 100644 (file)
@@ -29,6 +29,7 @@
     rangeset_contains_singleton((d)->irq_caps, i)
 
 #define multipage_allocation_permitted(d)               \
-    (!rangeset_is_empty((d)->iomem_caps))
+    (!rangeset_is_empty((d)->iomem_caps) ||             \
+     !rangeset_is_empty((d)->arch.ioport_caps))
 
 #endif /* __XEN_IOCAP_H__ */
index f230df7b8e9bbf14f2283ada51e9f3cb4bc56005..37fa3c7aa8a552304144fb42f1e65b660d66b580 100644 (file)
@@ -31,6 +31,9 @@ extern int iommu_enabled;
 extern int iommu_pv_enabled;
 extern int force_iommu;
 extern int iommu_passthrough;
+extern int iommu_snoop;
+extern int iommu_qinval;
+extern int iommu_intremap;
 
 #define domain_hvm_iommu(d)     (&d->arch.hvm_domain.hvm_iommu)
 
@@ -62,7 +65,7 @@ int iommu_domain_init(struct domain *d);
 void iommu_domain_destroy(struct domain *d);
 int device_assigned(u8 bus, u8 devfn);
 int assign_device(struct domain *d, u8 bus, u8 devfn);
-void deassign_device(struct domain *d, u8 bus, u8 devfn);
+int deassign_device(struct domain *d, u8 bus, u8 devfn);
 int iommu_get_device_group(struct domain *d, u8 bus, u8 devfn, 
     XEN_GUEST_HANDLE_64(uint32) buf, int max_sdevs);
 int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn);
@@ -110,7 +113,9 @@ struct iommu_ops {
 void iommu_update_ire_from_apic(unsigned int apic, unsigned int reg, unsigned int value);
 void iommu_update_ire_from_msi(struct msi_desc *msi_desc, struct msi_msg *msg);
 
-int iommu_suspend(void);
-int iommu_resume(void);
+void iommu_suspend(void);
+void iommu_resume(void);
+
+void iommu_set_dom0_mapping(struct domain *d);
 
 #endif /* _IOMMU_H_ */
index a4dd3f6333f6ef411e05813e4610ca57f40c7545..36b5b6f8acd47116f7d80d066142fa3a781579c7 100644 (file)
@@ -22,8 +22,14 @@ struct irqaction
 #define IRQ_PENDING    4       /* IRQ pending - replay on enable */
 #define IRQ_REPLAY     8       /* IRQ has been replayed but not acked yet */
 #define IRQ_GUEST       16      /* IRQ is handled by guest OS(es) */
+#define IRQ_GUEST_EOI_PENDING 32 /* IRQ was disabled, pending a guest EOI */
 #define IRQ_PER_CPU     256     /* IRQ is per CPU */
 
+/* Special IRQ numbers. */
+#define AUTO_ASSIGN_IRQ         (-1)
+#define NEVER_ASSIGN_IRQ        (-2)
+#define FREE_TO_ASSIGN_IRQ      (-3)
+
 /*
  * Interrupt controller descriptor. This is all we need
  * to describe about the low-level hardware. 
@@ -61,14 +67,23 @@ typedef struct {
     cpumask_t affinity;
 } __cacheline_aligned irq_desc_t;
 
-extern irq_desc_t irq_desc[NR_IRQS];
+extern irq_desc_t irq_desc[NR_VECTORS];
 
-extern int setup_irq(unsigned int, struct irqaction *);
-extern void free_irq(unsigned int);
-extern int request_irq(unsigned int irq,
+extern int setup_irq_vector(unsigned int, struct irqaction *);
+extern void release_irq_vector(unsigned int);
+extern int request_irq_vector(unsigned int vector,
                void (*handler)(int, void *, struct cpu_user_regs *),
                unsigned long irqflags, const char * devname, void *dev_id);
 
+#define setup_irq(irq, action) \
+    setup_irq_vector(irq_to_vector(irq), action)
+
+#define release_irq(irq) \
+    release_irq_vector(irq_to_vector(irq))
+
+#define request_irq(irq, handler, irqflags, devname, devid) \
+    request_irq_vector(irq_to_vector(irq), handler, irqflags, devname, devid)
+
 extern hw_irq_controller no_irq_type;
 extern void no_action(int cpl, void *dev_id, struct cpu_user_regs *regs);
 
@@ -81,13 +96,16 @@ extern void pirq_guest_unbind(struct domain *d, int irq);
 extern irq_desc_t *domain_spin_lock_irq_desc(
     struct domain *d, int irq, unsigned long *pflags);
 
-static inline void set_native_irq_info(int irq, cpumask_t mask)
+static inline void set_native_irq_info(unsigned int vector, cpumask_t mask)
 {
-    irq_desc[irq].affinity = mask;
+    irq_desc[vector].affinity = mask;
 }
 
+#ifdef irq_to_vector
 static inline void set_irq_info(int irq, cpumask_t mask)
 {
-    set_native_irq_info(irq, mask);
+    set_native_irq_info(irq_to_vector(irq), mask);
 }
+#endif
+
 #endif /* __XEN_IRQ_H__ */
index 9dc3dacac007bb19ba033dae7195bd04d0726a49..d78510e639e61d9f940cf98b18ff852519f788e7 100644 (file)
@@ -33,6 +33,27 @@ crash_xen_info_t *kexec_crash_save_info(void);
 void machine_crash_shutdown(void);
 int machine_kexec_get(xen_kexec_range_t *range);
 
+/* vmcoreinfo stuff */
+#define VMCOREINFO_BYTES           (4096)
+#define VMCOREINFO_NOTE_NAME       "VMCOREINFO_XEN"
+void arch_crash_save_vmcoreinfo(void);
+void vmcoreinfo_append_str(const char *fmt, ...)
+       __attribute__ ((format (printf, 1, 2)));
+#define VMCOREINFO_PAGESIZE(value) \
+       vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
+#define VMCOREINFO_SYMBOL(name) \
+       vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
+#define VMCOREINFO_SYMBOL_ALIAS(alias, name) \
+       vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #alias, (unsigned long)&name)
+#define VMCOREINFO_STRUCT_SIZE(name) \
+       vmcoreinfo_append_str("SIZE(%s)=%zu\n", #name, sizeof(struct name))
+#define VMCOREINFO_OFFSET(name, field) \
+       vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
+                             (unsigned long)offsetof(struct name, field))
+#define VMCOREINFO_OFFSET_ALIAS(name, field, alias) \
+       vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #alias, \
+                             (unsigned long)offsetof(struct name, field))
+
 #endif /* __XEN_KEXEC_H__ */
 
 /*
index d6aa891eb944b178d366c47af23a952c61b65f4d..93fdabb9f117d2736d9ada05c23a5355e2e472bb 100644 (file)
@@ -16,7 +16,7 @@ void __warn(char *file, int line);
 #define WARN_ON(p) do { if (p) WARN(); } while (0)
 
 /* Force a compilation error if condition is true */
-#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)]))
+#define BUILD_BUG_ON(condition) ((void)sizeof(struct { int:-!!(condition); }))
 
 #ifndef assert_failed
 #define assert_failed(p)                                        \
@@ -95,6 +95,7 @@ unsigned long long parse_size_and_unit(const char *s, const char **ps);
 #define TAINT_MACHINE_CHECK             (1<<1)
 #define TAINT_BAD_PAGE                  (1<<2)
 #define TAINT_SYNC_CONSOLE              (1<<3)
+#define TAINT_ERROR_INJECT              (1<<4)
 extern int tainted;
 #define TAINT_STRING_MAX_LEN            20
 extern char *print_tainted(char *str);
diff --git a/xen/include/xen/libelf.h b/xen/include/xen/libelf.h
new file mode 100644 (file)
index 0000000..1c92a73
--- /dev/null
@@ -0,0 +1,271 @@
+/******************************************************************************
+ * libelf.h
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_LIBELF_H__
+#define __XEN_LIBELF_H__
+
+#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__)
+#define XEN_ELF_LITTLE_ENDIAN
+#else
+#error define architectural endianness
+#endif
+
+#undef ELFSIZE
+#include "elfstructs.h"
+#ifdef __XEN__
+#include <public/elfnote.h>
+#include <public/features.h>
+#else
+#include <xen/elfnote.h>
+#include <xen/features.h>
+#endif
+
+/* ------------------------------------------------------------------------ */
+
+typedef union {
+    Elf32_Ehdr e32;
+    Elf64_Ehdr e64;
+} elf_ehdr;
+
+typedef union {
+    Elf32_Phdr e32;
+    Elf64_Phdr e64;
+} elf_phdr;
+
+typedef union {
+    Elf32_Shdr e32;
+    Elf64_Shdr e64;
+} elf_shdr;
+
+typedef union {
+    Elf32_Sym e32;
+    Elf64_Sym e64;
+} elf_sym;
+
+typedef union {
+    Elf32_Rel e32;
+    Elf64_Rel e64;
+} elf_rel;
+
+typedef union {
+    Elf32_Rela e32;
+    Elf64_Rela e64;
+} elf_rela;
+
+typedef union {
+    Elf32_Note e32;
+    Elf64_Note e64;
+} elf_note;
+
+struct elf_binary {
+    /* elf binary */
+    const char *image;
+    size_t size;
+    char class;
+    char data;
+
+    const elf_ehdr *ehdr;
+    const char *sec_strtab;
+    const elf_shdr *sym_tab;
+    const char *sym_strtab;
+
+    /* loaded to */
+    char *dest;
+    uint64_t pstart;
+    uint64_t pend;
+    uint64_t reloc_offset;
+
+    uint64_t bsd_symtab_pstart;
+    uint64_t bsd_symtab_pend;
+
+#ifndef __XEN__
+    /* misc */
+    FILE *log;
+#endif
+    int verbose;
+};
+
+/* ------------------------------------------------------------------------ */
+/* accessing elf header fields                                              */
+
+#ifdef XEN_ELF_BIG_ENDIAN
+# define NATIVE_ELFDATA ELFDATA2MSB
+#else
+# define NATIVE_ELFDATA ELFDATA2LSB
+#endif
+
+#define elf_32bit(elf) (ELFCLASS32 == (elf)->class)
+#define elf_64bit(elf) (ELFCLASS64 == (elf)->class)
+#define elf_msb(elf)   (ELFDATA2MSB == (elf)->data)
+#define elf_lsb(elf)   (ELFDATA2LSB == (elf)->data)
+#define elf_swap(elf)  (NATIVE_ELFDATA != (elf)->data)
+
+#define elf_uval(elf, str, elem)                                        \
+    ((ELFCLASS64 == (elf)->class)                                       \
+     ? elf_access_unsigned((elf), (str),                                \
+                           offsetof(typeof(*(str)),e64.elem),           \
+                           sizeof((str)->e64.elem))                     \
+     : elf_access_unsigned((elf), (str),                                \
+                           offsetof(typeof(*(str)),e32.elem),           \
+                           sizeof((str)->e32.elem)))
+
+#define elf_sval(elf, str, elem)                                        \
+    ((ELFCLASS64 == (elf)->class)                                       \
+     ? elf_access_signed((elf), (str),                                  \
+                         offsetof(typeof(*(str)),e64.elem),             \
+                         sizeof((str)->e64.elem))                       \
+     : elf_access_signed((elf), (str),                                  \
+                         offsetof(typeof(*(str)),e32.elem),             \
+                         sizeof((str)->e32.elem)))
+
+#define elf_size(elf, str)                              \
+    ((ELFCLASS64 == (elf)->class)                       \
+     ? sizeof((str)->e64) : sizeof((str)->e32))
+
+uint64_t elf_access_unsigned(struct elf_binary *elf, const void *ptr,
+                             uint64_t offset, size_t size);
+int64_t elf_access_signed(struct elf_binary *elf, const void *ptr,
+                          uint64_t offset, size_t size);
+
+uint64_t elf_round_up(struct elf_binary *elf, uint64_t addr);
+
+/* ------------------------------------------------------------------------ */
+/* xc_libelf_tools.c                                                        */
+
+int elf_shdr_count(struct elf_binary *elf);
+int elf_phdr_count(struct elf_binary *elf);
+
+const elf_shdr *elf_shdr_by_name(struct elf_binary *elf, const char *name);
+const elf_shdr *elf_shdr_by_index(struct elf_binary *elf, int index);
+const elf_phdr *elf_phdr_by_index(struct elf_binary *elf, int index);
+
+const char *elf_section_name(struct elf_binary *elf, const elf_shdr * shdr);
+const void *elf_section_start(struct elf_binary *elf, const elf_shdr * shdr);
+const void *elf_section_end(struct elf_binary *elf, const elf_shdr * shdr);
+
+const void *elf_segment_start(struct elf_binary *elf, const elf_phdr * phdr);
+const void *elf_segment_end(struct elf_binary *elf, const elf_phdr * phdr);
+
+const elf_sym *elf_sym_by_name(struct elf_binary *elf, const char *symbol);
+const elf_sym *elf_sym_by_index(struct elf_binary *elf, int index);
+
+const char *elf_note_name(struct elf_binary *elf, const elf_note * note);
+const void *elf_note_desc(struct elf_binary *elf, const elf_note * note);
+uint64_t elf_note_numeric(struct elf_binary *elf, const elf_note * note);
+const elf_note *elf_note_next(struct elf_binary *elf, const elf_note * note);
+
+int elf_is_elfbinary(const void *image);
+int elf_phdr_is_loadable(struct elf_binary *elf, const elf_phdr * phdr);
+
+/* ------------------------------------------------------------------------ */
+/* xc_libelf_loader.c                                                       */
+
+int elf_init(struct elf_binary *elf, const char *image, size_t size);
+#ifdef __XEN__
+void elf_set_verbose(struct elf_binary *elf);
+#else
+void elf_set_logfile(struct elf_binary *elf, FILE * log, int verbose);
+#endif
+
+void elf_parse_binary(struct elf_binary *elf);
+void elf_load_binary(struct elf_binary *elf);
+
+void *elf_get_ptr(struct elf_binary *elf, unsigned long addr);
+uint64_t elf_lookup_addr(struct elf_binary *elf, const char *symbol);
+
+void elf_parse_bsdsyms(struct elf_binary *elf, uint64_t pstart); /* private */
+
+/* ------------------------------------------------------------------------ */
+/* xc_libelf_relocate.c                                                     */
+
+int elf_reloc(struct elf_binary *elf);
+
+/* ------------------------------------------------------------------------ */
+/* xc_libelf_dominfo.c                                                      */
+
+#define UNSET_ADDR          ((uint64_t)-1)
+
+enum xen_elfnote_type {
+    XEN_ENT_NONE = 0,
+    XEN_ENT_LONG = 1,
+    XEN_ENT_STR  = 2
+};
+
+struct xen_elfnote {
+    enum xen_elfnote_type type;
+    const char *name;
+    union {
+        const char *str;
+        uint64_t num;
+    } data;
+};
+
+struct elf_dom_parms {
+    /* raw */
+    const char *guest_info;
+    const void *elf_note_start;
+    const void *elf_note_end;
+    struct xen_elfnote elf_notes[XEN_ELFNOTE_MAX + 1];
+
+    /* parsed */
+    char guest_os[16];
+    char guest_ver[16];
+    char xen_ver[16];
+    char loader[16];
+    int pae;
+    int bsd_symtab;
+    uint64_t virt_base;
+    uint64_t virt_entry;
+    uint64_t virt_hypercall;
+    uint64_t virt_hv_start_low;
+    uint64_t p2m_base;
+    uint64_t elf_paddr_offset;
+    uint32_t f_supported[XENFEAT_NR_SUBMAPS];
+    uint32_t f_required[XENFEAT_NR_SUBMAPS];
+
+    /* calculated */
+    uint64_t virt_offset;
+    uint64_t virt_kstart;
+    uint64_t virt_kend;
+};
+
+static inline void elf_xen_feature_set(int nr, uint32_t * addr)
+{
+    addr[nr >> 5] |= 1 << (nr & 31);
+}
+static inline int elf_xen_feature_get(int nr, uint32_t * addr)
+{
+    return !!(addr[nr >> 5] & (1 << (nr & 31)));
+}
+
+int elf_xen_parse_features(const char *features,
+                           uint32_t *supported,
+                           uint32_t *required);
+int elf_xen_parse_note(struct elf_binary *elf,
+                       struct elf_dom_parms *parms,
+                       const elf_note *note);
+int elf_xen_parse_guest_info(struct elf_binary *elf,
+                             struct elf_dom_parms *parms);
+int elf_xen_parse(struct elf_binary *elf,
+                  struct elf_dom_parms *parms);
+
+#endif /* __XEN_LIBELF_H__ */
index 08bd72d8ce2b511b8bad39d8f0555f0b0228a583..82340f3ae46826c42f7e174e7502bb953dbdfbdc 100644 (file)
@@ -45,9 +45,9 @@ void end_boot_allocator(void);
 
 /* Xen suballocator. These functions are interrupt-safe. */
 void init_xenheap_pages(paddr_t ps, paddr_t pe);
-void *alloc_xenheap_pages(unsigned int order);
+void *alloc_xenheap_pages(unsigned int order, unsigned int memflags);
 void free_xenheap_pages(void *v, unsigned int order);
-#define alloc_xenheap_page() (alloc_xenheap_pages(0))
+#define alloc_xenheap_page() (alloc_xenheap_pages(0,0))
 #define free_xenheap_page(v) (free_xenheap_pages(v,0))
 
 /* Domain suballocator. These functions are *not* interrupt-safe.*/
@@ -60,6 +60,9 @@ unsigned long avail_domheap_pages_region(
 unsigned long avail_domheap_pages(void);
 #define alloc_domheap_page(d,f) (alloc_domheap_pages(d,0,f))
 #define free_domheap_page(p)  (free_domheap_pages(p,0))
+unsigned int online_page(unsigned long mfn, uint32_t *status);
+int offline_page(unsigned long mfn, int broken, uint32_t *status);
+int query_page_offline(unsigned long mfn, uint32_t *status);
 
 void scrub_heap_pages(void);
 
@@ -72,6 +75,8 @@ int assign_pages(
 /* memflags: */
 #define _MEMF_no_refcount 0
 #define  MEMF_no_refcount (1U<<_MEMF_no_refcount)
+#define _MEMF_populate_on_demand 1
+#define  MEMF_populate_on_demand (1U<<_MEMF_populate_on_demand)
 #define _MEMF_node        8
 #define  MEMF_node(n)     ((((n)+1)&0xff)<<_MEMF_node)
 #define _MEMF_bits        24
@@ -83,26 +88,200 @@ int assign_pages(
 #define MAX_ORDER 20 /* 2^20 contiguous pages */
 #endif
 
+#define page_list_entry list_head
+
+#include <asm/mm.h>
+
+#ifndef page_list_entry
+struct page_list_head
+{
+    struct page_info *next, *tail;
+};
+/* These must only have instances in struct page_info. */
+# define page_list_entry
+
+# define PAGE_LIST_HEAD_INIT(name) { NULL, NULL }
+# define PAGE_LIST_HEAD(name) \
+    struct page_list_head name = PAGE_LIST_HEAD_INIT(name)
+# define INIT_PAGE_LIST_HEAD(head) ((head)->tail = (head)->next = NULL)
+# define INIT_PAGE_LIST_ENTRY(ent) ((ent)->prev = (ent)->next = ~0)
+
+static inline int
+page_list_empty(const struct page_list_head *head)
+{
+    return !head->next;
+}
+static inline struct page_info *
+page_list_first(const struct page_list_head *head)
+{
+    return head->next;
+}
+static inline struct page_info *
+page_list_next(const struct page_info *page,
+               const struct page_list_head *head)
+{
+    return page != head->tail ? mfn_to_page(page->list.next) : NULL;
+}
+static inline struct page_info *
+page_list_prev(const struct page_info *page,
+               const struct page_list_head *head)
+{
+    return page != head->next ? mfn_to_page(page->list.prev) : NULL;
+}
+static inline void
+page_list_add(struct page_info *page, struct page_list_head *head)
+{
+    if ( head->next )
+    {
+        page->list.next = page_to_mfn(head->next);
+        head->next->list.prev = page_to_mfn(page);
+    }
+    else
+    {
+        head->tail = page;
+        page->list.next = ~0;
+    }
+    page->list.prev = ~0;
+    head->next = page;
+}
+static inline void
+page_list_add_tail(struct page_info *page, struct page_list_head *head)
+{
+    page->list.next = ~0;
+    if ( head->next )
+    {
+        page->list.prev = page_to_mfn(head->tail);
+        head->tail->list.next = page_to_mfn(page);
+    }
+    else
+    {
+        page->list.prev = ~0;
+        head->next = page;
+    }
+    head->tail = page;
+}
+static inline bool_t
+__page_list_del_head(struct page_info *page, struct page_list_head *head,
+                     struct page_info *next, struct page_info *prev)
+{
+    if ( head->next == page )
+    {
+        if ( head->tail != page )
+        {
+            next->list.prev = ~0;
+            head->next = next;
+        }
+        else
+            head->tail = head->next = NULL;
+        return 1;
+    }
+
+    if ( head->tail == page )
+    {
+        prev->list.next = ~0;
+        head->tail = prev;
+        return 1;
+    }
+
+    return 0;
+}
+static inline void
+page_list_del(struct page_info *page, struct page_list_head *head)
+{
+    struct page_info *next = mfn_to_page(page->list.next);
+    struct page_info *prev = mfn_to_page(page->list.prev);
+
+    if ( !__page_list_del_head(page, head, next, prev) )
+    {
+        next->list.prev = page->list.prev;
+        prev->list.next = page->list.next;
+    }
+}
+static inline void
+page_list_del2(struct page_info *page, struct page_list_head *head1,
+               struct page_list_head *head2)
+{
+    struct page_info *next = mfn_to_page(page->list.next);
+    struct page_info *prev = mfn_to_page(page->list.prev);
+
+    if ( !__page_list_del_head(page, head1, next, prev) &&
+         !__page_list_del_head(page, head2, next, prev) )
+    {
+        next->list.prev = page->list.prev;
+        prev->list.next = page->list.next;
+    }
+}
+static inline struct page_info *
+page_list_remove_head(struct page_list_head *head)
+{
+    struct page_info *page = head->next;
+
+    if ( page )
+        page_list_del(page, head);
+
+    return page;
+}
+
+#define page_list_for_each(pos, head) \
+    for ( pos = (head)->next; pos; pos = page_list_next(pos, head) )
+#define page_list_for_each_safe(pos, tmp, head) \
+    for ( pos = (head)->next; \
+          pos ? (tmp = page_list_next(pos, head), 1) : 0; \
+          pos = tmp )
+#define page_list_for_each_safe_reverse(pos, tmp, head) \
+    for ( pos = (head)->tail; \
+          pos ? (tmp = page_list_prev(pos, head), 1) : 0; \
+          pos = tmp )
+#else
+# define page_list_head                  list_head
+# define PAGE_LIST_HEAD_INIT             LIST_HEAD_INIT
+# define PAGE_LIST_HEAD                  LIST_HEAD
+# define INIT_PAGE_LIST_HEAD             INIT_LIST_HEAD
+# define INIT_PAGE_LIST_ENTRY            INIT_LIST_HEAD
+# define page_list_empty                 list_empty
+# define page_list_first(hd)             list_entry((hd)->next, \
+                                                    struct page_info, list)
+# define page_list_next(pg, hd)          list_entry((pg)->list.next, \
+                                                    struct page_info, list)
+# define page_list_add(pg, hd)           list_add(&(pg)->list, hd)
+# define page_list_add_tail(pg, hd)      list_add_tail(&(pg)->list, hd)
+# define page_list_del(pg, hd)           list_del(&(pg)->list)
+# define page_list_del2(pg, hd1, hd2)    list_del(&(pg)->list)
+# define page_list_remove_head(hd)       (!page_list_empty(hd) ? \
+    ({ \
+        struct page_info *__pg = page_list_first(hd); \
+        list_del(&__pg->list); \
+        __pg; \
+    }) : NULL)
+# define page_list_for_each(pos, head)   list_for_each_entry(pos, head, list)
+# define page_list_for_each_safe(pos, tmp, head) \
+    list_for_each_entry_safe(pos, tmp, head, list)
+# define page_list_for_each_safe_reverse(pos, tmp, head) \
+    list_for_each_entry_safe_reverse(pos, tmp, head, list)
+#endif
+
 /* Automatic page scrubbing for dead domains. */
-extern struct list_head page_scrub_list;
-#define page_scrub_schedule_work()              \
-    do {                                        \
-        if ( !list_empty(&page_scrub_list) )    \
-            raise_softirq(PAGE_SCRUB_SOFTIRQ);  \
+extern struct page_list_head page_scrub_list;
+#define page_scrub_schedule_work()                 \
+    do {                                           \
+        if ( !page_list_empty(&page_scrub_list) )  \
+            raise_softirq(PAGE_SCRUB_SOFTIRQ);     \
     } while ( 0 )
 #define page_scrub_kick()                                               \
     do {                                                                \
-        if ( !list_empty(&page_scrub_list) )                            \
+        if ( !page_list_empty(&page_scrub_list) )                       \
             cpumask_raise_softirq(cpu_online_map, PAGE_SCRUB_SOFTIRQ);  \
     } while ( 0 )
 unsigned long avail_scrub_pages(void);
 
-#include <asm/mm.h>
-
 int guest_remove_page(struct domain *d, unsigned long gmfn);
 
-/* Returns TRUE if the memory at address @p is ordinary RAM. */
-int memory_is_conventional_ram(paddr_t p);
+#define RAM_TYPE_CONVENTIONAL 0x00000001
+#define RAM_TYPE_RESERVED     0x00000002
+#define RAM_TYPE_UNUSABLE     0x00000004
+#define RAM_TYPE_ACPI         0x00000008
+/* Returns TRUE if the whole page at @mfn is of the requested RAM type(s) above. */
+int page_is_ram_type(unsigned long mfn, unsigned long mem_type);
 
 extern unsigned long *alloc_bitmap;    /* for vmcoreinfo */
 
index ff38dce11af8224ed678c3572efdfe74de750961..6827e0d6772d25af144d0504a6055c18df979a5c 100644 (file)
 #define PCI_BDF(b,d,f)  ((((b) & 0xff) << 8) | PCI_DEVFN(d,f))
 #define PCI_BDF2(b,df)  ((((b) & 0xff) << 8) | ((df) & 0xff))
 
+#define MAX_MSIX_TABLE_ENTRIES  2048
+#define MAX_MSIX_TABLE_PAGES    8
+struct pci_dev_info {
+    unsigned is_extfn;
+    unsigned is_virtfn;
+    struct {
+        u8 bus;
+        u8 devfn;
+    } physfn;
+};
+
 struct pci_dev {
     struct list_head alldevs_list;
     struct list_head domain_list;
+
     struct list_head msi_list;
+    int msix_table_refcnt[MAX_MSIX_TABLE_PAGES];
+    int msix_table_idx[MAX_MSIX_TABLE_PAGES];
+    spinlock_t msix_table_lock;
+
     struct domain *domain;
     const u8 bus;
     const u8 devfn;
-    spinlock_t lock;
+    struct pci_dev_info info;
 };
 
 #define for_each_pdev(domain, pdev) \
     list_for_each_entry(pdev, &(domain->arch.pdev_list), domain_list)
 
 /*
- * The pcidevs_lock write-lock must be held when doing alloc_pdev() or
- * free_pdev().  Never de-reference pdev without holding pdev->lock or
- * pcidevs_lock.  Always aquire pcidevs_lock before pdev->lock when
- * doing free_pdev().
+ * The pcidevs_lock protects alldevs_list and the assignment of
+ * devices; it also synchronizes accesses to the MSI capability that
+ * are not interrupt-handling related (the mask bit register).
  */
 
-extern rwlock_t pcidevs_lock;
+extern spinlock_t pcidevs_lock;
 
 struct pci_dev *alloc_pdev(u8 bus, u8 devfn);
 void free_pdev(struct pci_dev *pdev);
@@ -59,6 +74,9 @@ struct pci_dev *pci_lock_domain_pdev(struct domain *d, int bus, int devfn);
 void pci_release_devices(struct domain *d);
 int pci_add_device(u8 bus, u8 devfn);
 int pci_remove_device(u8 bus, u8 devfn);
+int pci_add_device_ext(u8 bus, u8 devfn, struct pci_dev_info *info);
+struct pci_dev *pci_get_pdev(int bus, int devfn);
+struct pci_dev *pci_get_pdev_by_domain(struct domain *d, int bus, int devfn);
 
 uint8_t pci_conf_read8(
     unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg);
@@ -78,4 +96,7 @@ void pci_conf_write32(
 int pci_find_cap_offset(u8 bus, u8 dev, u8 func, u8 cap);
 int pci_find_next_cap(u8 bus, unsigned int devfn, u8 pos, int cap);
 
+int msixtbl_pt_register(struct domain *d, int pirq, uint64_t gtable);
+void msixtbl_pt_unregister(struct domain *d, int pirq);
+
 #endif /* __XEN_PCI_H__ */
index c1914a8b94a975fa5360722536b49a3746c29028..361554cfb82fbec7447d3817a8767007adab45d7 100644 (file)
 #define  PCI_PM_CAP_PME_D3cold 0x8000  /* PME# from D3 (cold) */
 #define PCI_PM_CTRL            4       /* PM control and status register */
 #define  PCI_PM_CTRL_STATE_MASK        0x0003  /* Current power state (D0 to D3) */
-#define  PCI_PM_CTRL_NO_SOFT_RESET     0x0004  /* No reset for D3hot->D0 */
+#define  PCI_PM_CTRL_NO_SOFT_RESET     0x0008  /* No reset for D3hot->D0 */
 #define  PCI_PM_CTRL_PME_ENABLE        0x0100  /* PME pin enable */
 #define  PCI_PM_CTRL_DATA_SEL_MASK     0x1e00  /* Data select (??) */
 #define  PCI_PM_CTRL_DATA_SCALE_MASK   0x6000  /* Data scale (??) */
index 99a3246b96c10d97d517dc448327e0c23ab98546..d166e2269b15a72037e1f2063497eebd37525c72 100644 (file)
@@ -16,6 +16,41 @@ PERFCOUNTER(sched_irq,              "sched: timer")
 PERFCOUNTER(sched_run,              "sched: runs through scheduler")
 PERFCOUNTER(sched_ctx,              "sched: context switches")
 
+PERFCOUNTER(vcpu_check,             "csched: vcpu_check")
+PERFCOUNTER(schedule,               "csched: schedule")
+PERFCOUNTER(acct_run,               "csched: acct_run")
+PERFCOUNTER(acct_no_work,           "csched: acct_no_work")
+PERFCOUNTER(acct_balance,           "csched: acct_balance")
+PERFCOUNTER(acct_reorder,           "csched: acct_reorder")
+PERFCOUNTER(acct_min_credit,        "csched: acct_min_credit")
+PERFCOUNTER(acct_vcpu_active,       "csched: acct_vcpu_active")
+PERFCOUNTER(acct_vcpu_idle,         "csched: acct_vcpu_idle")
+PERFCOUNTER(vcpu_sleep,             "csched: vcpu_sleep")
+PERFCOUNTER(vcpu_wake_running,      "csched: vcpu_wake_running")
+PERFCOUNTER(vcpu_wake_onrunq,       "csched: vcpu_wake_onrunq")
+PERFCOUNTER(vcpu_wake_runnable,     "csched: vcpu_wake_runnable")
+PERFCOUNTER(vcpu_wake_not_runnable, "csched: vcpu_wake_not_runnable")
+PERFCOUNTER(vcpu_park,              "csched: vcpu_park")
+PERFCOUNTER(vcpu_unpark,            "csched: vcpu_unpark")
+PERFCOUNTER(tickle_local_idler,     "csched: tickle_local_idler")
+PERFCOUNTER(tickle_local_over,      "csched: tickle_local_over")
+PERFCOUNTER(tickle_local_under,     "csched: tickle_local_under")
+PERFCOUNTER(tickle_local_other,     "csched: tickle_local_other")
+PERFCOUNTER(tickle_idlers_none,     "csched: tickle_idlers_none")
+PERFCOUNTER(tickle_idlers_some,     "csched: tickle_idlers_some")
+PERFCOUNTER(load_balance_idle,      "csched: load_balance_idle")
+PERFCOUNTER(load_balance_over,      "csched: load_balance_over")
+PERFCOUNTER(load_balance_other,     "csched: load_balance_other")
+PERFCOUNTER(steal_trylock_failed,   "csched: steal_trylock_failed")
+PERFCOUNTER(steal_peer_idle,        "csched: steal_peer_idle")
+PERFCOUNTER(migrate_queued,         "csched: migrate_queued")
+PERFCOUNTER(migrate_running,        "csched: migrate_running")
+PERFCOUNTER(dom_init,               "csched: dom_init")
+PERFCOUNTER(dom_destroy,            "csched: dom_destroy")
+PERFCOUNTER(vcpu_init,              "csched: vcpu_init")
+PERFCOUNTER(vcpu_destroy,           "csched: vcpu_destroy")
+PERFCOUNTER(vcpu_hot,               "csched: vcpu_hot")
+
 PERFCOUNTER(need_flush_tlb_flush,   "PG_need_flush tlb flushes")
 
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
index 540f1c6b8c31fd289036506fa5845c5b567c86e2..efaec7e9b7fa66902991dd53e707dc2e2c407653 100644 (file)
@@ -19,6 +19,7 @@
 #include <xen/xenoprof.h>
 #include <xen/rcupdate.h>
 #include <xen/irq.h>
+#include <xen/mm.h>
 
 #ifdef CONFIG_COMPAT
 #include <compat/vcpu.h>
@@ -29,12 +30,11 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t);
 extern struct domain *dom0;
 
 #ifndef CONFIG_COMPAT
-#define MAX_EVTCHNS(d)     NR_EVENT_CHANNELS
+#define BITS_PER_EVTCHN_WORD(d) BITS_PER_LONG
 #else
-#define MAX_EVTCHNS(d)     (!IS_COMPAT(d) ? \
-                            NR_EVENT_CHANNELS : \
-                            sizeof(unsigned int) * sizeof(unsigned int) * 64)
+#define BITS_PER_EVTCHN_WORD(d) (has_32bit_shinfo(d) ? 32 : BITS_PER_LONG)
 #endif
+#define MAX_EVTCHNS(d) (BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d))
 #define EVTCHNS_PER_BUCKET 128
 #define NR_EVTCHN_BUCKETS  (NR_EVENT_CHANNELS / EVTCHNS_PER_BUCKET)
 
@@ -102,6 +102,9 @@ struct vcpu
     } runstate_guest; /* guest address */
 #endif
 
+    /* Last time this vCPU was scheduled out. */
+    uint64_t last_run_time;
+
     /* Has the FPU been initialised? */
     bool_t           fpu_initialised;
     /* Has the FPU been used since it was last saved? */
@@ -171,8 +174,8 @@ struct domain
     spinlock_t       domain_lock;
 
     spinlock_t       page_alloc_lock; /* protects all the following fields  */
-    struct list_head page_list;       /* linked list, of size tot_pages     */
-    struct list_head xenpage_list;    /* linked list, of size xenheap_pages */
+    struct page_list_head page_list;  /* linked list, of size tot_pages     */
+    struct page_list_head xenpage_list; /* linked list (size xenheap_pages) */
     unsigned int     tot_pages;       /* number of pages currently possesed */
     unsigned int     max_pages;       /* maximum value for tot_pages        */
     unsigned int     xenheap_pages;   /* # pages allocated from Xen heap    */
@@ -340,20 +343,18 @@ static inline struct domain *get_current_domain(void)
 struct domain *domain_create(
     domid_t domid, unsigned int domcr_flags, ssidref_t ssidref);
  /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */
-#define _DOMCRF_hvm   0
-#define DOMCRF_hvm    (1U<<_DOMCRF_hvm)
+#define _DOMCRF_hvm           0
+#define DOMCRF_hvm            (1U<<_DOMCRF_hvm)
  /* DOMCRF_hap: Create a domain with hardware-assisted paging. */
-#define _DOMCRF_hap   1
-#define DOMCRF_hap    (1U<<_DOMCRF_hap)
+#define _DOMCRF_hap           1
+#define DOMCRF_hap            (1U<<_DOMCRF_hap)
+ /* DOMCRF_s3_integrity: Create a domain with tboot-based memory integrity
+                         protection. */
+#define _DOMCRF_s3_integrity  2
+#define DOMCRF_s3_integrity   (1U<<_DOMCRF_s3_integrity)
  /* DOMCRF_dummy: Create a dummy domain (not scheduled; not on domain list) */
-#define _DOMCRF_dummy 2
-#define DOMCRF_dummy  (1U<<_DOMCRF_dummy)
-
-int construct_dom0(
-    struct domain *d,
-    unsigned long image_start, unsigned long image_len, 
-    unsigned long initrd_start, unsigned long initrd_len,
-    char *cmdline);
+#define _DOMCRF_dummy         3
+#define DOMCRF_dummy          (1U<<_DOMCRF_dummy)
 
 /*
  * rcu_lock_domain_by_id() is more efficient than get_domain_by_id().
@@ -538,20 +539,19 @@ int vcpu_locked_change_affinity(struct vcpu *v, cpumask_t *affinity);
 void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
+uint64_t get_cpu_idle_time(unsigned int cpu);
 
 #define IS_PRIV(_d) ((_d)->is_privileged)
 #define IS_PRIV_FOR(_d, _t) (IS_PRIV(_d) || ((_d)->target && (_d)->target == (_t)))
 
-#ifndef IS_COMPAT
-#define IS_COMPAT(d) 0
-#endif
-
 #define VM_ASSIST(_d,_t) (test_bit((_t), &(_d)->vm_assist))
 
 #define is_hvm_domain(d) ((d)->is_hvm)
 #define is_hvm_vcpu(v)   (is_hvm_domain(v->domain))
 #define need_iommu(d)    ((d)->need_iommu && !(d)->is_hvm)
 
+extern int sched_smt_power_savings;
+
 extern enum cpufreq_controller {
     FREQCTL_none, FREQCTL_dom0_kernel, FREQCTL_xen
 } cpufreq_controller;
index 298211cb2a754bd6680240e89c824dfc41b67e76..7a5a5ab5d721e71ae945a8c2dfebad5700f09bb8 100644 (file)
 
 #include <xen/config.h>
 #include <asm/system.h>
+#include <asm/spinlock.h>
 
-#define spin_lock_irqsave(lock, flags) \
-    do { local_irq_save(flags); spin_lock(lock); } while ( 0 )
-#define spin_lock_irq(lock) \
-    do { local_irq_disable(); spin_lock(lock); } while ( 0 )
+#ifndef NDEBUG
+struct lock_debug {
+    int irq_safe; /* +1: IRQ-safe; 0: not IRQ-safe; -1: don't know yet */
+};
+#define _LOCK_DEBUG { -1 }
+void spin_debug_enable(void);
+void spin_debug_disable(void);
+#else
+struct lock_debug { };
+#define _LOCK_DEBUG { }
+#define spin_debug_enable() ((void)0)
+#define spin_debug_disable() ((void)0)
+#endif
 
-#define read_lock_irqsave(lock, flags) \
-    do { local_irq_save(flags); read_lock(lock); } while ( 0 )
-#define read_lock_irq(lock) \
-    do { local_irq_disable(); read_lock(lock); } while ( 0 )
+typedef struct {
+    raw_spinlock_t raw;
+    u16 recurse_cpu:12;
+    u16 recurse_cnt:4;
+    struct lock_debug debug;
+} spinlock_t;
 
-#define write_lock_irqsave(lock, flags) \
-    do { local_irq_save(flags); write_lock(lock); } while ( 0 )
-#define write_lock_irq(lock) \
-    do { local_irq_disable(); write_lock(lock); } while ( 0 )
 
-#define spin_unlock_irqrestore(lock, flags) \
-    do { spin_unlock(lock); local_irq_restore(flags); } while ( 0 )
-#define spin_unlock_irq(lock) \
-    do { spin_unlock(lock); local_irq_enable(); } while ( 0 )
+#define SPIN_LOCK_UNLOCKED { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, _LOCK_DEBUG }
+#define DEFINE_SPINLOCK(l) spinlock_t l = SPIN_LOCK_UNLOCKED
+#define spin_lock_init(l) (*(l) = (spinlock_t)SPIN_LOCK_UNLOCKED)
 
-#define read_unlock_irqrestore(lock, flags) \
-    do { read_unlock(lock); local_irq_restore(flags); } while ( 0 )
-#define read_unlock_irq(lock) \
-    do { read_unlock(lock); local_irq_enable(); } while ( 0 )
+typedef struct {
+    raw_rwlock_t raw;
+    struct lock_debug debug;
+} rwlock_t;
 
-#define write_unlock_irqrestore(lock, flags) \
-    do { write_unlock(lock); local_irq_restore(flags); } while ( 0 )
-#define write_unlock_irq(lock) \
-    do { write_unlock(lock); local_irq_enable(); } while ( 0 )
+#define RW_LOCK_UNLOCKED { _RAW_RW_LOCK_UNLOCKED, _LOCK_DEBUG }
+#define DEFINE_RWLOCK(l) rwlock_t l = RW_LOCK_UNLOCKED
+#define rwlock_init(l) (*(l) = (rwlock_t)RW_LOCK_UNLOCKED)
 
-#ifdef CONFIG_SMP
+void _spin_lock(spinlock_t *lock);
+void _spin_lock_irq(spinlock_t *lock);
+unsigned long _spin_lock_irqsave(spinlock_t *lock);
 
-#include <asm/spinlock.h>
+void _spin_unlock(spinlock_t *lock);
+void _spin_unlock_irq(spinlock_t *lock);
+void _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags);
 
-#else
+int _spin_is_locked(spinlock_t *lock);
+int _spin_trylock(spinlock_t *lock);
+void _spin_barrier(spinlock_t *lock);
+void _spin_barrier_irq(spinlock_t *lock);
 
-#if (__GNUC__ > 2)
-typedef struct { } spinlock_t;
-#define SPIN_LOCK_UNLOCKED /*(spinlock_t)*/ { }
-#else
-typedef struct { int gcc_is_buggy; } spinlock_t;
-#define SPIN_LOCK_UNLOCKED /*(spinlock_t)*/ { 0 }
-#endif
+void _spin_lock_recursive(spinlock_t *lock);
+void _spin_unlock_recursive(spinlock_t *lock);
 
-#define spin_lock_init(lock)             do { } while(0)
-#define spin_is_locked(lock)             (0)
-#define _raw_spin_lock(lock)             (void)(lock)
-#define _raw_spin_trylock(lock)          ({1; })
-#define _raw_spin_unlock(lock)           do { } while(0)
-#define _raw_spin_lock_recursive(lock)   do { } while(0)
-#define _raw_spin_unlock_recursive(lock) do { } while(0)
-
-#if (__GNUC__ > 2)
-typedef struct { } rwlock_t;
-#define RW_LOCK_UNLOCKED /*(rwlock_t)*/ { }
-#else
-typedef struct { int gcc_is_buggy; } rwlock_t;
-#define RW_LOCK_UNLOCKED /*(rwlock_t)*/ { 0 }
-#endif
+void _read_lock(rwlock_t *lock);
+void _read_lock_irq(rwlock_t *lock);
+unsigned long _read_lock_irqsave(rwlock_t *lock);
 
-#define rwlock_init(lock)            do { } while(0)
-#define _raw_read_lock(lock)         (void)(lock) /* Not "unused variable". */
-#define _raw_read_unlock(lock)       do { } while(0)
-#define _raw_write_lock(lock)        (void)(lock) /* Not "unused variable". */
-#define _raw_write_unlock(lock)      do { } while(0)
+void _read_unlock(rwlock_t *lock);
+void _read_unlock_irq(rwlock_t *lock);
+void _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags);
 
-#endif
+void _write_lock(rwlock_t *lock);
+void _write_lock_irq(rwlock_t *lock);
+unsigned long _write_lock_irqsave(rwlock_t *lock);
+
+void _write_unlock(rwlock_t *lock);
+void _write_unlock_irq(rwlock_t *lock);
+void _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags);
+
+int _rw_is_locked(rwlock_t *lock);
+
+#define spin_lock(l)                  _spin_lock(l)
+#define spin_lock_irq(l)              _spin_lock_irq(l)
+#define spin_lock_irqsave(l, f)       ((f) = _spin_lock_irqsave(l))
+
+#define spin_unlock(l)                _spin_unlock(l)
+#define spin_unlock_irq(l)            _spin_unlock_irq(l)
+#define spin_unlock_irqrestore(l, f)  _spin_unlock_irqrestore(l, f)
 
-#define spin_lock(_lock)             _raw_spin_lock(_lock)
-#define spin_trylock(_lock)          _raw_spin_trylock(_lock)
-#define spin_unlock(_lock)           _raw_spin_unlock(_lock)
-#define spin_lock_recursive(_lock)   _raw_spin_lock_recursive(_lock)
-#define spin_unlock_recursive(_lock) _raw_spin_unlock_recursive(_lock)
-#define read_lock(_lock)             _raw_read_lock(_lock)
-#define read_unlock(_lock)           _raw_read_unlock(_lock)
-#define write_lock(_lock)            _raw_write_lock(_lock)
-#define write_unlock(_lock)          _raw_write_unlock(_lock)
+#define spin_is_locked(l)             _spin_is_locked(l)
+#define spin_trylock(l)               _spin_trylock(l)
 
 /* Ensure a lock is quiescent between two critical operations. */
-static inline void spin_barrier(spinlock_t *lock)
-{
-    do { mb(); } while ( spin_is_locked(lock) );
-    mb();
-}
-
-#define DEFINE_SPINLOCK(x) spinlock_t x = SPIN_LOCK_UNLOCKED
-#define DEFINE_RWLOCK(x) rwlock_t x = RW_LOCK_UNLOCKED
+#define spin_barrier(l)               _spin_barrier(l)
+#define spin_barrier_irq(l)           _spin_barrier_irq(l)
+
+/*
+ * spin_[un]lock_recursive(): Use these forms when the lock can (safely!) be
+ * reentered recursively on the same CPU. All critical regions that may form
+ * part of a recursively-nested set must be protected by these forms. If there
+ * are any critical regions that cannot form part of such a set, they can use
+ * standard spin_[un]lock().
+ */
+#define spin_lock_recursive(l)        _spin_lock_recursive(l)
+#define spin_unlock_recursive(l)      _spin_unlock_recursive(l)
+
+#define read_lock(l)                  _read_lock(l)
+#define read_lock_irq(l)              _read_lock_irq(l)
+#define read_lock_irqsave(l, f)       ((f) = _read_lock_irqsave(l))
+
+#define read_unlock(l)                _read_unlock(l)
+#define read_unlock_irq(l)            _read_unlock_irq(l)
+#define read_unlock_irqrestore(l, f)  _read_unlock_irqrestore(l, f)
+
+#define write_lock(l)                 _write_lock(l)
+#define write_lock_irq(l)             _write_lock_irq(l)
+#define write_lock_irqsave(l, f)      ((f) = _write_lock_irqsave(l))
+
+#define write_unlock(l)               _write_unlock(l)
+#define write_unlock_irq(l)           _write_unlock_irq(l)
+#define write_unlock_irqrestore(l, f) _write_unlock_irqrestore(l, f)
+
+#define rw_is_locked(l)               _rw_is_locked(l)
 
 #endif /* __SPINLOCK_H__ */
index 31204e09a9f81fde5c628afe96fbba64c6a35a96..38ead9275526f039437cc073cce8f3de6652dc74 100644 (file)
@@ -13,7 +13,6 @@
 #include <asm/time.h>
 
 extern int init_xen_time(void);
-extern void cstate_save_tsc(void);
 extern void cstate_restore_tsc(void);
 
 extern unsigned long cpu_khz;
@@ -52,6 +51,7 @@ struct tm gmtime(unsigned long t);
 #define SECONDS(_s)     ((s_time_t)((_s)  * 1000000000ULL))
 #define MILLISECS(_ms)  ((s_time_t)((_ms) * 1000000ULL))
 #define MICROSECS(_us)  ((s_time_t)((_us) * 1000ULL))
+#define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1))
 
 extern void update_vcpu_system_time(struct vcpu *v);
 extern void update_domain_wallclock_time(struct domain *d);
index 6d4c0fc4fb0ad03e256d3fc168fa18dc7614347c..0379d950a3f312f2039bf72f493753f7cf396598 100644 (file)
 struct timer {
     /* System time expiry value (nanoseconds since boot). */
     s_time_t expires;
+    s_time_t expires_end;
 
     /* Position in active-timer data structure. */
     union {
         /* Timer-heap offset. */
         unsigned int heap_offset;
-        /* Overflow linked list. */
+        /* Linked list. */
         struct timer *list_next;
     };
 
@@ -121,6 +122,9 @@ DECLARE_PER_CPU(s_time_t, timer_deadline);
 /* Arch-defined function to reprogram timer hardware for new deadline. */
 extern int reprogram_timer(s_time_t timeout);
 
+/* Calculate the aligned first tick time for a given periodic timer. */
+extern s_time_t align_timer(s_time_t firsttick, uint64_t period);
+
 #endif /* _TIMER_H_ */
 
 /*
index 5616d0867f08dbf83b511f0565ee25fdfec71a75..525dbdedfafc681d557c52149a3263d3f51e4725 100644 (file)
 #define XENOPROF_DOMAIN_PASSIVE    2
 
 #define XENOPROF_IDLE              0
-#define XENOPROF_COUNTERS_RESERVED 1
-#define XENOPROF_READY             2
-#define XENOPROF_PROFILING         3
+#define XENOPROF_INITIALIZED       1
+#define XENOPROF_COUNTERS_RESERVED 2
+#define XENOPROF_READY             3
+#define XENOPROF_PROFILING         4
 
 #ifndef CONFIG_COMPAT
 typedef struct xenoprof_buf xenoprof_buf_t;
index 6f0b662b00377c5738bc458e45aba403b3301923..17f47e91edca29f1e193d8787a0eb2ff72d8f734 100644 (file)
 !      kexec_image                     kexec.h
 !      kexec_range                     kexec.h
 !      add_to_physmap                  memory.h
-!      remove_from_physmap             memory.h
 !      foreign_memory_map              memory.h
 !      memory_exchange                 memory.h
 !      memory_map                      memory.h
 !      memory_reservation              memory.h
-!      translate_gpfn_list             memory.h
+!      pod_target                      memory.h
 !      sched_poll                      sched.h
 ?      sched_remote_shutdown           sched.h
 ?      sched_shutdown                  sched.h
@@ -55,3 +54,7 @@
 !      processor_cx                    platform.h
 !      processor_flags                 platform.h
 !      processor_power                 platform.h
+!      pct_register                    platform.h
+?      processor_px                    platform.h
+!      psd_package                     platform.h
+!      processor_performance           platform.h
index 2f54b22626547dff2cf87c009f89223fdbbfdcce..e5e15d53420bd16373fc3024d17a1488ba63068c 100644 (file)
@@ -96,7 +96,6 @@ struct xsm_operations {
     int (*alloc_security_evtchn) (struct evtchn *chn);
     void (*free_security_evtchn) (struct evtchn *chn);
 
-    int (*translate_gpfn_list) (struct domain *d, unsigned long mfn);
     int (*memory_adjust_reservation) (struct domain *d1, struct domain *d2);
     int (*memory_stat_reservation) (struct domain *d1, struct domain *d2);
     int (*memory_pin_page) (struct domain *d, struct page_info *page);
@@ -143,7 +142,6 @@ struct xsm_operations {
     int (*update_va_mapping) (struct domain *d, struct domain *f, 
                                                             l1_pgentry_t pte);
     int (*add_to_physmap) (struct domain *d1, struct domain *d2);
-    int (*remove_from_physmap) (struct domain *d1, struct domain *d2);
     int (*sendtrigger) (struct domain *d);
     int (*test_assign_device) (uint32_t machine_bdf);
     int (*assign_device) (struct domain *d, uint32_t machine_bdf);
@@ -367,11 +365,6 @@ static inline void xsm_free_security_evtchn (struct evtchn *chn)
     xsm_call(free_security_evtchn(chn));
 }
 
-static inline int xsm_translate_gpfn_list (struct domain *d, unsigned long mfn)
-{
-    return xsm_call(translate_gpfn_list(d, mfn));
-}
-
 static inline int xsm_memory_adjust_reservation (struct domain *d1, struct
                                                                     domain *d2)
 {
@@ -584,11 +577,6 @@ static inline int xsm_add_to_physmap(struct domain *d1, struct domain *d2)
     return xsm_call(add_to_physmap(d1, d2));
 }
 
-static inline int xsm_remove_from_physmap(struct domain *d1, struct domain *d2)
-{
-    return xsm_call(remove_from_physmap(d1, d2));
-}
-
 static inline int xsm_sendtrigger(struct domain *d)
 {
     return xsm_call(sendtrigger(d));
index 76eb88b585c97d0966dec3b4c3efb0c773b871db..bf632d7aeae73c1dc42945bc63addec5ffb06edf 100644 (file)
@@ -81,7 +81,8 @@ static int read_symbol(FILE *in, struct sym_entry *s)
        if (rc != 3) {
                if (rc != EOF) {
                        /* skip line */
-                       fgets(str, 500, in);
+                       if (fgets(str, 500, in) == NULL)
+                               return -1; /* must check fgets result */
                }
                return -1;
        }
index d14172e41aedae604aab9ef9b0bb7a11c315da30..8809828b7f91f9f9e23a9e2391bf38a440921c94 100644 (file)
@@ -180,11 +180,6 @@ static int dummy_grant_query_size (struct domain *d1, struct domain *d2)
     return 0;
 }
 
-static int dummy_translate_gpfn_list (struct domain *d, unsigned long mfn)
-{
-    return 0;
-}
-
 static int dummy_memory_adjust_reservation (struct domain *d1,
                                                             struct domain *d2)
 {
@@ -457,10 +452,6 @@ static int dummy_ext_vcpucontext (struct domain *d, uint32_t cmd)
     return 0;
 }
 
-static int dummy_remove_from_physmap (struct domain *d1, struct domain *d2)
-{
-    return 0;
-}
 #endif
 
 struct xsm_operations dummy_xsm_ops;
@@ -522,7 +513,6 @@ void xsm_fixup_ops (struct xsm_operations *ops)
     set_to_dummy_if_null(ops, alloc_security_evtchn);
     set_to_dummy_if_null(ops, free_security_evtchn);
 
-    set_to_dummy_if_null(ops, translate_gpfn_list);
     set_to_dummy_if_null(ops, memory_adjust_reservation);
     set_to_dummy_if_null(ops, memory_stat_reservation);
     set_to_dummy_if_null(ops, memory_pin_page);
@@ -568,7 +558,6 @@ void xsm_fixup_ops (struct xsm_operations *ops)
     set_to_dummy_if_null(ops, mmu_machphys_update);
     set_to_dummy_if_null(ops, update_va_mapping);
     set_to_dummy_if_null(ops, add_to_physmap);
-    set_to_dummy_if_null(ops, remove_from_physmap);
     set_to_dummy_if_null(ops, sendtrigger);
     set_to_dummy_if_null(ops, test_assign_device);
     set_to_dummy_if_null(ops, assign_device);
index 0771e3f1a8e5d63951e5405a5e397500cfab3fa3..2b996c5d1e23e951acc54645b016ad1b8c8205a9 100644 (file)
@@ -367,20 +367,6 @@ static int get_mfn_sid(unsigned long mfn, u32 *sid)
     return rc;    
 }
 
-static int flask_translate_gpfn_list(struct domain *d, unsigned long mfn)
-{
-    int rc = 0;
-    u32 sid;
-    struct domain_security_struct *dsec;
-    dsec = d->ssid;
-
-    rc = get_mfn_sid(mfn, &sid);
-    if ( rc )
-        return rc;
-
-    return avc_has_perm(dsec->sid, sid, SECCLASS_MMU, MMU__TRANSLATEGP, NULL);
-}
-
 static int flask_memory_adjust_reservation(struct domain *d1, struct domain *d2)
 {
     return domain_has_perm(d1, d2, SECCLASS_MMU, MMU__ADJUST);
@@ -834,6 +820,7 @@ static int flask_hvmcontext(struct domain *d, uint32_t cmd)
             perm = HVM__SETHVMC;
         break;
         case XEN_DOMCTL_gethvmcontext:
+        case XEN_DOMCTL_gethvmcontext_partial:
             perm = HVM__GETHVMC;
         break;
         default:
@@ -1071,11 +1058,6 @@ static int flask_add_to_physmap(struct domain *d1, struct domain *d2)
     return domain_has_perm(d1, d2, SECCLASS_MMU, MMU__PHYSMAP);
 }
 
-static int flask_remove_from_physmap(struct domain *d1, struct domain *d2)
-{
-    return domain_has_perm(d1, d2, SECCLASS_MMU, MMU__PHYSMAP);
-}
-
 static int flask_sendtrigger(struct domain *d)
 {
     return domain_has_perm(current->domain, d, SECCLASS_DOMAIN, DOMAIN__TRIGGER);
@@ -1280,7 +1262,6 @@ static struct xsm_operations flask_ops = {
     .alloc_security_evtchn = flask_alloc_security_evtchn,
     .free_security_evtchn = flask_free_security_evtchn,
 
-    .translate_gpfn_list = flask_translate_gpfn_list,
     .memory_adjust_reservation = flask_memory_adjust_reservation,
     .memory_stat_reservation = flask_memory_stat_reservation,
     .memory_pin_page = flask_memory_pin_page,
@@ -1325,7 +1306,6 @@ static struct xsm_operations flask_ops = {
     .mmu_machphys_update = flask_mmu_machphys_update,
     .update_va_mapping = flask_update_va_mapping,
     .add_to_physmap = flask_add_to_physmap,
-    .remove_from_physmap = flask_remove_from_physmap,
     .sendtrigger = flask_sendtrigger,
     .test_assign_device = flask_test_assign_device,
     .assign_device = flask_assign_device,
index 6e488e05074a27014f57cc4612f1cc56b7199410..42b23b08b7bfb10355832d04ea2cc26e5a219fbd 100644 (file)
@@ -1515,8 +1515,8 @@ int policydb_read(struct policydb *p, void *fp)
     if ( len != strlen(POLICYDB_STRING) )
     {
         printk(KERN_ERR "security:  policydb string length %d does not "
-               "match expected length %Zu\n",
-               len, (u32) strlen(POLICYDB_STRING));
+               "match expected length %lu\n",
+               len, strlen(POLICYDB_STRING));
         goto bad;
     }
     policydb_str = xmalloc_array(char, len + 1);