debuggers.hg

changeset 20658:1f5f36e11114

docs: Example usage of pvrdtscp algorithm

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Dec 11 08:51:21 2009 +0000 (2009-12-11)
parents 1396dfb8d6ba
children 2e5032921b07
files docs/misc/pvrdtscp.c
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/docs/misc/pvrdtscp.c	Fri Dec 11 08:51:21 2009 +0000
     1.3 @@ -0,0 +1,307 @@
     1.4 +/* pvrdtscp algorithm
     1.5 + *
     1.6 + * This sample code demonstrates the use of the paravirtualized rdtscp
     1.7 + * algorithm.  Using this algorithm, an application may communicate with
     1.8 + * the Xen hypervisor (version 4.0+) to obtain timestamp information which
     1.9 + * is both monotonically increasing and has a fixed 1 GHz rate, even across
    1.10 + * migrations between machines with different TSC rates and offsets.
    1.11 + * Further, the algorithm provides performance near the performance of a
    1.12 + * native rdtsc/rdtscp instruction -- much faster than emulation PROVIDED
    1.13 + * the application is running on a machine on which the rdtscp instruction
    1.14 + * is supported and TSC is "safe". The application must also be running in a
    1.15 + * PV domain.  (HVM domains may be supported at a later time.) On machines
    1.16 + * where TSC is unsafe or the rdtscp instruction is not supported, Xen
    1.17 + * (v4.0+) provides emulation which is slower but consistent with the pvrdtscp
    1.18 + * algorithm, thus providing support for the algorithm for live migration
    1.19 + * across all machines.
    1.20 + *
    1.21 + * More information can be found within the Xen (4.0+) source tree at
    1.22 + *  docs/misc/tscmode.txt
    1.23 + *
    1.24 + * Copyright (c) 2009 Oracle Corporation and/or its affiliates.
    1.25 + * All rights reserved
    1.26 + * Written by: Dan Magenheimer <dan.magenheimer@oracle.com>
    1.27 + * 
    1.28 + * This code is derived from code licensed under the GNU
    1.29 + * General Public License ("GPL") version 2 and is therefore itself
    1.30 + * also licensed under the GPL version 2.
    1.31 + *
    1.32 + * This code is known to compile and run on Oracle Enterprise Linux 5 Update 2
    1.33 + * using gcc version 4.1.2, but its purpose is to describe the pvrdtscp
    1.34 + * algorithm and its ABI to Xen version 4.0+ 
    1.35 + */
    1.36 +
    1.37 +#include <stdio.h>
    1.38 +#include <stdlib.h>
    1.39 +#include <string.h>
    1.40 +#include <sys/wait.h>
    1.41 +#include <unistd.h>
    1.42 +#ifdef __LP64__		/* LP64 ABI: long and pointers are 64 bits */
    1.43 +#define __X86_64__
    1.44 +typedef unsigned short u16;
    1.45 +typedef unsigned int u32;
    1.46 +typedef unsigned long u64;
    1.47 +typedef int i32;
    1.48 +typedef long i64;
    1.49 +#define NSEC_PER_SEC 1000000000
    1.50 +#else			/* otherwise assume a 32-bit x86 build (long is 32 bits) */
    1.51 +#define __X86_32__
    1.52 +typedef unsigned short u16;	/* 16 bits wide, same as the LP64 branch */
    1.53 +typedef unsigned long u32;
    1.54 +typedef unsigned long long u64;
    1.55 +typedef long i32;
    1.56 +typedef long long i64;
    1.57 +#define NSEC_PER_SEC 1000000000L
    1.58 +#endif
    1.59 +/* Execute a native CPUID for leaf idx / subleaf sub; results land in *eax..*edx. */
    1.60 +static inline void hvm_cpuid(u32 idx, u32 sub,
    1.61 +				u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
    1.62 +{
    1.63 +	*eax = idx, *ecx = sub;	/* the outputs double as the input values */
    1.64 +	asm("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
    1.65 +	    : "0" (*eax), "2" (*ecx));	/* "0"/"2" tie the inputs to eax/ecx */
    1.66 +}
    1.67 +/* Same contract as hvm_cpuid(), but the cpuid is preceded by ud2a and the bytes "xen". */
    1.68 +static inline void pv_cpuid(u32 idx, u32 sub,
    1.69 +				u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
    1.70 +{
    1.71 +	*eax = idx, *ecx = sub;	/* the outputs double as the input values */
    1.72 +	asm volatile ( "ud2a ; .ascii \"xen\"; cpuid" : "=a" (*eax),	/* ud2a is an invalid opcode unless intercepted */
    1.73 +            "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (*eax), "2" (*ecx));
    1.74 +}
    1.75 +/* Read the time-stamp counter with RDTSCP; returns the 64-bit count and stores ECX in *aux. */
    1.76 +static inline u64 do_rdtscp(u32 *aux)
    1.77 +{
    1.78 +	/* the bytes 0x0f,0x01,0xf9 encode RDTSCP: EDX:EAX = TSC, ECX = aux value */
    1.79 +	u32 lo32, hi32;
    1.80 +	u64 val;
    1.81 +
    1.82 +	asm volatile(".byte 0x0f,0x01,0xf9":"=a"(lo32),"=d"(hi32),"=c" (*aux));
    1.83 +	val = lo32 | ((u64)hi32 << 32);
    1.84 +	return val;
    1.85 +}
    1.86 +/* Return EBX of CPUID leaf 0x40000003, subleaf 0 (the value main() checks as tsc_mode). */
    1.87 +static inline int get_xen_tsc_mode(void)
    1.88 +{
    1.89 +	u32 eax, ebx, ecx, edx;
    1.90 +	pv_cpuid(0x40000003,0,&eax,&ebx,&ecx,&edx);
    1.91 +	return ebx;
    1.92 +}
    1.93 +/* Return bit 0 of EAX from CPUID leaf 0x40000003, subleaf 0 (the vtsc flag). */
    1.94 +static inline int get_xen_vtsc(void)
    1.95 +{
    1.96 +	u32 eax, ebx, ecx, edx;
    1.97 +	pv_cpuid(0x40000003,0,&eax,&ebx,&ecx,&edx);
    1.98 +	return eax & 1;
    1.99 +}
   1.100 +/* Return ECX of CPUID leaf 0x40000003, subleaf 0 (reported as vtsc_khz). */
   1.101 +static inline int get_xen_vtsc_khz(void)
   1.102 +{
   1.103 +	u32 eax, ebx, ecx, edx;
   1.104 +	pv_cpuid(0x40000003,0,&eax,&ebx,&ecx,&edx);
   1.105 +	return ecx;
   1.106 +}
   1.107 +/* Return EAX of CPUID leaf 0x40000003, subleaf 2 (reported as phys cpu_khz). */
   1.108 +static inline u32 get_xen_cpu_khz(void)
   1.109 +{
   1.110 +	u32 eax, ebx, ecx, edx;
   1.111 +	pv_cpuid(0x40000003,2,&eax,&ebx,&ecx,&edx);
   1.112 +	return eax;
   1.113 +}
   1.114 +/* Return EDX of CPUID leaf 0x40000003, subleaf 0 (reported as the incarnation). */
   1.115 +static inline u32 get_xen_incarnation(void)
   1.116 +{
   1.117 +	u32 eax, ebx, ecx, edx;
   1.118 +	pv_cpuid(0x40000003,0,&eax,&ebx,&ecx,&edx);
   1.119 +	return edx;
   1.120 +}
   1.121 +/* Fetch the ns offset, multiplier and shift from CPUID leaf 0x40000003, subleaf 1. */
   1.122 +static inline void get_xen_time_values(u64 *offset, u32 *mul_frac, u32 *shift)
   1.123 +{
   1.124 +	u32 off_lo, off_hi;	/* EAX/EBX: low and high halves of the offset */
   1.125 +
   1.126 +	pv_cpuid(0x40000003,1,&off_lo,&off_hi,mul_frac,shift);	/* ECX -> *mul_frac, EDX -> *shift */
   1.127 +	*offset = off_lo | ((u64)off_hi << 32);
   1.128 +}
   1.129 +/* Shift delta by tsc_shift (right when negative), multiply by tsc_mul_frac, then drop the low 32 bits. */
   1.130 +static inline u64 scale_delta(u64 delta, u32 tsc_mul_frac, i32 tsc_shift)
   1.131 +{
   1.132 +    u64 product;
   1.133 +#ifdef __X86_32__
   1.134 +    u32 tmp1, tmp2;	/* scratch operands for the 32-bit multiply sequence */
   1.135 +#endif
   1.136 +
   1.137 +    if ( tsc_shift < 0 )
   1.138 +        delta >>= -tsc_shift;
   1.139 +    else
   1.140 +        delta <<= tsc_shift;
   1.141 +
   1.142 +#ifdef __X86_32__
   1.143 +    asm (
   1.144 +        "mul  %5       ; "	/* edx:eax = low word of delta * mul_frac */
   1.145 +        "mov  %4,%%eax ; "	/* eax = high word of delta */
   1.146 +        "mov  %%edx,%4 ; "	/* save the upper word of the first product */
   1.147 +        "mul  %5       ; "	/* edx:eax = high word of delta * mul_frac */
   1.148 +        "xor  %5,%5    ; "	/* zero, so the adc below adds only the carry */
   1.149 +        "add  %4,%%eax ; "
   1.150 +        "adc  %5,%%edx ; "
   1.151 +        : "=A" (product), "=r" (tmp1), "=r" (tmp2)
   1.152 +        : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (tsc_mul_frac) );
   1.153 +#else
   1.154 +    asm (
   1.155 +        "mul %%rdx ; shrd $32,%%rdx,%%rax"	/* rdx:rax = delta * mul_frac; rax = that product >> 32 */
   1.156 +        : "=a" (product) : "0" (delta), "d" ((u64)tsc_mul_frac) );
   1.157 +#endif
   1.158 +
   1.159 +    return product;
   1.160 +}
   1.161 +/* Monotonic ns timestamp from rdtscp + Xen scaling values; *discontinuity = 1 if the rdtscp aux value changed. Uses static state. */
   1.162 +static inline u64 get_pvrdtscp_timestamp(int *discontinuity)
   1.163 +{
   1.164 +	static int firsttime = 1;
   1.165 +	static u64 last_pvrdtscp_timestamp = 0;
   1.166 +	static u32 last_tsc_aux;	/* aux value the cached scaling values belong to */
   1.167 +	static u64 xen_ns_offset;
   1.168 +	static u32 xen_tsc_to_ns_mul_frac, xen_tsc_to_ns_shift;
   1.169 +	u32 this_tsc_aux;
   1.170 +	u64 timestamp, cur_tsc, cur_ns;
   1.171 +
   1.172 +	if (firsttime) {	/* first call: prime the cached aux value, scaling values and last timestamp */
   1.173 +		cur_tsc = do_rdtscp(&last_tsc_aux);
   1.174 +		get_xen_time_values(&xen_ns_offset, &xen_tsc_to_ns_mul_frac,
   1.175 +					&xen_tsc_to_ns_shift);
   1.176 +		cur_ns = scale_delta(cur_tsc, xen_tsc_to_ns_mul_frac,
   1.177 +					xen_tsc_to_ns_shift);
   1.178 +		timestamp = cur_ns - xen_ns_offset;
   1.179 +		last_pvrdtscp_timestamp = timestamp;
   1.180 +		firsttime = 0;
   1.181 +	}
   1.182 +	cur_tsc = do_rdtscp(&this_tsc_aux);
   1.183 +	*discontinuity = 0;
   1.184 +	while (this_tsc_aux != last_tsc_aux) {	/* cached scaling values are stale */
   1.185 +		/* if tsc_aux changed, try again */
   1.186 +		last_tsc_aux = this_tsc_aux;
   1.187 +		get_xen_time_values(&xen_ns_offset, &xen_tsc_to_ns_mul_frac,
   1.188 +					&xen_tsc_to_ns_shift);
   1.189 +		cur_tsc = do_rdtscp(&this_tsc_aux);	/* re-read so TSC and scaling values match */
   1.190 +		*discontinuity = 1;
   1.191 +	}
   1.192 +
   1.193 +	/* compute nsec from TSC and Xen time values */
   1.194 +	cur_ns = scale_delta(cur_tsc, xen_tsc_to_ns_mul_frac,
   1.195 +					xen_tsc_to_ns_shift);
   1.196 +	timestamp = cur_ns - xen_ns_offset;
   1.197 +
   1.198 +	/* enforce monotonicity just in case */
   1.199 +	if ((i64)(timestamp - last_pvrdtscp_timestamp) > 0)	/* signed difference, so strictly later only */
   1.200 +		last_pvrdtscp_timestamp = timestamp;
   1.201 +	else {
   1.202 +		/* this should never happen but we'll check it anyway in
   1.203 +		 * case some strange combination of scaling errors
   1.204 +		 * occurs across a very fast migration */
   1.205 +		printf("Time went backwards by %lluns\n",
   1.206 +		    (unsigned long long)(last_pvrdtscp_timestamp-timestamp));
   1.207 +		timestamp = ++last_pvrdtscp_timestamp;	/* hand out last+1 so the result still advances */
   1.208 +	}
   1.209 +	return timestamp;
   1.210 +}
   1.211 +/* Values for the hvm argument of running_on_xen(): selects hvm_cpuid() or pv_cpuid(). */
   1.212 +#define HVM 1
   1.213 +#define PVM 0
   1.214 +/* Scan CPUID leaves 0x40000000..0x4000ff00 for "XenVMMXenVMM"; on a hit store the version from leaf base+1 and return 1, else 0. */
   1.215 +static int running_on_xen(int hvm, u16 *version_major, u16 *version_minor)
   1.216 +{
   1.217 +	u32 eax, ebx, ecx, edx, base;
   1.218 +	union { char csig[16]; u32 u[4]; } sig;	/* register values viewed as a string */
   1.219 +
   1.220 +	for (base=0x40000000; base < 0x40010000; base += 0x100) {
   1.221 +		if (hvm==HVM)
   1.222 +			hvm_cpuid(base,0,&eax,&ebx,&ecx,&edx);
   1.223 +		else
   1.224 +			pv_cpuid(base,0,&eax,&ebx,&ecx,&edx);
   1.225 +		sig.u[0] = ebx; sig.u[1] = ecx; sig.u[2] = edx;
   1.226 +		sig.csig[12] = '\0';	/* terminate the 12 signature bytes for strcmp */
   1.227 +		if (!strcmp("XenVMMXenVMM",&sig.csig[0]) && (eax >= (base+2))) {	/* EAX must be at least base+2 */
   1.228 +				if (hvm==HVM)
   1.229 +					hvm_cpuid(base+1,0,&eax,&ebx,&ecx,&edx);
   1.230 +				else
   1.231 +					pv_cpuid(base+1,0,&eax,&ebx,&ecx,&edx);
   1.232 +				*version_major = (eax >> 16) & 0xffff;	/* upper 16 bits of EAX */
   1.233 +				*version_minor = eax & 0xffff;	/* lower 16 bits of EAX */
   1.234 +				return 1;
   1.235 +		}
   1.236 +	}
   1.237 +	return 0;
   1.238 +}
   1.239 +/* Check that we run as a Xen PV guest, then loop forever validating pvrdtscp timestamps. */
   1.240 +int main(int ac, char **av)
   1.241 +{
   1.242 +	u32 dummy;
   1.243 +	u16 version_hi, version_lo;
   1.244 +	unsigned long long ts, last_ts;	/* matches the %llu/%llx conversions below */
   1.245 +	int status, discontinuity = 0;
   1.246 +	pid_t pid;
   1.247 +
   1.248 +	if (running_on_xen(HVM,&version_hi,&version_lo)) {
   1.249 +		printf("running on Xen v%d.%d as an HVM domain, "
   1.250 +			"pvrdtsc not supported, exiting\n",
   1.251 +			(int)version_hi, (int)version_lo);
   1.252 +		exit(0);
   1.253 +	}
   1.254 +	pid = fork();	/* try pv_cpuid() in a child first, since its ud2a may kill the process */
   1.255 +	if (pid == -1) {
   1.256 +		fprintf(stderr,"Huh? Fork failed\n");
   1.257 +		return 0;
   1.258 +	} else if (pid == 0) { /* child */
   1.259 +		pv_cpuid(0x40000000,0,&dummy,&dummy,&dummy,&dummy);
   1.260 +		exit(0);
   1.261 +	}
   1.262 +	waitpid(pid,&status,0);
   1.263 +	if (!WIFEXITED(status))	/* child did not exit normally, presumably killed by the probe */
   1.264 +		exit(0);
   1.265 +	if (!running_on_xen(PVM,&version_hi,&version_lo)) {
   1.266 +		printf("not running on Xen, exiting\n");
   1.267 +		exit(0);
   1.268 +	}
   1.269 +	printf("running on Xen v%d.%d as a PV domain\n",
   1.270 +		(int)version_hi, (int)version_lo);
   1.271 +	if ( version_hi <= 3 ) {
   1.272 +		printf("pvrdtscp requires Xen version 4.0 or greater\n");
   1.273 +		/* exit(0); FIXME after xen-unstable is officially v4.0 */
   1.274 +	}
   1.275 +	if ( get_xen_tsc_mode() != 3 ) {
   1.276 +		printf("tsc_mode not pvrdtscp, set tsc_mode=3, exiting\n");
   1.277 +		exit(0);	/* the message announces an exit, so actually leave */
   1.278 +	}
   1.279 +
   1.280 +	/* OK, we are on Xen, now loop forever checking timestamps */
   1.281 +	ts = get_pvrdtscp_timestamp(&discontinuity);
   1.282 +	printf("Starting with ts=%lluns 0x%llx (%llusec)\n",ts,ts,ts/NSEC_PER_SEC);
   1.283 +	printf("incarn=%lu: vtsc=%lu, vtsc_khz=%lu, phys cpu_khz=%lu\n",
   1.284 +				(unsigned long)get_xen_incarnation(),
   1.285 +				(unsigned long)get_xen_vtsc(),
   1.286 +				(unsigned long)get_xen_vtsc_khz(),
   1.287 +				(unsigned long)get_xen_cpu_khz());
   1.288 +	last_ts = ts = get_pvrdtscp_timestamp(&discontinuity);
   1.289 +	while (1) {
   1.290 +		ts = get_pvrdtscp_timestamp(&discontinuity);
   1.291 +		if (discontinuity)
   1.292 +			printf("migrated/restored, incarn=%lu: "
   1.293 +                               "vtsc now %lu, vtsc_khz=%lu, phys cpu_khz=%lu\n",
   1.294 +				(unsigned long)get_xen_incarnation(),
   1.295 +				(unsigned long)get_xen_vtsc(),
   1.296 +				(unsigned long)get_xen_vtsc_khz(),
   1.297 +				(unsigned long)get_xen_cpu_khz());
   1.298 +		if (ts < last_ts)
   1.299 +			/* this should NEVER happen, especially since there
   1.300 +			 * is a check for it in get_pvrdtscp_timestamp() */
   1.301 +			printf("Time went backwards: %lluns (%llusec)\n",
   1.302 +				last_ts-ts,(last_ts-ts)/NSEC_PER_SEC);
   1.303 +		if (ts > last_ts + 200000000LL)
   1.304 +			/* this is OK, usually about 2sec for save/restore
   1.305 +			 * and a fraction of a second for live migrate */
   1.306 +			printf("Time jumped forward %lluns (%llusec)\n",
   1.307 +				ts-last_ts,(ts-last_ts)/NSEC_PER_SEC);
   1.308 +		last_ts = ts;
   1.309 +	}
   1.310 +}