/*
* xenalyze.c: Analyzing xentrace output
*
* Written by George Dunlap.
*
* Copyright (c) 2006-2007, XenSource Inc.
* Copyright (c) 2007-2008, Citrix Systems R&D Ltd, UK
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; If not, see <http://www.gnu.org/licenses/>.
*/
#define _XOPEN_SOURCE 600
#include <stdio.h>
#include <stdlib.h>
#include <argp.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <strings.h>
#include <string.h>
#include "analyze.h"
#include "mread.h"
#include "pv.h"
#include <assert.h>
struct mread_ctrl;
#define DEFAULT_CPU_HZ 2400000000LL
#define QHZ_FROM_HZ(_hz) (((_hz) << 10)/ 1000000000)
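/*
 * A worked example of the "qhz" fixed-point scheme, assuming the default
 * 2.4GHz clock: qhz = (2400000000 << 10) / 1000000000 = 2457, i.e. cycles
 * per nanosecond scaled up by 2^10.  Converting then needs only a shift
 * and a divide: ns = (cycles << 10) / qhz, so 2400000000 cycles map to
 * (2400000000 << 10) / 2457 = 1000244200 ns -- one second to within about
 * 0.03%, the cost of the truncating divide that computes qhz.
 */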
#define ADDR_SPACE_BITS 48
#define DEFAULT_SAMPLE_SIZE 1024
#define DEFAULT_SAMPLE_MAX 1024*1024*32
#define DEFAULT_INTERVAL_LENGTH 1000
struct array_struct {
unsigned long long *values;
int count;
};
#define warn_once(_x...) \
do { \
static int _w=1; \
if ( _w ) { \
_w=0; \
fprintf(warn, ##_x); \
} \
} while(0)
/* -- Global variables -- */
struct {
int fd;
struct mread_ctrl *mh;
struct symbol_struct * symbols;
char * symbol_file;
char * trace_file;
int output_defined;
off_t file_size;
struct {
off_t update_offset;
int pipe[2];
FILE* out;
int pid;
} progress;
} G = {
.fd=-1,
.symbols = NULL,
.symbol_file = NULL,
.trace_file = NULL,
.output_defined = 0,
.file_size = 0,
.progress = { .update_offset = 0 },
};
/*
Kinds of errors:
Unexpected values
- RIP with information in high bits (not all 0 or 1)
- exit reason too high
Unexpected record layout
- x64 bit set in PIO, PV_PTWR_EMULATION_PAE
- Unknown minor type (PV_PTWR_EMULATION, RUNSTATE_CHANGE)
- Wrong record size
- More than one bit set in evt.main field
Unexpected sequences
- wake tsc tracking
- TSC dependency loop
- Mismatch between non-running old event states
- Runstate continue while running on another pcpu
- lost_record_end seen in non-lost pcpu
- Unexpected non-CPU_CHANGE record during new_pcpu scan
- record tsc < interval start tsc
- lost_record tsc !> order tsc
Limited resources
- interrupt interval slots
- record cpu > MAX_CPUS
Algorithm asserts
- Duplicate CR3/domain values
- Logic holes
- domain runstates
- runstate / tsc skew
- vcpu_{prev,next}_update p->current{==,!=}null
- vcpu start conditions
- lost_cpu count higher than # of seen cpus / < 0
- lost cpu has non-null p->current
Symbol file
- file doesn't open
- file not ordered
System
- short read
- malloc failed
Args
- Invalid cpu_hz value / suffix
- No trace file
- Can't open trace file
*/
enum error_level {
ERR_NONE=0,
ERR_STRICT, /* Be unreasonably picky */
ERR_WARN, /* Something mildly unexpected */
ERR_SANITY, /* Sanity checks: RIP with info in high bits */
ERR_RECORD, /* Something that keeps you from processing the record */
ERR_FILE, /* Probably caused by a corrupt file */
ERR_LIMIT, /* Exceeded limits; data will be lost */
ERR_MAX_TOLERABLE=ERR_LIMIT,
/* -- Unrecoverable past this point -- */
ERR_ASSERT, /* Algorithm assert */
ERR_SYSTEM, /* System error: cannot allocate memory, short read, &c */
};
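/*
 * A sketch of how the levels are meant to gate processing -- error(),
 * defined later in the file, is expected to do roughly this: anything
 * worse than the configured tolerance aborts the run.
 *
 *   if ( l > opt.tolerance ) {
 *       if ( ri )
 *           dump_generic(warn, ri);
 *       exit(1);
 *   }
 */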
int verbosity = 5;
struct {
unsigned
scatterplot_interrupt_eip:1,
scatterplot_unpin_promote:1,
scatterplot_cr3_switch:1,
scatterplot_wake_to_halt:1,
scatterplot_io:1,
scatterplot_vmexit_eip:1,
scatterplot_runstate:1,
scatterplot_runstate_time:1,
scatterplot_pcpu:1,
scatterplot_extint_cycles:1,
scatterplot_rdtsc:1,
scatterplot_irq:1,
histogram_interrupt_eip:1,
interval_mode:1,
dump_all:1,
dump_raw_process:1,
dump_raw_reads:1,
dump_no_processing:1,
dump_ipi_latency:1,
dump_trace_volume_on_lost_record:1,
dump_show_power_states:1,
with_cr3_enumeration:1,
with_pio_enumeration:1,
with_mmio_enumeration:1,
with_interrupt_eip_enumeration:1,
show_default_domain_summary:1,
mmio_enumeration_skip_vga:1,
progress:1,
svm_mode:1,
summary:1,
report_pcpu:1,
tsc_loop_fatal:1,
summary_info;
long long cpu_qhz, cpu_hz;
int scatterplot_interrupt_vector;
int scatterplot_extint_cycles_vector;
int scatterplot_io_port;
int histogram_interrupt_vector;
unsigned long long histogram_interrupt_increment;
int interrupt_eip_enumeration_vector;
int default_guest_paging_levels;
int sample_size, sample_max;
enum error_level tolerance; /* Tolerate up to this level of error */
struct {
tsc_t cycles;
/* Used if interval is specified in seconds to delay calculating
* time_interval until all arguments have been processed (specifically,
* cpu_hz). */
unsigned msec;
enum {
INTERVAL_CR3_SCHEDULE_TIME,
INTERVAL_CR3_SCHEDULE_ORDERED,
INTERVAL_CR3_SHORT_SUMMARY,
INTERVAL_DOMAIN_TOTAL_TIME,
INTERVAL_DOMAIN_SHORT_SUMMARY,
INTERVAL_DOMAIN_GUEST_INTERRUPT,
INTERVAL_DOMAIN_GRANT_MAPS
} output;
enum {
INTERVAL_MODE_CUSTOM,
INTERVAL_MODE_ARRAY,
INTERVAL_MODE_LIST
} mode;
enum {
INTERVAL_CHECK_NONE,
INTERVAL_CHECK_CR3,
INTERVAL_CHECK_DOMAIN
} check;
/* Options for specific interval output types */
union {
struct array_struct array;
};
int count;
} interval;
} opt = {
.scatterplot_interrupt_eip=0,
.scatterplot_unpin_promote=0,
.scatterplot_cr3_switch=0,
.scatterplot_wake_to_halt=0,
.scatterplot_vmexit_eip=0,
.scatterplot_runstate=0,
.scatterplot_runstate_time=0,
.scatterplot_pcpu=0,
.scatterplot_extint_cycles=0,
.scatterplot_rdtsc=0,
.scatterplot_irq=0,
.histogram_interrupt_eip=0,
.dump_all = 0,
.dump_raw_process = 0,
.dump_raw_reads = 0,
.dump_no_processing = 0,
.dump_ipi_latency = 0,
.dump_trace_volume_on_lost_record = 0,
.dump_show_power_states = 0,
.with_cr3_enumeration = 0,
.with_pio_enumeration = 1,
.with_mmio_enumeration = 0,
.with_interrupt_eip_enumeration = 0,
.show_default_domain_summary = 0,
.mmio_enumeration_skip_vga = 1,
.progress = 0,
.svm_mode = 0,
.summary = 0,
.report_pcpu = 0,
.tsc_loop_fatal = 0,
.cpu_hz = DEFAULT_CPU_HZ,
/* Pre-calculate a multiplier that makes the rest of the
* calculations easier */
.cpu_qhz = QHZ_FROM_HZ(DEFAULT_CPU_HZ),
.default_guest_paging_levels = 2,
.sample_size = DEFAULT_SAMPLE_SIZE,
.sample_max = DEFAULT_SAMPLE_MAX,
.tolerance = ERR_SANITY,
.interval = { .msec = DEFAULT_INTERVAL_LENGTH },
};
FILE *warn = NULL;
/* -- Summary data -- */
struct cycle_framework {
tsc_t first_tsc, last_tsc, total_cycles;
};
struct interval_element {
int count;
long long cycles;
long long instructions;
};
struct cycle_summary {
int event_count, count, sample_size;
unsigned long long cycles;
long long *sample;
struct interval_element interval;
};
/* -- Symbol list information -- */
#define SYMBOL_ENTRIES_PER_STRUCT 1023
#define SYMBOL_NAME_SIZE 124
struct symbol_struct {
int count;
struct {
unsigned long long addr;
char name[SYMBOL_NAME_SIZE];
} symbols[SYMBOL_ENTRIES_PER_STRUCT];
struct symbol_struct *next;
};
void error(enum error_level l, struct record_info *ri);
void parse_symbol_file(char *fn) {
unsigned long long last_addr = 0;
FILE * symbol_file;
struct symbol_struct ** p=&G.symbols;
if((symbol_file=fopen(fn, "rb"))==NULL) {
fprintf(stderr, "Could not open symbol file %s\n", fn);
perror("open");
error(ERR_SYSTEM, NULL);
}
while(!feof(symbol_file)) {
/* Allocate a new struct if we need it */
if(!*p) {
*p = malloc(sizeof(**p));
if(!*p) {
fprintf(stderr, "Malloc failed!\n");
error(ERR_SYSTEM, NULL);
}
(*p)->count=0;
(*p)->next=NULL;
}
/* FIXME -- use regexp. This won't work for symbols with spaces (yes they exist) */
(*p)->symbols[(*p)->count].addr = 0xDEADBEEF;
if ( fscanf(symbol_file, "%llx %123s", /* SYMBOL_NAME_SIZE - 1 */
&(*p)->symbols[(*p)->count].addr,
(*p)->symbols[(*p)->count].name) == 0 )
break;
if( ((*p)->symbols[(*p)->count].addr > 0)
&& ((*p)->symbols[(*p)->count].addr < last_addr) ) {
fprintf(stderr, "Symbol file not properly ordered: %llx %s < %llx!\n",
(*p)->symbols[(*p)->count].addr,
(*p)->symbols[(*p)->count].name,
last_addr);
/* Could be recovered from; just free existing strings and set symbols to NULL */
error(ERR_ASSERT, NULL);
} else
last_addr = (*p)->symbols[(*p)->count].addr;
(*p)->count++;
/* If this struct is full, point to the next. It will be allocated
if needed. */
if((*p)->count == SYMBOL_ENTRIES_PER_STRUCT) {
p=&((*p)->next);
}
}
fclose(symbol_file);
}
/* WARNING not thread safe */
char * find_symbol(unsigned long long addr) {
struct symbol_struct * p=G.symbols;
int i;
char * lastname="ZERO";
unsigned long long offset=addr;
static char name[144];
if(!p) {
name[0]=0;
return name;
}
while(1) {
if(!p)
goto finish;
for(i=0; i<p->count; i++) {
if(p->symbols[i].addr > addr)
goto finish;
else {
lastname=p->symbols[i].name;
offset=addr - p->symbols[i].addr;
}
}
p=p->next;
}
finish:
snprintf(name, 144, "(%s +%llx)",
lastname, offset);
return name;
}
/* -- Eip list data -- */
enum {
EIP_LIST_TYPE_NONE=0,
EIP_LIST_TYPE_MAX
};
struct eip_list_struct {
struct eip_list_struct *next;
unsigned long long eip;
struct cycle_summary summary;
int type;
void * extra;
};
struct {
void (*update)(struct eip_list_struct *, void *);
void (*new)(struct eip_list_struct *, void *);
void (*dump)(struct eip_list_struct *);
} eip_list_type[EIP_LIST_TYPE_MAX] = {
[EIP_LIST_TYPE_NONE] = {
.update=NULL,
.new=NULL,
.dump=NULL },
};
/* --- HVM class of events --- */
/*
* -- Algorithms --
*
* Interrupt Wake-to-halt detection
*
* Purpose: To correlate device interrupts to vcpu runtime.
*
* Diagram:
* ...
* blocked -> runnable <- set to waking
* ...
* runnable -> running
* inj_virq A <- Note "waking" interrupt
* vmenter <- Start tsc of "wake-to-halt" interval.
Turn off 'waking'.
* ...
* inj_virq B <- Note alternate interrupt
* vmenter <- Start tsc of "interrupt-to-halt" interval
* ...
* vmexit <- End tsc of "x-to-halt" interval
* running -> blocked <- Process
*
* The "waking" interrupts we want to sub-classify into
* "wake-only" (when interrupt was the only interrupt from wake to halt) and
* "wake-all" (whether this was the only interrupt or not).
*/
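/*
 * In terms of state this needs roughly (see struct hvm_data.w2h below):
 * a 'waking' flag, set at wakeup and cleared at the first vmentry; the
 * vector of the waking interrupt; and a count of interrupts injected
 * since the wake, so that at halt "wake-only" (exactly one interrupt)
 * can be told apart from "wake-all" (one or more).
 */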
/* VMX data */
#define EXIT_REASON_EXCEPTION_NMI 0
#define EXIT_REASON_EXTERNAL_INTERRUPT 1
#define EXIT_REASON_TRIPLE_FAULT 2
#define EXIT_REASON_INIT 3
#define EXIT_REASON_SIPI 4
#define EXIT_REASON_IO_SMI 5
#define EXIT_REASON_OTHER_SMI 6
#define EXIT_REASON_PENDING_INTERRUPT 7
#define EXIT_REASON_PENDING_VIRT_NMI 8
#define EXIT_REASON_TASK_SWITCH 9
#define EXIT_REASON_CPUID 10
#define EXIT_REASON_GETSEC 11
#define EXIT_REASON_HLT 12
#define EXIT_REASON_INVD 13
#define EXIT_REASON_INVLPG 14
#define EXIT_REASON_RDPMC 15
#define EXIT_REASON_RDTSC 16
#define EXIT_REASON_RSM 17
#define EXIT_REASON_VMCALL 18
#define EXIT_REASON_VMCLEAR 19
#define EXIT_REASON_VMLAUNCH 20
#define EXIT_REASON_VMPTRLD 21
#define EXIT_REASON_VMPTRST 22
#define EXIT_REASON_VMREAD 23
#define EXIT_REASON_VMRESUME 24
#define EXIT_REASON_VMWRITE 25
#define EXIT_REASON_VMOFF 26
#define EXIT_REASON_VMON 27
#define EXIT_REASON_CR_ACCESS 28
#define EXIT_REASON_DR_ACCESS 29
#define EXIT_REASON_IO_INSTRUCTION 30
#define EXIT_REASON_MSR_READ 31
#define EXIT_REASON_MSR_WRITE 32
#define EXIT_REASON_INVALID_GUEST_STATE 33
#define EXIT_REASON_MSR_LOADING 34
#define EXIT_REASON_MWAIT_INSTRUCTION 36
#define EXIT_REASON_MONITOR_TRAP_FLAG 37
#define EXIT_REASON_MONITOR_INSTRUCTION 39
#define EXIT_REASON_PAUSE_INSTRUCTION 40
#define EXIT_REASON_MACHINE_CHECK 41
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
#define EXIT_REASON_APIC_ACCESS 44
#define EXIT_REASON_ACCESS_GDTR_OR_IDTR 46
#define EXIT_REASON_ACCESS_LDTR_OR_TR 47
#define EXIT_REASON_EPT_VIOLATION 48
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_INVEPT 50
#define EXIT_REASON_RDTSCP 51
#define EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED 52
#define EXIT_REASON_INVVPID 53
#define EXIT_REASON_WBINVD 54
#define EXIT_REASON_XSETBV 55
#define HVM_VMX_EXIT_REASON_MAX (EXIT_REASON_XSETBV+1)
char * hvm_vmx_exit_reason_name[HVM_VMX_EXIT_REASON_MAX] = {
[EXIT_REASON_EXCEPTION_NMI]="EXCEPTION_NMI",
[EXIT_REASON_EXTERNAL_INTERRUPT]="EXTERNAL_INTERRUPT",
[EXIT_REASON_TRIPLE_FAULT]="TRIPLE_FAULT",
[EXIT_REASON_INIT]="INIT",
[EXIT_REASON_SIPI]="SIPI",
[EXIT_REASON_IO_SMI]="IO_SMI",
[EXIT_REASON_OTHER_SMI]="OTHER_SMI",
[EXIT_REASON_PENDING_INTERRUPT]="PENDING_INTERRUPT",
[EXIT_REASON_PENDING_VIRT_NMI]="PENDING_VIRT_NMI",
[EXIT_REASON_TASK_SWITCH]="TASK_SWITCH",
[EXIT_REASON_CPUID]="CPUID",
[EXIT_REASON_GETSEC]="GETSEC",
[EXIT_REASON_HLT]="HLT",
[EXIT_REASON_INVD]="INVD",
[EXIT_REASON_INVLPG]="INVLPG",
[EXIT_REASON_RDPMC]="RDPMC",
[EXIT_REASON_RDTSC]="RDTSC",
[EXIT_REASON_RSM]="RSM",
[EXIT_REASON_VMCALL]="VMCALL",
[EXIT_REASON_VMCLEAR]="VMCLEAR",
[EXIT_REASON_VMLAUNCH]="VMLAUNCH",
[EXIT_REASON_VMPTRLD]="VMPTRLD",
[EXIT_REASON_VMPTRST]="VMPTRST",
[EXIT_REASON_VMREAD]="VMREAD",
[EXIT_REASON_VMRESUME]="VMRESUME",
[EXIT_REASON_VMWRITE]="VMWRITE",
[EXIT_REASON_VMOFF]="VMOFF",
[EXIT_REASON_VMON]="VMON",
[EXIT_REASON_CR_ACCESS]="CR_ACCESS",
[EXIT_REASON_DR_ACCESS]="DR_ACCESS",
[EXIT_REASON_IO_INSTRUCTION]="IO_INSTRUCTION",
[EXIT_REASON_MSR_READ]="MSR_READ",
[EXIT_REASON_MSR_WRITE]="MSR_WRITE",
[EXIT_REASON_INVALID_GUEST_STATE]="INVALID_GUEST_STATE",
[EXIT_REASON_MSR_LOADING]="MSR_LOADING",
[EXIT_REASON_MWAIT_INSTRUCTION]="MWAIT_INSTRUCTION",
[EXIT_REASON_MONITOR_TRAP_FLAG]="MONITOR_TRAP_FLAG",
[EXIT_REASON_MONITOR_INSTRUCTION]="MONITOR_INSTRUCTION",
[EXIT_REASON_PAUSE_INSTRUCTION]="PAUSE_INSTRUCTION",
[EXIT_REASON_MACHINE_CHECK]="MACHINE_CHECK",
[EXIT_REASON_TPR_BELOW_THRESHOLD]="TPR_BELOW_THRESHOLD",
[EXIT_REASON_APIC_ACCESS]="APIC_ACCESS",
[EXIT_REASON_EPT_VIOLATION]="EPT_VIOLATION",
[EXIT_REASON_EPT_MISCONFIG]="EPT_MISCONFIG",
[EXIT_REASON_INVEPT]="INVEPT",
[EXIT_REASON_RDTSCP]="RDTSCP",
[EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED]="VMX_PREEMPTION_TIMER_EXPIRED",
[EXIT_REASON_INVVPID]="INVVPID",
[EXIT_REASON_WBINVD]="WBINVD",
[EXIT_REASON_XSETBV]="XSETBV",
};
/* SVM data */
enum VMEXIT_EXITCODE
{
/* control register read exitcodes */
VMEXIT_CR0_READ = 0,
VMEXIT_CR1_READ = 1,
VMEXIT_CR2_READ = 2,
VMEXIT_CR3_READ = 3,
VMEXIT_CR4_READ = 4,
VMEXIT_CR5_READ = 5,
VMEXIT_CR6_READ = 6,
VMEXIT_CR7_READ = 7,
VMEXIT_CR8_READ = 8,
VMEXIT_CR9_READ = 9,
VMEXIT_CR10_READ = 10,
VMEXIT_CR11_READ = 11,
VMEXIT_CR12_READ = 12,
VMEXIT_CR13_READ = 13,
VMEXIT_CR14_READ = 14,
VMEXIT_CR15_READ = 15,
/* control register write exitcodes */
VMEXIT_CR0_WRITE = 16,
VMEXIT_CR1_WRITE = 17,
VMEXIT_CR2_WRITE = 18,
VMEXIT_CR3_WRITE = 19,
VMEXIT_CR4_WRITE = 20,
VMEXIT_CR5_WRITE = 21,
VMEXIT_CR6_WRITE = 22,
VMEXIT_CR7_WRITE = 23,
VMEXIT_CR8_WRITE = 24,
VMEXIT_CR9_WRITE = 25,
VMEXIT_CR10_WRITE = 26,
VMEXIT_CR11_WRITE = 27,
VMEXIT_CR12_WRITE = 28,
VMEXIT_CR13_WRITE = 29,
VMEXIT_CR14_WRITE = 30,
VMEXIT_CR15_WRITE = 31,
/* debug register read exitcodes */
VMEXIT_DR0_READ = 32,
VMEXIT_DR1_READ = 33,
VMEXIT_DR2_READ = 34,
VMEXIT_DR3_READ = 35,
VMEXIT_DR4_READ = 36,
VMEXIT_DR5_READ = 37,
VMEXIT_DR6_READ = 38,
VMEXIT_DR7_READ = 39,
VMEXIT_DR8_READ = 40,
VMEXIT_DR9_READ = 41,
VMEXIT_DR10_READ = 42,
VMEXIT_DR11_READ = 43,
VMEXIT_DR12_READ = 44,
VMEXIT_DR13_READ = 45,
VMEXIT_DR14_READ = 46,
VMEXIT_DR15_READ = 47,
/* debug register write exitcodes */
VMEXIT_DR0_WRITE = 48,
VMEXIT_DR1_WRITE = 49,
VMEXIT_DR2_WRITE = 50,
VMEXIT_DR3_WRITE = 51,
VMEXIT_DR4_WRITE = 52,
VMEXIT_DR5_WRITE = 53,
VMEXIT_DR6_WRITE = 54,
VMEXIT_DR7_WRITE = 55,
VMEXIT_DR8_WRITE = 56,
VMEXIT_DR9_WRITE = 57,
VMEXIT_DR10_WRITE = 58,
VMEXIT_DR11_WRITE = 59,
VMEXIT_DR12_WRITE = 60,
VMEXIT_DR13_WRITE = 61,
VMEXIT_DR14_WRITE = 62,
VMEXIT_DR15_WRITE = 63,
/* processor exception exitcodes (VMEXIT_EXCP[0-31]) */
VMEXIT_EXCEPTION_DE = 64, /* divide-by-zero-error */
VMEXIT_EXCEPTION_DB = 65, /* debug */
VMEXIT_EXCEPTION_NMI = 66, /* non-maskable-interrupt */
VMEXIT_EXCEPTION_BP = 67, /* breakpoint */
VMEXIT_EXCEPTION_OF = 68, /* overflow */
VMEXIT_EXCEPTION_BR = 69, /* bound-range */
VMEXIT_EXCEPTION_UD = 70, /* invalid-opcode*/
VMEXIT_EXCEPTION_NM = 71, /* device-not-available */
VMEXIT_EXCEPTION_DF = 72, /* double-fault */
VMEXIT_EXCEPTION_09 = 73, /* unsupported (reserved) */
VMEXIT_EXCEPTION_TS = 74, /* invalid-tss */
VMEXIT_EXCEPTION_NP = 75, /* segment-not-present */
VMEXIT_EXCEPTION_SS = 76, /* stack */
VMEXIT_EXCEPTION_GP = 77, /* general-protection */
VMEXIT_EXCEPTION_PF = 78, /* page-fault */
VMEXIT_EXCEPTION_15 = 79, /* reserved */
VMEXIT_EXCEPTION_MF = 80, /* x87 floating-point exception-pending */
VMEXIT_EXCEPTION_AC = 81, /* alignment-check */
VMEXIT_EXCEPTION_MC = 82, /* machine-check */
VMEXIT_EXCEPTION_XF = 83, /* simd floating-point */
/* exceptions 20-31 (exitcodes 84-95) are reserved */
/* ...and the rest of the #VMEXITs */
VMEXIT_INTR = 96,
VMEXIT_NMI = 97,
VMEXIT_SMI = 98,
VMEXIT_INIT = 99,
VMEXIT_VINTR = 100,
VMEXIT_CR0_SEL_WRITE = 101,
VMEXIT_IDTR_READ = 102,
VMEXIT_GDTR_READ = 103,
VMEXIT_LDTR_READ = 104,
VMEXIT_TR_READ = 105,
VMEXIT_IDTR_WRITE = 106,
VMEXIT_GDTR_WRITE = 107,
VMEXIT_LDTR_WRITE = 108,
VMEXIT_TR_WRITE = 109,
VMEXIT_RDTSC = 110,
VMEXIT_RDPMC = 111,
VMEXIT_PUSHF = 112,
VMEXIT_POPF = 113,
VMEXIT_CPUID = 114,
VMEXIT_RSM = 115,
VMEXIT_IRET = 116,
VMEXIT_SWINT = 117,
VMEXIT_INVD = 118,
VMEXIT_PAUSE = 119,
VMEXIT_HLT = 120,
VMEXIT_INVLPG = 121,
VMEXIT_INVLPGA = 122,
VMEXIT_IOIO = 123,
VMEXIT_MSR = 124,
VMEXIT_TASK_SWITCH = 125,
VMEXIT_FERR_FREEZE = 126,
VMEXIT_SHUTDOWN = 127,
VMEXIT_VMRUN = 128,
VMEXIT_VMMCALL = 129,
VMEXIT_VMLOAD = 130,
VMEXIT_VMSAVE = 131,
VMEXIT_STGI = 132,
VMEXIT_CLGI = 133,
VMEXIT_SKINIT = 134,
VMEXIT_RDTSCP = 135,
VMEXIT_ICEBP = 136,
VMEXIT_WBINVD = 137,
VMEXIT_MONITOR = 138,
VMEXIT_MWAIT = 139,
VMEXIT_MWAIT_CONDITIONAL= 140,
VMEXIT_NPF = 1024, /* nested paging fault */
VMEXIT_INVALID = -1
};
#define HVM_SVM_EXIT_REASON_MAX 1025
char * hvm_svm_exit_reason_name[HVM_SVM_EXIT_REASON_MAX] = {
/* 0-15 */
"VMEXIT_CR0_READ",
"VMEXIT_CR1_READ",
"VMEXIT_CR2_READ",
"VMEXIT_CR3_READ",
"VMEXIT_CR4_READ",
"VMEXIT_CR5_READ",
"VMEXIT_CR6_READ",
"VMEXIT_CR7_READ",
"VMEXIT_CR8_READ",
"VMEXIT_CR9_READ",
"VMEXIT_CR10_READ",
"VMEXIT_CR11_READ",
"VMEXIT_CR12_READ",
"VMEXIT_CR13_READ",
"VMEXIT_CR14_READ",
"VMEXIT_CR15_READ",
/* 16-31 */
"VMEXIT_CR0_WRITE",
"VMEXIT_CR1_WRITE",
"VMEXIT_CR2_WRITE",
"VMEXIT_CR3_WRITE",
"VMEXIT_CR4_WRITE",
"VMEXIT_CR5_WRITE",
"VMEXIT_CR6_WRITE",
"VMEXIT_CR7_WRITE",
"VMEXIT_CR8_WRITE",
"VMEXIT_CR9_WRITE",
"VMEXIT_CR10_WRITE",
"VMEXIT_CR11_WRITE",
"VMEXIT_CR12_WRITE",
"VMEXIT_CR13_WRITE",
"VMEXIT_CR14_WRITE",
"VMEXIT_CR15_WRITE",
/* 32-47 */
"VMEXIT_DR0_READ",
"VMEXIT_DR1_READ",
"VMEXIT_DR2_READ",
"VMEXIT_DR3_READ",
"VMEXIT_DR4_READ",
"VMEXIT_DR5_READ",
"VMEXIT_DR6_READ",
"VMEXIT_DR7_READ",
"VMEXIT_DR8_READ",
"VMEXIT_DR9_READ",
"VMEXIT_DR10_READ",
"VMEXIT_DR11_READ",
"VMEXIT_DR12_READ",
"VMEXIT_DR13_READ",
"VMEXIT_DR14_READ",
"VMEXIT_DR15_READ",
/* 48-63 */
"VMEXIT_DR0_WRITE",
"VMEXIT_DR1_WRITE",
"VMEXIT_DR2_WRITE",
"VMEXIT_DR3_WRITE",
"VMEXIT_DR4_WRITE",
"VMEXIT_DR5_WRITE",
"VMEXIT_DR6_WRITE",
"VMEXIT_DR7_WRITE",
"VMEXIT_DR8_WRITE",
"VMEXIT_DR9_WRITE",
"VMEXIT_DR10_WRITE",
"VMEXIT_DR11_WRITE",
"VMEXIT_DR12_WRITE",
"VMEXIT_DR13_WRITE",
"VMEXIT_DR14_WRITE",
"VMEXIT_DR15_WRITE",
/* 64-83 */
"VMEXIT_EXCEPTION_DE",
"VMEXIT_EXCEPTION_DB",
"VMEXIT_EXCEPTION_NMI",
"VMEXIT_EXCEPTION_BP",
"VMEXIT_EXCEPTION_OF",
"VMEXIT_EXCEPTION_BR",
"VMEXIT_EXCEPTION_UD",
"VMEXIT_EXCEPTION_NM",
"VMEXIT_EXCEPTION_DF",
"VMEXIT_EXCEPTION_09",
"VMEXIT_EXCEPTION_TS",
"VMEXIT_EXCEPTION_NP",
"VMEXIT_EXCEPTION_SS",
"VMEXIT_EXCEPTION_GP",
"VMEXIT_EXCEPTION_PF",
"VMEXIT_EXCEPTION_15",
"VMEXIT_EXCEPTION_MF",
"VMEXIT_EXCEPTION_AC",
"VMEXIT_EXCEPTION_MC",
"VMEXIT_EXCEPTION_XF",
/* 84-95 */
"VMEXIT_EXCEPTION_20",
"VMEXIT_EXCEPTION_21",
"VMEXIT_EXCEPTION_22",
"VMEXIT_EXCEPTION_23",
"VMEXIT_EXCEPTION_24",
"VMEXIT_EXCEPTION_25",
"VMEXIT_EXCEPTION_26",
"VMEXIT_EXCEPTION_27",
"VMEXIT_EXCEPTION_28",
"VMEXIT_EXCEPTION_29",
"VMEXIT_EXCEPTION_30",
"VMEXIT_EXCEPTION_31",
/* 96-99 */
"VMEXIT_INTR",
"VMEXIT_NMI",
"VMEXIT_SMI",
"VMEXIT_INIT",
/* 100-109 */
"VMEXIT_VINTR",
"VMEXIT_CR0_SEL_WRITE",
"VMEXIT_IDTR_READ",
"VMEXIT_GDTR_READ",
"VMEXIT_LDTR_READ",
"VMEXIT_TR_READ",
"VMEXIT_IDTR_WRITE",
"VMEXIT_GDTR_WRITE",
"VMEXIT_LDTR_WRITE",
"VMEXIT_TR_WRITE",
/* 110-119 */
"VMEXIT_RDTSC",
"VMEXIT_RDPMC",
"VMEXIT_PUSHF",
"VMEXIT_POPF",
"VMEXIT_CPUID",
"VMEXIT_RSM",
"VMEXIT_IRET",
"VMEXIT_SWINT",
"VMEXIT_INVD",
"VMEXIT_PAUSE",
/* 120-129 */
"VMEXIT_HLT",
"VMEXIT_INVLPG",
"VMEXIT_INVLPGA",
"VMEXIT_IOIO",
"VMEXIT_MSR",
"VMEXIT_TASK_SWITCH",
"VMEXIT_FERR_FREEZE",
"VMEXIT_SHUTDOWN",
"VMEXIT_VMRUN",
"VMEXIT_VMMCALL",
/* 130-139 */
"VMEXIT_VMLOAD",
"VMEXIT_VMSAVE",
"VMEXIT_STGI",
"VMEXIT_CLGI",
"VMEXIT_SKINIT",
"VMEXIT_RDTSCP",
"VMEXIT_ICEBP",
"VMEXIT_WBINVD",
"VMEXIT_MONITOR",
"VMEXIT_MWAIT",
/* 140 */
"VMEXIT_MWAIT_CONDITIONAL",
[VMEXIT_NPF] = "VMEXIT_NPF", /* nested paging fault */
};
#if ( HVM_VMX_EXIT_REASON_MAX > HVM_SVM_EXIT_REASON_MAX )
# define HVM_EXIT_REASON_MAX HVM_VMX_EXIT_REASON_MAX
# error - Strange!
#else
# define HVM_EXIT_REASON_MAX HVM_SVM_EXIT_REASON_MAX
#endif
/* General hvm information */
#define SPURIOUS_APIC_VECTOR 0xff
#define ERROR_APIC_VECTOR 0xfe
#define INVALIDATE_TLB_VECTOR 0xfd
#define EVENT_CHECK_VECTOR 0xfc
#define CALL_FUNCTION_VECTOR 0xfb
#define THERMAL_APIC_VECTOR 0xfa
#define LOCAL_TIMER_VECTOR 0xf9
#define EXTERNAL_INTERRUPT_MAX 256
/* Stringify numbers */
char * hvm_extint_vector_name[EXTERNAL_INTERRUPT_MAX] = {
[SPURIOUS_APIC_VECTOR] = "SPURIOUS_APIC",
[ERROR_APIC_VECTOR] = "ERROR_APIC",
[INVALIDATE_TLB_VECTOR]= "INVALIDATE_TLB",
[EVENT_CHECK_VECTOR]= "EVENT_CHECK",
[CALL_FUNCTION_VECTOR]= "CALL_FUNCTION",
[THERMAL_APIC_VECTOR]= "THERMAL_APIC",
[LOCAL_TIMER_VECTOR] = "LOCAL_TIMER",
};
#define HVM_TRAP_MAX 20
char * hvm_trap_name[HVM_TRAP_MAX] = {
[0] = "Divide",
[1] = "RESERVED",
[2] = "NMI",
[3] = "Breakpoint",
[4] = "Overflow",
[5] = "BOUND",
[6] = "Invalid Op",
[7] = "Coprocessor not present",
[8] = "Double Fault",
[9] = "Coprocessor segment overrun",
[10] = "TSS",
[11] = "Segment not present",
[12] = "Stack-segment fault",
[13] = "GP",
[14] = "Page fault",
[15] = "RESERVED",
[16] = "FPU",
[17] = "Alignment check",
[18] = "Machine check",
[19] = "SIMD",
};
enum {
HVM_EVENT_HANDLER_NONE = 0,
HVM_EVENT_HANDLER_PF_XEN = 1,
HVM_EVENT_HANDLER_PF_INJECT,
HVM_EVENT_HANDLER_INJ_EXC,
HVM_EVENT_HANDLER_INJ_VIRQ,
HVM_EVENT_HANDLER_REINJ_VIRQ,
HVM_EVENT_HANDLER_IO_READ,
HVM_EVENT_HANDLER_IO_WRITE,
HVM_EVENT_HANDLER_CR_READ, /* 8 */
HVM_EVENT_HANDLER_CR_WRITE,
HVM_EVENT_HANDLER_DR_READ,
HVM_EVENT_HANDLER_DR_WRITE,
HVM_EVENT_HANDLER_MSR_READ,
HVM_EVENT_HANDLER_MSR_WRITE,
HVM_EVENT_HANDLER_CPUID,
HVM_EVENT_HANDLER_INTR,
HVM_EVENT_HANDLER_NMI, /* 16 */
HVM_EVENT_HANDLER_SMI,
HVM_EVENT_HANDLER_VMCALL,
HVM_EVENT_HANDLER_HLT,
HVM_EVENT_HANDLER_INVLPG,
HVM_EVENT_HANDLER_MCE,
HVM_EVENT_HANDLER_IO_ASSIST,
HVM_EVENT_HANDLER_MMIO_ASSIST,
HVM_EVENT_HANDLER_CLTS,
HVM_EVENT_HANDLER_LMSW,
HVM_EVENT_RDTSC,
HVM_EVENT_INTR_WINDOW=0x20, /* Oops... skipped 0x1b-1f */
HVM_EVENT_NPF,
HVM_EVENT_REALMODE_EMULATE,
HVM_EVENT_TRAP,
HVM_EVENT_TRAP_DEBUG,
HVM_EVENT_VLAPIC,
HVM_EVENT_HANDLER_MAX
};
char * hvm_event_handler_name[HVM_EVENT_HANDLER_MAX] = {
"(no handler)",
"pf_xen",
"pf_inject",
"inj_exc",
"inj_virq",
"reinj_virq",
"io_read",
"io_write",
"cr_read", /* 8 */
"cr_write",
"dr_read",
"dr_write",
"msr_read",
"msr_write",
"cpuid",
"intr",
"nmi", /* 16 */
"smi",
"vmcall",
"hlt",
"invlpg",
"mce",
"io_assist",
"mmio_assist",
"clts", /* 24 */
"lmsw",
"rdtsc",
[HVM_EVENT_INTR_WINDOW]="intr_window",
"npf",
"realmode_emulate",
"trap",
"trap_debug",
"vlapic"
};
enum {
HVM_VOL_VMENTRY,
HVM_VOL_VMEXIT,
HVM_VOL_HANDLER,
HVM_VOL_MAX
};
enum {
GUEST_INTERRUPT_CASE_NONE,
/* This interrupt woke, no other interrupts until halt */
GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ALONE,
/* This interrupt woke, maybe another interrupt before halt */
GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ANY,
/* Time from interrupt (running) to halt */
GUEST_INTERRUPT_CASE_INTERRUPT_TO_HALT,
GUEST_INTERRUPT_CASE_MAX,
};
char *guest_interrupt_case_name[] = {
[GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ALONE]="wake to halt alone",
/* This interrupt woke, maybe another interrupt before halt */
[GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ANY] ="wake to halt any ",
/* Time from interrupt (running) to halt */
[GUEST_INTERRUPT_CASE_INTERRUPT_TO_HALT] ="intr to halt ",
};
char *hvm_vol_name[HVM_VOL_MAX] = {
[HVM_VOL_VMENTRY]="vmentry",
[HVM_VOL_VMEXIT] ="vmexit",
[HVM_VOL_HANDLER]="handler",
};
enum {
HYPERCALL_set_trap_table = 0,
HYPERCALL_mmu_update,
HYPERCALL_set_gdt,
HYPERCALL_stack_switch,
HYPERCALL_set_callbacks,
HYPERCALL_fpu_taskswitch,
HYPERCALL_sched_op_compat,
HYPERCALL_platform_op,
HYPERCALL_set_debugreg,
HYPERCALL_get_debugreg,
HYPERCALL_update_descriptor,
HYPERCALL_memory_op=12,
HYPERCALL_multicall,
HYPERCALL_update_va_mapping,
HYPERCALL_set_timer_op,
HYPERCALL_event_channel_op_compat,
HYPERCALL_xen_version,
HYPERCALL_console_io,
HYPERCALL_physdev_op_compat,
HYPERCALL_grant_table_op,
HYPERCALL_vm_assist,
HYPERCALL_update_va_mapping_otherdomain,
HYPERCALL_iret,
HYPERCALL_vcpu_op,
HYPERCALL_set_segment_base,
HYPERCALL_mmuext_op,
HYPERCALL_acm_op,
HYPERCALL_nmi_op,
HYPERCALL_sched_op,
HYPERCALL_callback_op,
HYPERCALL_xenoprof_op,
HYPERCALL_event_channel_op,
HYPERCALL_physdev_op,
HYPERCALL_hvm_op,
HYPERCALL_sysctl,
HYPERCALL_domctl,
HYPERCALL_kexec_op,
HYPERCALL_MAX
};
char *hypercall_name[HYPERCALL_MAX] = {
[HYPERCALL_set_trap_table]="set_trap_table",
[HYPERCALL_mmu_update]="mmu_update",
[HYPERCALL_set_gdt]="set_gdt",
[HYPERCALL_stack_switch]="stack_switch",
[HYPERCALL_set_callbacks]="set_callbacks",
[HYPERCALL_fpu_taskswitch]="fpu_taskswitch",
[HYPERCALL_sched_op_compat]="sched_op(compat)",
[HYPERCALL_platform_op]="platform_op",
[HYPERCALL_set_debugreg]="set_debugreg",
[HYPERCALL_get_debugreg]="get_debugreg",
[HYPERCALL_update_descriptor]="update_descriptor",
[HYPERCALL_memory_op]="memory_op",
[HYPERCALL_multicall]="multicall",
[HYPERCALL_update_va_mapping]="update_va_mapping",
[HYPERCALL_set_timer_op]="set_timer_op",
[HYPERCALL_event_channel_op_compat]="evtchn_op(compat)",
[HYPERCALL_xen_version]="xen_version",
[HYPERCALL_console_io]="console_io",
[HYPERCALL_physdev_op_compat]="physdev_op(compat)",
[HYPERCALL_grant_table_op]="grant_table_op",
[HYPERCALL_vm_assist]="vm_assist",
[HYPERCALL_update_va_mapping_otherdomain]="update_va_mapping_otherdomain",
[HYPERCALL_iret]="iret",
[HYPERCALL_vcpu_op]="vcpu_op",
[HYPERCALL_set_segment_base]="set_segment_base",
[HYPERCALL_mmuext_op]="mmuext_op",
[HYPERCALL_acm_op]="acm_op",
[HYPERCALL_nmi_op]="nmi_op",
[HYPERCALL_sched_op]="sched_op",
[HYPERCALL_callback_op]="callback_op",
[HYPERCALL_xenoprof_op]="xenoprof_op",
[HYPERCALL_event_channel_op]="evtchn_op",
[HYPERCALL_physdev_op]="physdev_op",
[HYPERCALL_hvm_op]="hvm_op",
[HYPERCALL_sysctl]="sysctl",
[HYPERCALL_domctl]="domctl",
[HYPERCALL_kexec_op]="kexec_op"
};
enum {
PF_XEN_EMUL_LVL_0,
PF_XEN_EMUL_LVL_1,
PF_XEN_EMUL_LVL_2,
PF_XEN_EMUL_LVL_3,
PF_XEN_EMUL_LVL_4,
PF_XEN_EMUL_EARLY_UNSHADOW,
PF_XEN_EMUL_SET_CHANGED,
PF_XEN_EMUL_SET_UNCHANGED,
PF_XEN_EMUL_SET_FLUSH,
PF_XEN_EMUL_SET_ERROR,
PF_XEN_EMUL_PROMOTE,
PF_XEN_EMUL_DEMOTE,
PF_XEN_EMUL_PREALLOC_UNPIN,
PF_XEN_EMUL_PREALLOC_UNHOOK,
PF_XEN_EMUL_MAX,
};
char * pf_xen_emul_name[PF_XEN_EMUL_MAX] = {
[PF_XEN_EMUL_LVL_0]="non-linmap",
[PF_XEN_EMUL_LVL_1]="linmap l1",
[PF_XEN_EMUL_LVL_2]="linmap l2",
[PF_XEN_EMUL_LVL_3]="linmap l3",
[PF_XEN_EMUL_LVL_4]="linmap l4",
[PF_XEN_EMUL_EARLY_UNSHADOW]="early unshadow",
[PF_XEN_EMUL_SET_UNCHANGED]="set unchanged",
[PF_XEN_EMUL_SET_CHANGED]="set changed",
[PF_XEN_EMUL_SET_FLUSH]="set flush",
[PF_XEN_EMUL_SET_ERROR]="set error",
[PF_XEN_EMUL_PROMOTE]="promote",
[PF_XEN_EMUL_DEMOTE]="demote",
[PF_XEN_EMUL_PREALLOC_UNPIN]="unpin",
[PF_XEN_EMUL_PREALLOC_UNHOOK]="unhook",
};
/* Rio only */
enum {
PF_XEN_NON_EMUL_VA_USER,
PF_XEN_NON_EMUL_VA_KERNEL,
PF_XEN_NON_EMUL_EIP_USER,
PF_XEN_NON_EMUL_EIP_KERNEL,
PF_XEN_NON_EMUL_MAX,
};
char * pf_xen_non_emul_name[PF_XEN_NON_EMUL_MAX] = {
[PF_XEN_NON_EMUL_VA_USER]="va user",
[PF_XEN_NON_EMUL_VA_KERNEL]="va kernel",
[PF_XEN_NON_EMUL_EIP_USER]="eip user",
[PF_XEN_NON_EMUL_EIP_KERNEL]="eip kernel",
};
enum {
PF_XEN_FIXUP_PREALLOC_UNPIN,
PF_XEN_FIXUP_PREALLOC_UNHOOK,
PF_XEN_FIXUP_UNSYNC,
PF_XEN_FIXUP_OOS_ADD,
PF_XEN_FIXUP_OOS_EVICT,
PF_XEN_FIXUP_PROMOTE,
PF_XEN_FIXUP_UPDATE_ONLY,
PF_XEN_FIXUP_WRMAP,
PF_XEN_FIXUP_BRUTE_FORCE,
PF_XEN_FIXUP_MAX,
};
char * pf_xen_fixup_name[PF_XEN_FIXUP_MAX] = {
[PF_XEN_FIXUP_PREALLOC_UNPIN] = "unpin",
[PF_XEN_FIXUP_PREALLOC_UNHOOK] = "unhook",
[PF_XEN_FIXUP_UNSYNC] = "unsync",
[PF_XEN_FIXUP_OOS_ADD] = "oos-add",
[PF_XEN_FIXUP_OOS_EVICT] = "oos-evict",
[PF_XEN_FIXUP_PROMOTE] = "promote",
[PF_XEN_FIXUP_UPDATE_ONLY] = "update",
[PF_XEN_FIXUP_WRMAP] = "wrmap",
[PF_XEN_FIXUP_BRUTE_FORCE] = "wrmap-bf",
};
enum {
PF_XEN_NOT_SHADOW = 1,
PF_XEN_FAST_PROPAGATE,
PF_XEN_FAST_MMIO,
PF_XEN_FALSE_FAST_PATH,
PF_XEN_MMIO,
PF_XEN_FIXUP,
PF_XEN_DOMF_DYING,
PF_XEN_EMULATE,
PF_XEN_EMULATE_UNSHADOW_USER,
PF_XEN_EMULATE_UNSHADOW_EVTINJ,
PF_XEN_EMULATE_UNSHADOW_UNHANDLED,
PF_XEN_LAST_FAULT=PF_XEN_EMULATE_UNSHADOW_UNHANDLED,
PF_XEN_NON_EMULATE,
PF_XEN_NO_HANDLER,
PF_XEN_MAX,
};
#define SHADOW_WRMAP_BF 12
#define SHADOW_PREALLOC_UNPIN 13
#define SHADOW_RESYNC_FULL 14
#define SHADOW_RESYNC_ONLY 15
char * pf_xen_name[PF_XEN_MAX] = {
[PF_XEN_NOT_SHADOW]="propagate",
[PF_XEN_FAST_PROPAGATE]="fast propagate",
[PF_XEN_FAST_MMIO]="fast mmio",
[PF_XEN_FALSE_FAST_PATH]="false fast path",
[PF_XEN_MMIO]="mmio",
[PF_XEN_FIXUP]="fixup",
[PF_XEN_DOMF_DYING]="dom dying",
[PF_XEN_EMULATE]="emulate",
[PF_XEN_EMULATE_UNSHADOW_USER]="unshadow:user-mode",
[PF_XEN_EMULATE_UNSHADOW_EVTINJ]="unshadow:evt inj",
[PF_XEN_EMULATE_UNSHADOW_UNHANDLED]="unshadow:unhandled instr",
[PF_XEN_NON_EMULATE]="fixup|mmio",
[PF_XEN_NO_HANDLER]="(no handler)",
};
#define CORR_VA_INVALID (0ULL-1)
enum {
NONPF_MMIO_APIC,
NONPF_MMIO_NPF,
NONPF_MMIO_UNKNOWN,
NONPF_MMIO_MAX
};
struct mmio_info {
unsigned long long gpa;
unsigned long long va; /* Filled only by shadow */
unsigned data;
unsigned data_valid:1, is_write:1;
};
struct pf_xen_extra {
unsigned long long va;
union {
unsigned flags;
struct {
unsigned flag_set_ad:1,
flag_set_a:1,
flag_shadow_l1_get_ref:1,
flag_shadow_l1_put_ref:1,
flag_l2_propagate:1,
flag_set_changed:1,
flag_set_flush:1,
flag_set_error:1,
flag_demote:1,
flag_promote:1,
flag_wrmap:1,
flag_wrmap_guess_found:1,
flag_wrmap_brute_force:1,
flag_early_unshadow:1,
flag_emulation_2nd_pt_written:1,
flag_emulation_last_failed:1,
flag_emulate_full_pt:1,
flag_prealloc_unhook:1,
flag_unsync:1,
flag_oos_fixup_add:1,
flag_oos_fixup_evict:1;
};
}; /* Miami + ; fixup & emulate */
unsigned int error_code; /* Rio only */
/* Calculated */
int pf_case; /* Rio */
/* MMIO only */
unsigned long long gpa;
unsigned int data;
/* Emulate only */
unsigned long long gl1e; /* Miami + */
unsigned long long wval; /* Miami */
unsigned long long corresponding_va;
unsigned int pt_index[5], pt_is_lo;
int pt_level;
/* Other */
unsigned long long gfn;
/* Flags */
unsigned corr_valid:1,
corr_is_kernel:1,
va_is_kernel:1;
};
struct pcpu_info;
#define GUEST_INTERRUPT_MAX 350
#define FAKE_VECTOR 349
#define CR_MAX 9
#define RESYNCS_MAX 17
#define PF_XEN_FIXUP_UNSYNC_RESYNC_MAX 2
struct hvm_data;
struct hvm_summary_handler_node {
void (*handler)(struct hvm_data *, void* data);
void *data;
struct hvm_summary_handler_node *next;
};
struct hvm_data {
/* Summary information */
int init;
int vmexit_valid;
int summary_info;
struct vcpu_data *v; /* up-pointer */
/* SVM / VMX compatibility. FIXME - should be global */
char ** exit_reason_name;
int exit_reason_max;
struct hvm_summary_handler_node *exit_reason_summary_handler_list[HVM_EXIT_REASON_MAX];
/* Information about particular exit reasons */
struct {
struct cycle_summary exit_reason[HVM_EXIT_REASON_MAX];
int extint[EXTERNAL_INTERRUPT_MAX+1];
int *extint_histogram;
struct cycle_summary trap[HVM_TRAP_MAX];
struct cycle_summary pf_xen[PF_XEN_MAX];
struct cycle_summary pf_xen_emul[PF_XEN_EMUL_MAX];
struct cycle_summary pf_xen_emul_early_unshadow[5];
struct cycle_summary pf_xen_non_emul[PF_XEN_NON_EMUL_MAX];
struct cycle_summary pf_xen_fixup[PF_XEN_FIXUP_MAX];
struct cycle_summary pf_xen_fixup_unsync_resync[PF_XEN_FIXUP_UNSYNC_RESYNC_MAX+1];
struct cycle_summary cr_write[CR_MAX];
struct cycle_summary cr3_write_resyncs[RESYNCS_MAX+1];
struct cycle_summary vmcall[HYPERCALL_MAX+1];
struct cycle_summary generic[HVM_EVENT_HANDLER_MAX];
struct cycle_summary mmio[NONPF_MMIO_MAX];
struct hvm_gi_struct {
int count;
struct cycle_summary runtime[GUEST_INTERRUPT_CASE_MAX];
/* OK, not summary info, but still... */
int is_wake;
tsc_t start_tsc;
} guest_interrupt[GUEST_INTERRUPT_MAX + 1];
/* IPI Latency */
struct cycle_summary ipi_latency;
int ipi_count[256];
struct {
struct io_address *mmio, *pio;
} io;
} summary;
/* In-flight accumulation information */
struct {
union {
struct {
unsigned port:31,
is_write:1;
unsigned int val;
} io;
struct pf_xen_extra pf_xen;
struct {
unsigned cr;
unsigned long long val;
int repromote;
} cr_write;
struct {
unsigned addr;
unsigned long long val;
} msr;
struct {
unsigned int event;
uint32_t d[4];
} generic;
struct {
unsigned eax;
} vmcall;
struct {
unsigned vec;
} intr;
};
/* MMIO gets its separate area, since many exits may use it */
struct mmio_info mmio;
}inflight;
int resyncs;
void (*post_process)(struct hvm_data *);
tsc_t exit_tsc, arc_cycles, entry_tsc;
unsigned long long rip;
unsigned exit_reason, event_handler;
int short_summary_done:1, prealloc_unpin:1, wrmap_bf:1;
/* Immediate processing */
void *d;
/* Wake-to-halt detection. See comment above. */
struct {
unsigned waking:1;
/* Wake vector: keep track of time from vmentry until:
next halt, or next interrupt */
int vector, interrupts, interrupts_wanting_tsc;
} w2h;
/* Historical info */
tsc_t last_rdtsc;
};
enum {
HVM_SHORT_SUMMARY_EMULATE,
HVM_SHORT_SUMMARY_UNSYNC,
HVM_SHORT_SUMMARY_FIXUP,
HVM_SHORT_SUMMARY_MMIO,
HVM_SHORT_SUMMARY_PROPAGATE,
HVM_SHORT_SUMMARY_CR3,
HVM_SHORT_SUMMARY_VMCALL,
HVM_SHORT_SUMMARY_INTERRUPT,
HVM_SHORT_SUMMARY_HLT,
HVM_SHORT_SUMMARY_OTHER,
HVM_SHORT_SUMMARY_MAX,
};
char *hvm_short_summary_name[HVM_SHORT_SUMMARY_MAX] = {
[HVM_SHORT_SUMMARY_EMULATE] ="emulate",
[HVM_SHORT_SUMMARY_UNSYNC] ="unsync",
[HVM_SHORT_SUMMARY_FIXUP] ="fixup",
[HVM_SHORT_SUMMARY_MMIO] ="mmio",
[HVM_SHORT_SUMMARY_PROPAGATE]="propagate",
[HVM_SHORT_SUMMARY_CR3] ="cr3",
[HVM_SHORT_SUMMARY_VMCALL] ="vmcall",
[HVM_SHORT_SUMMARY_INTERRUPT]="intr",
[HVM_SHORT_SUMMARY_HLT] ="hlt",
[HVM_SHORT_SUMMARY_OTHER] ="other",
};
struct hvm_short_summary_struct {
struct cycle_summary s[HVM_SHORT_SUMMARY_MAX];
};
void init_hvm_data(struct hvm_data *h, struct vcpu_data *v) {
int i;
if(h->init)
return;
h->v = v;
h->init = 1;
if(opt.svm_mode) {
h->exit_reason_max = HVM_SVM_EXIT_REASON_MAX;
h->exit_reason_name = hvm_svm_exit_reason_name;
} else {
h->exit_reason_max = HVM_VMX_EXIT_REASON_MAX;
h->exit_reason_name = hvm_vmx_exit_reason_name;
}
if(opt.histogram_interrupt_eip) {
        int count = ((1ULL<<ADDR_SPACE_BITS)/opt.histogram_interrupt_increment)+1;
        size_t size = count * sizeof(int);

        h->summary.extint_histogram = malloc(size);
if(h->summary.extint_histogram)
bzero(h->summary.extint_histogram, size);
else {
fprintf(stderr, "FATAL: Could not allocate %zd bytes for interrupt histogram!\n",
size);
error(ERR_SYSTEM, NULL);
}
}
    for(i=0; i<GUEST_INTERRUPT_MAX+1; i++)
        h->summary.guest_interrupt[i].count=0;
}
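/*
 * Note on the histogram sizing above: it spans the whole 48-bit address
 * space, so the allocation is (2^48 / increment + 1) * sizeof(int).  A
 * (hypothetical) increment of 2^28 gives 2^20 buckets, i.e. 4MB of
 * counters; small increments get expensive quickly.
 */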
/* PV data */
enum {
PV_HYPERCALL=1,
PV_TRAP=3,
PV_PAGE_FAULT,
PV_FORCED_INVALID_OP,
PV_EMULATE_PRIVOP,
PV_EMULATE_4GB,
PV_MATH_STATE_RESTORE,
PV_PAGING_FIXUP,
PV_GDT_LDT_MAPPING_FAULT,
PV_PTWR_EMULATION,
PV_PTWR_EMULATION_PAE,
PV_HYPERCALL_V2 = 13,
PV_HYPERCALL_SUBCALL = 14,
PV_MAX
};
char *pv_name[PV_MAX] = {
[PV_HYPERCALL]="hypercall",
[PV_TRAP]="trap",
[PV_PAGE_FAULT]="page_fault",
[PV_FORCED_INVALID_OP]="forced_invalid_op",
[PV_EMULATE_PRIVOP]="emulate privop",
[PV_EMULATE_4GB]="emulate 4g",
[PV_MATH_STATE_RESTORE]="math state restore",
[PV_PAGING_FIXUP]="paging fixup",
[PV_GDT_LDT_MAPPING_FAULT]="gdt/ldt mapping fault",
[PV_PTWR_EMULATION]="ptwr",
[PV_PTWR_EMULATION_PAE]="ptwr(pae)",
[PV_HYPERCALL_V2]="hypercall",
[PV_HYPERCALL_SUBCALL]="hypercall (subcall)",
};
#define PV_HYPERCALL_MAX 56
#define PV_TRAP_MAX 20
struct pv_data {
unsigned summary_info:1;
int count[PV_MAX];
int hypercall_count[PV_HYPERCALL_MAX];
int trap_count[PV_TRAP_MAX];
};
/* Sched data */
enum {
RUNSTATE_RUNNING=0,
RUNSTATE_RUNNABLE,
RUNSTATE_BLOCKED,
RUNSTATE_OFFLINE,
RUNSTATE_LOST,
RUNSTATE_QUEUED,
RUNSTATE_INIT,
RUNSTATE_MAX
};
int runstate_graph[RUNSTATE_MAX] =
{
[RUNSTATE_BLOCKED]=0,
[RUNSTATE_OFFLINE]=1,
[RUNSTATE_RUNNABLE]=2,
[RUNSTATE_RUNNING]=3,
[RUNSTATE_LOST]=-1,
[RUNSTATE_QUEUED]=-2,
[RUNSTATE_INIT]=-2,
};
char * runstate_name[RUNSTATE_MAX]={
[RUNSTATE_RUNNING]= "running",
[RUNSTATE_RUNNABLE]="runnable",
[RUNSTATE_BLOCKED]= "blocked", /* to be blocked */
[RUNSTATE_OFFLINE]= "offline",
[RUNSTATE_QUEUED]= "queued",
[RUNSTATE_INIT]= "init",
[RUNSTATE_LOST]= "lost",
};
enum {
RUNNABLE_STATE_INVALID,
RUNNABLE_STATE_WAKE,
RUNNABLE_STATE_PREEMPT,
RUNNABLE_STATE_OTHER,
RUNNABLE_STATE_MAX
};
char * runnable_state_name[RUNNABLE_STATE_MAX]={
[RUNNABLE_STATE_INVALID]="invalid", /* Should never show up */
[RUNNABLE_STATE_WAKE]="wake",
[RUNNABLE_STATE_PREEMPT]="preempt",
[RUNNABLE_STATE_OTHER]="other",
};
/* Memory data */
enum {
MEM_PAGE_GRANT_MAP = 1,
MEM_PAGE_GRANT_UNMAP,
MEM_PAGE_GRANT_TRANSFER,
MEM_SET_P2M_ENTRY,
MEM_DECREASE_RESERVATION,
MEM_POD_POPULATE = 16,
MEM_POD_ZERO_RECLAIM,
MEM_POD_SUPERPAGE_SPLINTER,
MEM_MAX
};
char *mem_name[MEM_MAX] = {
[MEM_PAGE_GRANT_MAP] = "grant-map",
[MEM_PAGE_GRANT_UNMAP] = "grant-unmap",
[MEM_PAGE_GRANT_TRANSFER] = "grant-transfer",
[MEM_SET_P2M_ENTRY] = "set-p2m",
[MEM_DECREASE_RESERVATION] = "decrease-reservation",
[MEM_POD_POPULATE] = "pod-populate",
[MEM_POD_ZERO_RECLAIM] = "pod-zero-reclaim",
[MEM_POD_SUPERPAGE_SPLINTER] = "pod-superpage-splinter",
};
/* Per-unit information. */
struct cr3_value_struct {
struct cr3_value_struct * next;
struct cr3_value_struct * gnext;
unsigned long long gmfn;
int cr3_id;
unsigned long long first_time, last_time, run_time;
struct cycle_summary total_time, guest_time, hv_time;
int switch_count, flush_count;
struct hvm_short_summary_struct hvm;
struct {
int now;
int count;
} prealloc_unpin;
struct {
unsigned callback:1;
unsigned flush_count, switch_count;
unsigned fixup_user, emulate_corr_user;
} destroy;
};
#ifndef MAX_CPUS
#define MAX_CPUS 256
#endif
typedef uint32_t cpu_mask_t;
#define IDLE_DOMAIN 32767
#define DEFAULT_DOMAIN 32768
#define MAX_VLAPIC_LIST 8
struct vlapic_struct {
struct {
struct outstanding_ipi {
tsc_t first_tsc;
int vec, count;
int injected, valid;
} list[MAX_VLAPIC_LIST];
} outstanding;
};
struct vcpu_data {
int vid;
struct domain_data *d; /* up-pointer */
unsigned activated:1;
int guest_paging_levels;
/* Schedule info */
struct {
int state;
int runnable_state; /* Only valid when state==RUNSTATE_RUNNABLE */
tsc_t tsc;
/* TSC skew detection/correction */
struct last_oldstate_struct {
int wrong, actual, pid;
tsc_t tsc;
} last_oldstate;
/* Performance counters */
unsigned long long p1_start, p2_start;
} runstate;
struct pcpu_info *p;
tsc_t pcpu_tsc;
/* Hardware tracking */
struct {
long long val;
tsc_t start_time;
struct cr3_value_struct *data;
} cr3;
/* IPI latency tracking */
struct vlapic_struct vlapic;
/* Summary info */
struct cycle_framework f;
struct cycle_summary runstates[RUNSTATE_MAX];
struct cycle_summary runnable_states[RUNNABLE_STATE_MAX];
struct cycle_summary cpu_affinity_all,
cpu_affinity_pcpu[MAX_CPUS];
enum {
VCPU_DATA_NONE=0,
VCPU_DATA_HVM,
VCPU_DATA_PV
} data_type;
union {
struct hvm_data hvm;
struct pv_data pv;
};
};
enum {
DOMAIN_RUNSTATE_BLOCKED=0,
DOMAIN_RUNSTATE_PARTIAL_RUN,
DOMAIN_RUNSTATE_FULL_RUN,
DOMAIN_RUNSTATE_PARTIAL_CONTENTION,
DOMAIN_RUNSTATE_CONCURRENCY_HAZARD,
DOMAIN_RUNSTATE_FULL_CONTENTION,
DOMAIN_RUNSTATE_LOST,
DOMAIN_RUNSTATE_MAX
};
char * domain_runstate_name[] = {
[DOMAIN_RUNSTATE_BLOCKED]="blocked",
[DOMAIN_RUNSTATE_PARTIAL_RUN]="partial run",
[DOMAIN_RUNSTATE_FULL_RUN]="full run",
[DOMAIN_RUNSTATE_PARTIAL_CONTENTION]="partial contention",
[DOMAIN_RUNSTATE_CONCURRENCY_HAZARD]="concurrency_hazard",
[DOMAIN_RUNSTATE_FULL_CONTENTION]="full_contention",
[DOMAIN_RUNSTATE_LOST]="lost",
};
enum {
POD_RECLAIM_CONTEXT_UNKNOWN=0,
POD_RECLAIM_CONTEXT_FAULT,
POD_RECLAIM_CONTEXT_BALLOON,
POD_RECLAIM_CONTEXT_MAX
};
char * pod_reclaim_context_name[] = {
[POD_RECLAIM_CONTEXT_UNKNOWN]="unknown",
[POD_RECLAIM_CONTEXT_FAULT]="fault",
[POD_RECLAIM_CONTEXT_BALLOON]="balloon",
};
#define POD_ORDER_MAX 4
struct domain_data {
struct domain_data *next;
int did;
struct vcpu_data *vcpu[MAX_CPUS];
int max_vid;
int runstate;
tsc_t runstate_tsc;
struct cycle_summary total_time;
struct cycle_summary runstates[DOMAIN_RUNSTATE_MAX];
struct cr3_value_struct *cr3_value_head;
struct eip_list_struct *emulate_eip_list;
struct eip_list_struct *interrupt_eip_list;
int guest_interrupt[GUEST_INTERRUPT_MAX+1];
struct hvm_short_summary_struct hvm_short;
struct {
int done[MEM_MAX];
int done_interval[MEM_MAX];
int done_for[MEM_MAX];
int done_for_interval[MEM_MAX];
} memops;
struct {
int reclaim_order[POD_ORDER_MAX];
int reclaim_context[POD_RECLAIM_CONTEXT_MAX];
int reclaim_context_order[POD_RECLAIM_CONTEXT_MAX][POD_ORDER_MAX];
/* FIXME: Do a full cycle summary */
int populate_order[POD_ORDER_MAX];
} pod;
};
struct domain_data * domain_list=NULL;
struct domain_data default_domain;
enum {
TOPLEVEL_GEN=0,
TOPLEVEL_SCHED,
TOPLEVEL_DOM0OP,
TOPLEVEL_HVM,
TOPLEVEL_MEM,
TOPLEVEL_PV,
TOPLEVEL_SHADOW,
TOPLEVEL_HW,
TOPLEVEL_MAX=TOPLEVEL_HW+1,
};
char * toplevel_name[TOPLEVEL_MAX] = {
[TOPLEVEL_GEN]="gen",
[TOPLEVEL_SCHED]="sched",
[TOPLEVEL_DOM0OP]="dom0op",
[TOPLEVEL_HVM]="hvm",
[TOPLEVEL_MEM]="mem",
[TOPLEVEL_PV]="pv",
[TOPLEVEL_SHADOW]="shadow",
[TOPLEVEL_HW]="hw",
};
struct trace_volume {
unsigned long long toplevel[TOPLEVEL_MAX];
unsigned long long sched_verbose;
unsigned long long hvm[HVM_VOL_MAX];
} volume;
#define UPDATE_VOLUME(_p,_x,_s) \
do { \
(_p)->volume.total._x += _s; \
(_p)->volume.last_buffer._x += _s; \
} while(0)
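/*
 * Typical (hypothetical) use, with p a struct pcpu_info pointer:
 *   UPDATE_VOLUME(p, toplevel[TOPLEVEL_HVM], ri->size);
 * which keeps the running total and the per-buffer tally in step.
 */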
void volume_clear(struct trace_volume *vol)
{
bzero(vol, sizeof(*vol));
}
void volume_summary(struct trace_volume *vol)
{
int j, k;
    for(j=0; j<TOPLEVEL_MAX; j++)
        if(vol->toplevel[j]) {
printf(" %-6s: %10lld\n",
toplevel_name[j], vol->toplevel[j]);
switch(j) {
case TOPLEVEL_SCHED:
if(vol->sched_verbose)
printf(" +-verbose: %10lld\n",
vol->sched_verbose);
break;
case TOPLEVEL_HVM:
            for(k=0; k<HVM_VOL_MAX; k++) {
                if(vol->hvm[k])
printf(" +-%-7s: %10lld\n",
hvm_vol_name[k], vol->hvm[k]);
}
break;
}
}
}
struct pcpu_info {
/* Information about this pcpu */
unsigned active:1, summary:1;
int pid;
/* Information related to scanning thru the file */
tsc_t first_tsc, last_tsc, order_tsc;
off_t file_offset;
off_t next_cpu_change_offset;
struct record_info ri;
int last_cpu_change_pid;
int power_state;
/* Information related to tsc skew detection / correction */
struct {
tsc_t offset;
cpu_mask_t downstream; /* To detect cycles in dependencies */
} tsc_skew;
/* Information related to domain tracking */
struct vcpu_data * current;
struct {
unsigned active:1,
domain_valid:1,
seen_valid_schedule:1; /* Seen an actual schedule since lost records */
unsigned did:16,vid:16;
tsc_t tsc;
} lost_record;
/* Record volume */
struct {
tsc_t buffer_first_tsc,
buffer_dom0_runstate_tsc,
buffer_dom0_runstate_cycles[RUNSTATE_MAX];
int buffer_dom0_runstate;
unsigned buffer_size;
struct trace_volume total, last_buffer;
} volume;
/* Time report */
struct {
tsc_t tsc;
struct cycle_summary idle, running, lost;
} time;
};
void __fill_in_record_info(struct pcpu_info *p);
#define INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX 10
struct {
int max_active_pcpu;
off_t last_epoch_offset;
int early_eof;
int lost_cpus;
tsc_t now;
struct cycle_framework f;
tsc_t buffer_trace_virq_tsc;
struct pcpu_info pcpu[MAX_CPUS];
struct {
int id;
/* Invariant: head null => tail null; head !null => tail valid */
struct cr3_value_struct *head, **tail;
} cr3;
struct {
tsc_t start_tsc;
/* Information about specific interval output types */
union {
struct {
struct interval_element ** values;
int count;
} array;
struct {
struct interval_list *head, **tail;
} list;
struct cr3_value_struct *cr3;
struct {
struct domain_data *d;
int guest_vector[INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX];
} domain;
};
} interval;
} P = { 0 };
/* Function prototypes */
char * pcpu_string(int pcpu);
void pcpu_string_draw(struct pcpu_info *p);
void process_generic(struct record_info *ri);
void dump_generic(FILE *f, struct record_info *ri);
ssize_t __read_record(struct trace_record *rec, off_t offset);
void error(enum error_level l, struct record_info *ri);
void update_io_address(struct io_address ** list, unsigned int pa, int dir,
tsc_t arc_cycles, unsigned int va);
int check_extra_words(struct record_info *ri, int expected_size, const char *record);
int vcpu_set_data_type(struct vcpu_data *v, int type);
void cpumask_init(cpu_mask_t *c) {
*c = 0UL;
}
void cpumask_clear(cpu_mask_t *c, int cpu) {
*c &= ~(1UL << cpu);
}
void cpumask_set(cpu_mask_t *c, int cpu) {
*c |= (1UL << cpu);
}
int cpumask_isset(const cpu_mask_t *c, int cpu) {
    if(*c & (1UL<<cpu))
        return 1;
    else
        return 0;
}

/* -- Time code -- */

void cycles_to_time(unsigned long long c, struct time_struct *t) {
    t->time = ((c - P.f.first_tsc) << 10) / opt.cpu_qhz;
    t->s = t->time / 1000000000;
    t->ns = t->time - (t->s * 1000000000);
}
void abs_cycles_to_time(unsigned long long ac, struct time_struct *t) {
if(ac > P.f.first_tsc) {
/* t->time = ((ac - P.f.first_tsc) * 1000) / (opt.cpu_hz / 1000000 ); */
/* t->s = t->time / 1000000000; */
/* t->ns = t->time % 1000000000; */
t->time = ((ac - P.f.first_tsc) << 10) / opt.cpu_qhz;
t->s = t->time / 1000000000;
t->ns = t->time - (t->s * 1000000000);
} else {
t->time = t->s = t->ns = 0;
}
}
tsc_t abs_cycles_to_global(unsigned long long ac) {
if(ac > P.f.first_tsc)
return ac - P.f.first_tsc;
else
return 0;
}
void scatterplot_vs_time(tsc_t atsc, long long y) {
struct time_struct t;
abs_cycles_to_time(atsc, &t);
printf("%u.%09u %lld\n", t.s, t.ns, y);
}
/* -- Summary Code -- */
/* With compliments to "Numerical Recipes in C", which provided the algorithm
* and basic template for this function. */
long long percentile(long long * A, int N, int ple) {
int I, J, L, R, K;
long long X, W;
/* No samples! */
if ( N == 0 )
return 0;
/* Find K, the element # we want */
K=N*ple/100;
/* Set the left and right boundaries of the current search space */
L=0; R=N-1;
while(L < R) {
/* X: The value to order everything higher / lower than */
X=A[K];
/* Starting at the left and the right... */
I=L;
J=R;
        do {
            /* Find the first element on the left that is out-of-order w/ X */
            while(A[I]<X)
                I++;
            /* Find the first element on the right that is out-of-order w/ X */
            while(X<A[J])
                J--;
            /* If we haven't gone past each other, swap and move inward */
            if(I<=J) {
                W=A[I]; A[I]=A[J]; A[J]=W;
                I++; J--;
            }
        } while (I<=J);

        /* Re-adjust L and R, based on which element we're looking for */
        if(J<K)
            L=I;
        if(K<I)
            R=J;
    }

    return A[K];
}

float weighted_percentile(float * A,              /* values */
                          unsigned long long * w, /* weights */
                          int N,                  /* total */
                          int ple)                /* percentile */
{
    int L, R, I, J, K;
    unsigned long long L_weight, R_weight, I_weight, J_weight,
        K_weight, N_weight;
    float X, t1;
    unsigned long long t2;

    /* Calculate total weight */
    N_weight=0;
    for(I=0; I<N; I++)
        N_weight += w[I];

    /* Find K_weight, the target weight we want */
    K_weight = N_weight * ple / 100;

    /* Set the left and right boundaries of the current search space */
    L=0; L_weight=0;
    R=N-1; R_weight=N_weight - w[R];

    /* Search between L and R, narrowing down until we're done */
    while(L < R) {
        /* Choose an ordering value from right in the middle */
        K = (L + R) >> 1;
        /* X: The value to order everything higher / lower than */
        X=A[K];
        /* Starting at the left and the right... */
        I=L; I_weight = L_weight;
        J=R; J_weight = R_weight;
        do {
            /* Find the first element on the left that is out-of-order w/ X */
            while(A[I]<X) {
                I_weight += w[I];
                I++;
            }
            /* Find the first element on the right that is out-of-order w/ X */
            while(X<A[J]) {
                J_weight -= w[J];
                J--;
            }
            /* If we haven't gone past each other, swap and move inward */
            if(I<=J) {
                t1=A[I]; A[I]=A[J]; A[J]=t1;
                t2=w[I]; w[I]=w[J]; w[J]=t2;
                I_weight += w[I];
                J_weight -= w[J];
                I++; J--;
            }
        } while (I<=J);

        /* Re-adjust L and R, based on which weight we're looking for */
        if(J_weight < K_weight) {
            L=I; L_weight=I_weight;
        }
        if(K_weight < I_weight) {
            R=J; R_weight=J_weight;
        }
    }

    return A[L];
}

long long self_weighted_percentile(long long * A, /* values, doubling as weights */
                                   int N,         /* total */
                                   int ple)       /* percentile */
{
    int L, R, I, J, K;
    long long L_weight, R_weight, I_weight, J_weight,
        K_weight, N_weight;
    long long X, t1;

    /* Calculate total weight */
    N_weight=0;
    for(I=0; I<N; I++)
        N_weight += A[I];

    /* Find K_weight, the target weight we want */
    K_weight = N_weight * ple / 100;

    /* Set the left and right boundaries of the current search space */
    L=0; L_weight=0;
    R=N-1; R_weight=N_weight - A[R];

    /* Search between L and R, narrowing down until we're done */
    while(L < R) {
        /* Choose an ordering value from right in the middle */
        K = (L + R) >> 1;
        /* X: The value to order everything higher / lower than */
        X=A[K];
        /* Starting at the left and the right... */
        I=L; I_weight = L_weight;
        J=R; J_weight = R_weight;
        do {
            /* Find the first element on the left that is out-of-order w/ X */
            while(A[I]<X) {
                I_weight += A[I];
                I++;
            }
            /* Find the first element on the right that is out-of-order w/ X */
            while(X<A[J]) {
                J_weight -= A[J];
                J--;
            }
            /* If we haven't gone past each other, swap and move inward */
            if(I<=J) {
                t1=A[I]; A[I]=A[J]; A[J]=t1;
                I_weight += A[I];
                J_weight -= A[J];
                I++; J--;
            }
        } while (I<=J);

        /* Re-adjust L and R, based on which weight we're looking for */
        if(J_weight < K_weight) {
            L=I; L_weight=I_weight;
        }
        if(K_weight < I_weight) {
            R=J; R_weight=J_weight;
        }
    }

    return A[L];
}

static inline double __cycles_percent(long long cycles, long long total) {
    return (double)(cycles*100) / total;
}

static inline double __summary_percent(struct cycle_summary *s,
                                       struct cycle_framework *f) {
    return __cycles_percent(s->cycles, f->total_cycles);
}
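/*
 * Worked example for percentile() above: A = { 9, 1, 5, 3, 7 }, N = 5,
 * ple = 50 gives K = 2; the partitioning narrows [L,R] until A[2] holds
 * the third-smallest value, so the call returns 5 (and leaves A
 * partially ordered around it as a side effect).
 */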
static inline double summary_percent_global(struct cycle_summary *s) {
return __summary_percent(s, &P.f);
}
static inline void update_cycles(struct cycle_summary *s, long long c) {
s->event_count++;
if (!c)
return;
if(opt.sample_size) {
if (s->count >= s->sample_size
&& (s->count == 0
|| opt.sample_max == 0
|| s->sample_size < opt.sample_max)) {
int new_size;
void * new_sample = NULL;
new_size = s->sample_size << 1;
if (new_size == 0)
new_size = opt.sample_size;
if (opt.sample_max != 0 && new_size > opt.sample_max)
new_size = opt.sample_max;
new_sample = realloc(s->sample, sizeof(*s->sample) * new_size);
if (new_sample) {
s->sample = new_sample;
s->sample_size = new_size;
}
}
if (s->count < s->sample_size) {
s->sample[s->count]=c;
} else {
/*
* If we run out of space for samples, start taking only a
* subset of samples.
*/
int lap, index;
lap = (s->count/s->sample_size)+1;
index =s->count % s->sample_size;
if((index - (lap/3))%lap == 0) {
s->sample[index]=c;
}
}
}
s->count++;
s->cycles += c;
s->interval.count++;
s->interval.cycles += c;
}
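/*
 * Worked example of the subsampling above, once sample_size is pinned at
 * opt.sample_max: for count in [size, 2*size) lap is 2 and every second
 * slot is overwritten with a newer value; in [2*size, 3*size) lap is 3
 * and every third slot is; and so on, leaving a thinning cross-section
 * of the whole run rather than just its first sample_size events.
 */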
static inline void clear_interval_cycles(struct interval_element *e) {
e->cycles = 0;
e->count = 0;
e->instructions = 0;
}
static inline void print_cpu_affinity(struct cycle_summary *s, char *p) {
if(s->count) {
long long avg;
avg = s->cycles / s->count;
if ( opt.sample_size ) {
long long p5, p50, p95;
int data_size = s->count;
if(data_size > s->sample_size)
data_size = s->sample_size;
p50 = percentile(s->sample, data_size, 50);
p5 = percentile(s->sample, data_size, 5);
p95 = percentile(s->sample, data_size, 95);
printf("%s: %7d %6lld {%6lld|%6lld|%6lld}\n",
p, s->count, avg, p5, p50, p95);
} else {
printf("%s: %7d %6lld\n",
p, s->count, avg);
}
}
}
static inline void print_cycle_percent_summary(struct cycle_summary *s,
tsc_t total, char *p) {
if(s->count) {
long long avg;
double percent, seconds;
avg = s->cycles / s->count;
seconds = ((double)s->cycles) / opt.cpu_hz;
percent = ((double)(s->cycles * 100)) / total;
if ( opt.sample_size ) {
long long p5, p50, p95;
int data_size = s->count;
if(data_size > s->sample_size)
data_size = s->sample_size;
p50 = self_weighted_percentile(s->sample, data_size, 50);
p5 = self_weighted_percentile(s->sample, data_size, 5);
p95 = self_weighted_percentile(s->sample, data_size, 95);
printf("%s: %7d %5.2lfs %5.2lf%% %6lld {%6lld|%6lld|%6lld}\n",
p, s->count,
seconds,
percent,
avg, p5, p50, p95);
} else {
printf("%s: %7d %5.2lfs %5.2lf%% %6lld\n",
p, s->count,
seconds,
percent,
avg);
}
}
}
static inline void print_cycle_summary(struct cycle_summary *s, char *p) {
if(s->count) {
long long avg;
avg = s->cycles / s->count;
if ( opt.sample_size ) {
long long p5, p50, p95;
int data_size = s->count;
if(data_size > s->sample_size)
data_size = s->sample_size;
p50 = self_weighted_percentile(s->sample, data_size, 50);
p5 = self_weighted_percentile(s->sample, data_size, 5);
p95 = self_weighted_percentile(s->sample, data_size, 95);
printf("%s: %7d %5.2lfs %6lld {%6lld|%6lld|%6lld}\n",
p, s->count, ((double)s->cycles)/opt.cpu_hz,
avg, p5, p50, p95);
} else {
printf("%s: %7d %5.2lfs %6lld\n",
p, s->count, ((double)s->cycles)/opt.cpu_hz, avg);
}
}
}
#define PRINT_SUMMARY(_s, _p...) \
do { \
if((_s).event_count) { \
if ( opt.sample_size ) { \
unsigned long long p5, p50, p95; \
int data_size=(_s).count; \
if(data_size > (_s).sample_size) \
data_size=(_s).sample_size; \
p50=percentile((_s).sample, data_size, 50); \
p5=percentile((_s).sample, data_size, 5); \
p95=percentile((_s).sample, data_size, 95); \
printf(_p); \
printf(" %7d %5.2lfs %5.2lf%% %5lld cyc {%5lld|%5lld|%5lld}\n", \
(_s).event_count, \
((double)(_s).cycles)/opt.cpu_hz, \
summary_percent_global(&(_s)), \
(_s).count ? (_s).cycles / (_s).count:0, \
p5, p50, p95); \
} else { \
printf(_p); \
printf(" %7d %5.2lfs %5.2lf%% %5lld cyc\n", \
(_s).event_count, \
((double)(_s).cycles)/opt.cpu_hz, \
summary_percent_global(&(_s)), \
(_s).count ? (_s).cycles / (_s).count:0); \
} \
} \
} while(0)
#define INTERVAL_DESC_MAX 31
struct interval_list {
struct interval_element *elem;
struct interval_list *next;
char desc[INTERVAL_DESC_MAX+1]; /* +1 for the null terminator */
};
void __interval_cycle_percent_output(struct interval_element *e, tsc_t cycles) {
printf(" %.02lf",
__cycles_percent(e->cycles, cycles));
clear_interval_cycles(e);
}
void interval_cycle_percent_output(struct interval_element *e) {
__interval_cycle_percent_output(e, opt.interval.cycles);
}
void interval_time_output(void) {
struct time_struct t;
abs_cycles_to_time(P.interval.start_tsc, &t);
printf("%u.%09u", t.s, t.ns);
}
void interval_table_output(void) {
int i;
interval_time_output();
if(opt.interval.mode == INTERVAL_MODE_ARRAY) {
        for(i=0; i<P.interval.array.count; i++) {
            struct interval_element *e = P.interval.array.values[i];
            if(e)
                interval_cycle_percent_output(e);
            else
                printf(" %.02lf", 0.0);
        }
    } else if(opt.interval.mode == INTERVAL_MODE_LIST) {
        struct interval_list *p;
        for(p=P.interval.list.head; p; p = p->next)
            interval_cycle_percent_output(p->elem);
    }
printf("\n");
}
void interval_table_tail(void) {
struct interval_list *p;
printf("time");
for(p=P.interval.list.head; p; p = p->next)
printf(" %s", p->desc);
printf("\n");
}
void interval_table_alloc(int count) {
P.interval.array.count = count;
P.interval.array.values = malloc(count * sizeof(struct interval_list *));
if(!P.interval.array.values) {
fprintf(stderr, "Malloc failed!\n");
error(ERR_SYSTEM, NULL);
}
bzero(P.interval.array.values, count*sizeof(struct interval_list *));
}
void interval_list_add(struct interval_element *e, char *desc) {
struct interval_list *p;
fprintf(warn, "%s: Adding element '%s'\n", __func__, desc);
if((p=malloc(sizeof(*p)))==NULL) {
fprintf(stderr, "malloc() failed.\n");
error(ERR_SYSTEM, NULL);
}
bzero(p, sizeof(*p));
p->elem = e;
strncpy(p->desc, desc, INTERVAL_DESC_MAX);
p->next=NULL;
if(P.interval.list.head)
*P.interval.list.tail = p;
else
P.interval.list.head = p;
P.interval.list.tail = &p->next;
}
void interval_cr3_schedule_time_header(void) {
if( opt.interval.mode == INTERVAL_MODE_ARRAY ) {
int i;
printf("time");
        for(i=0; i<opt.interval.array.count; i++)
            printf(" %llx", opt.interval.array.values[i]);
        printf("\n");
    }
}

void interval_cr3_value_check(struct cr3_value_struct *cr3) {
    if( opt.interval.mode == INTERVAL_MODE_ARRAY ) {
        int i;

        for(i=0; i<opt.interval.array.count; i++) {
            if(cr3->gmfn == opt.interval.array.values[i]) {
if(P.interval.array.values[i]) {
fprintf(stderr, "Fatal: duplicate cr3 value %llx!\n",
cr3->gmfn);
error(ERR_ASSERT, NULL);
}
fprintf(stderr, "%s: found gmfn %llx\n",
__func__, cr3->gmfn);
P.interval.array.values[i] = &cr3->total_time.interval;
}
}
} else if(opt.interval.mode == INTERVAL_MODE_LIST) {
char desc[32];
snprintf(desc, 32, "%llx", cr3->gmfn);
interval_list_add(&cr3->total_time.interval, desc);
} else {
/* Custom */
if(cr3->gmfn == opt.interval.array.values[0])
P.interval.cr3 = cr3;
}
}
int cr3_time_compare(const void *_a, const void *_b) {
struct cr3_value_struct *a=*(typeof(&a))_a;
struct cr3_value_struct *b=*(typeof(&a))_b;
if(a->total_time.interval.cycles < b->total_time.interval.cycles)
return 1;
else if(b->total_time.interval.cycles == a->total_time.interval.cycles) {
if(a->total_time.interval.count < b->total_time.interval.count)
return 1;
else if(a->total_time.interval.count == b->total_time.interval.count)
return 0;
else
return -1;
} else
return -1;
}
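/* Note the inverted returns: fed to qsort(), this comparator sorts
 * descending, by interval cycles first with count as the tie-breaker. */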
void interval_cr3_schedule_ordered_output(void) {
struct cr3_value_struct *p;
int i;
struct cr3_value_struct **qsort_array;
int N=0;
for(p=P.cr3.head; p; p=p->gnext)
N++;
if(!N)
return;
    qsort_array = malloc(N * sizeof(struct cr3_value_struct *));
for(i=0, p=P.cr3.head; p; p=p->gnext, i++)
qsort_array[i]=p;
    qsort(qsort_array, N, sizeof(struct cr3_value_struct *),
cr3_time_compare);
interval_time_output();
    for(i=0; i<N; i++) {
        p = qsort_array[i];
        if(p->total_time.interval.cycles > 0) {
printf(" %8llx: %.02lf %c\n",
p->gmfn,
__cycles_percent(p->total_time.interval.cycles,
opt.interval.cycles),
(p->first_time > P.interval.start_tsc)?'*':' ');
}
clear_interval_cycles(&p->total_time.interval);
}
free(qsort_array);
}
void interval_cr3_short_summary_header(void) {
int i;
printf("time guest");
    for(i=0; i<HVM_SHORT_SUMMARY_MAX; i++)
        printf(" %s", hvm_short_summary_name[i]);
    printf("\n");
}

void interval_cr3_short_summary_output(void) {
    if(P.interval.cr3) {
        struct cycle_summary *hss_array;
        struct cr3_value_struct *p = P.interval.cr3;
        int i;

        interval_time_output();
        hss_array = p->hvm.s;
printf(" %.02lf",
__cycles_percent(p->total_time.interval.cycles,
opt.interval.cycles));
        for(i=0; i<HVM_SHORT_SUMMARY_MAX; i++)
            __interval_cycle_percent_output(&hss_array[i].interval,
                                            p->total_time.interval.cycles);
clear_interval_cycles(&p->total_time.interval);
printf("\n");
}
}
void interval_domain_value_check(struct domain_data *d) {
if( opt.interval.mode == INTERVAL_MODE_ARRAY ) {
int i;
        for(i=0; i<opt.interval.array.count; i++) {
            if(d->did == opt.interval.array.values[i]) {
if(P.interval.array.values[i]) {
fprintf(stderr, "Fatal: duplicate domain value %d!\n",
d->did);
error(ERR_ASSERT, NULL);
}
P.interval.array.values[i] = &d->total_time.interval;
}
}
} else if(opt.interval.mode == INTERVAL_MODE_LIST) {
char desc[32];
snprintf(desc, 32, "%d", d->did);
interval_list_add(&d->total_time.interval, desc);
} else {
if(d->did == opt.interval.array.values[0])
P.interval.domain.d = d;
}
}
void interval_domain_short_summary_header(void) {
int i;
printf("time running");
    for(i=0; i<HVM_SHORT_SUMMARY_MAX; i++)
        printf(" %s", hvm_short_summary_name[i]);
    printf("\n");
}

void interval_domain_short_summary_output(void) {
    if(P.interval.domain.d) {
        struct domain_data *d;
        int i;

        d=P.interval.domain.d;

        interval_time_output();
        interval_cycle_percent_output(&d->total_time.interval);
        for(i=0; i<HVM_SHORT_SUMMARY_MAX; i++)
            interval_cycle_percent_output(&d->hvm_short.s[i].interval);
printf("\n");
}
}
void interval_domain_guest_interrupt(struct hvm_data *h, int vector) {
struct domain_data *d = h->v->d;
int i;
/* Check to see if this vector is in the "print list" */
    for(i=0; i<INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX; i++) {
        if(P.interval.domain.guest_vector[i] == vector)
            break;
        if(P.interval.domain.guest_vector[i] == 0) {
            P.interval.domain.guest_vector[i] = vector;
            break;
        }
    }

    /* If it is (or there was room to add it), count it for this domain */
    if(i < INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX) {
        d->guest_interrupt[vector]++;
}
}
void interval_domain_guest_interrupt_tail(void) {
int i;
printf("time running");
    for(i=0; i<INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX; i++) {
        if(P.interval.domain.guest_vector[i] == 0)
            break;
        printf(" %d", P.interval.domain.guest_vector[i]);
    }
    printf("\n");
}

void interval_domain_guest_interrupt_output(void) {
    if(P.interval.domain.d) {
        struct domain_data *d;
        int i;

        d=P.interval.domain.d;

        interval_time_output();

        for(i=0; i<INTERVAL_DOMAIN_GUEST_INTERRUPT_MAX; i++) {
            int v = P.interval.domain.guest_vector[i];
            if(v == 0)
                break;
            printf(" %d", d->guest_interrupt[v]);
d->guest_interrupt[v]=0;
}
printf("\n");
}
}
void interval_domain_grant_maps_output(void) {
if(P.interval.domain.d) {
struct domain_data *d;
d=P.interval.domain.d;
interval_time_output();
printf(" %d", d->memops.done_for_interval[MEM_PAGE_GRANT_MAP]);
d->memops.done_for_interval[MEM_PAGE_GRANT_MAP] = 0;
printf("\n");
}
}
/* General interval gateways */
void interval_callback(void) {
/* First, see if we're in generic mode. */
switch(opt.interval.mode) {
case INTERVAL_MODE_LIST:
case INTERVAL_MODE_ARRAY:
interval_table_output();
return;
default:
break;
}
switch(opt.interval.output) {
case INTERVAL_CR3_SCHEDULE_ORDERED:
interval_cr3_schedule_ordered_output();
break;
case INTERVAL_CR3_SHORT_SUMMARY:
interval_cr3_short_summary_output();
break;
case INTERVAL_DOMAIN_SHORT_SUMMARY:
interval_domain_short_summary_output();
break;
case INTERVAL_DOMAIN_GUEST_INTERRUPT:
interval_domain_guest_interrupt_output();
break;
case INTERVAL_DOMAIN_GRANT_MAPS:
interval_domain_grant_maps_output();
break;
default:
break;
}
}
void interval_header(void) {
switch(opt.interval.output) {
case INTERVAL_CR3_SHORT_SUMMARY:
interval_cr3_short_summary_header();
break;
case INTERVAL_DOMAIN_SHORT_SUMMARY:
interval_domain_short_summary_header();
break;
default:
break;
}
}
void interval_tail(void) {
if(opt.interval.mode == INTERVAL_MODE_LIST) {
interval_table_tail();
return;
}
switch(opt.interval.output) {
case INTERVAL_DOMAIN_GUEST_INTERRUPT:
interval_domain_guest_interrupt_tail();
break;
default:
break;
}
}
/* -- Eip list data -- */
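/* Insert or update an eip in an address-ordered singly-linked list,
 * accumulating a cycle summary per eip. The per-type new/update
 * callbacks let callers hang extra data off each node. */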
void update_eip(struct eip_list_struct **head, unsigned long long eip,
unsigned long long cycles, int type, void * extra) {
struct eip_list_struct *p, **last=head;
for(p=*head; p; last = (&p->next), p=p->next)
if(p->eip >= eip)
break;
if(!p || p->eip != eip) {
p=malloc(sizeof(*p));
if(!p) {
perror("malloc failed");
error(ERR_SYSTEM, NULL);
}
bzero(p, sizeof(*p));
p->eip=eip;
p->type = type;
if(eip_list_type[type].new) {
eip_list_type[type].new(p, extra);
}
p->next = *last;
*last=p;
} else if(p->type != type) {
fprintf(stderr, "WARNING, mixed types! %d %d\n", p->type, type);
} else if(eip_list_type[type].update) {
eip_list_type[type].update(p, extra);
}
update_cycles(&p->summary, cycles);
}
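/* Note the qsort comparators below are inverted (returning 1 when a is
 * smaller) so the resulting arrays come out in decreasing order of
 * cycles, with count as the tie-breaker. */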
int eip_compare(const void *_a, const void *_b) {
struct eip_list_struct *a=*(typeof(&a))_a;
struct eip_list_struct *b=*(typeof(&a))_b;
if(a->summary.cycles < b->summary.cycles)
return 1;
else if(b->summary.cycles == a->summary.cycles) {
if(a->summary.count < b->summary.count)
return 1;
else if(a->summary.count == b->summary.count)
return 0;
else
return -1;
} else
return -1;
}
void dump_eip(struct eip_list_struct *head) {
struct eip_list_struct *p;
int i;
int total = 0;
struct eip_list_struct **qsort_array;
int N=0;
for(p=head; p; p=p->next)
{
total += p->summary.count;
N++;
}
if(!N)
return;
qsort_array = malloc(N * sizeof(struct eip_list_struct *));
for(i=0, p=head; p; p=p->next, i++)
qsort_array[i]=p;
qsort(qsort_array, N, sizeof(struct eip_list_struct *),
eip_compare);
/* WARNING: don't use N after this point unless you copy this variable */
#if 0
if(opt.summary_eip_limit && opt.summary_eip_limit < N)
N=opt.summary_eip_limit;
#endif
printf(" Total samples: %d\n", total);
for(i=0; i<N; i++) {
p = qsort_array[i];
if ( p->summary.cycles )
PRINT_SUMMARY(p->summary, " %12llx%-45s: ",
p->eip,
find_symbol(p->eip));
else
{
printf(" %12llx%-45s: ",
p->eip,
find_symbol(p->eip));
printf(" %7d %5.2lf%%\n",
p->summary.count,
((double)p->summary.count*100)/total);
}
if(eip_list_type[p->type].dump) {
eip_list_type[p->type].dump(p);
}
}
free(qsort_array);
}
/* -- HVM code -- */
struct hvm_pf_xen_record {
//unsigned vcpu:16, domain:16;
union {
struct {
unsigned long long va;
unsigned int error_code;
} x64;
struct {
unsigned int va;
unsigned int error_code;
} x32;
};
};
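/* Attribute this exit's cycles to a short-summary bucket, both for the
 * currently-loaded cr3 (if tracked) and for the domain as a whole. */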
void hvm_update_short_summary(struct hvm_data *h, int element) {
struct vcpu_data *v = h->v;
if(v->cr3.data)
update_cycles(&v->cr3.data->hvm.s[element], h->arc_cycles);
update_cycles(&v->d->hvm_short.s[element], h->arc_cycles);
h->short_summary_done=1;
}
void hvm_short_summary(struct hvm_short_summary_struct *hss,
tsc_t total, char *prefix) {
char desc[80];
int i;
for(i=0; i<HVM_SHORT_SUMMARY_MAX; i++) {
snprintf(desc, 80, "%s%s", prefix, hvm_short_summary_name[i]);
print_cycle_percent_summary(hss->s + i, total, desc);
}
}
/* Wrapper to try to make sure this is only called once per
* call site, rather than walking through the list each time */
#define hvm_set_summary_handler(_h, _s, _d) \
do { \
static int done=0; \
int ret; \
if(!done) { \
if ((ret=__hvm_set_summary_handler(_h, _s, _d))) \
fprintf(stderr, "%s: hvm_set_summary_handler returned %d\n", \
__func__, ret); \
done=1; \
} \
} while(0)
int __hvm_set_summary_handler(struct hvm_data *h, void (*s)(struct hvm_data *h, void*d), void*d) {
/* Set summary handler */
if(h->exit_reason < h->exit_reason_max)
{
struct hvm_summary_handler_node *p, **q;
/* Find the end of the list, checking to make sure there are no
* duplicates along the way */
q=&h->exit_reason_summary_handler_list[h->exit_reason];
p = *q;
while(p)
{
if(p->handler == s && p->data == d)
{
fprintf(stderr, "%s: Unexpected duplicate handler %p,%p\n",
__func__, s, d);
error(ERR_STRICT, NULL);
return -EBUSY;
}
q=&p->next;
p=*q;
}
assert(p==NULL);
/* Insert the new handler */
p=malloc(sizeof(*p));
if (!p) {
fprintf(stderr, "%s: Malloc failed!\n", __func__);
error(ERR_SYSTEM, NULL);
}
p->handler=s;
p->data = d;
p->next=*q;
*q=p;
return 0;
}
return -EINVAL;
}
void hvm_generic_postprocess(struct hvm_data *h);
static int hvm_set_postprocess(struct hvm_data *h, void (*s)(struct hvm_data *h))
{
if ( h->post_process == NULL
|| h->post_process == hvm_generic_postprocess )
{
h->post_process = s;
return 0;
}
else
return 1;
}
#define SIGN_EXTENDED_BITS (~((1ULL<<48)-1))
#define HIGH_BIT(_v) ((_v) & (1ULL<<47))
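/* On x86-64 a virtual address is canonical only if bits 63:48 are
 * copies of bit 47; anything else indicates a corrupt or misparsed
 * record. */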
static inline int is_valid_addr64(unsigned long long va)
{
if(HIGH_BIT(va))
return ((va & SIGN_EXTENDED_BITS) == SIGN_EXTENDED_BITS);
else
return ((va & SIGN_EXTENDED_BITS) == 0);
}
void hvm_pf_xen_summary(struct hvm_data *h, void *d) {
int i,j, k;
printf(" page_fault\n");
for(i=0; i<PF_XEN_MAX; i++)
{
if ( pf_xen_name[i] )
{
PRINT_SUMMARY(h->summary.pf_xen[i],
" %-25s ", pf_xen_name[i]);
}
else
{
PRINT_SUMMARY(h->summary.pf_xen[i],
" [%23d] ", i);
}
switch(i){
case PF_XEN_NON_EMULATE:
for(j=0; j<PF_XEN_NON_EMUL_MAX; j++)
PRINT_SUMMARY(h->summary.pf_xen_non_emul[j],
" *%-13s ", pf_xen_non_emul_name[j]);
break;
case PF_XEN_EMULATE:
for(j=0; j<PF_XEN_EMUL_MAX; j++) {
PRINT_SUMMARY(h->summary.pf_xen_emul[j],
" *%-13s ", pf_xen_emul_name[j]);
if(j == PF_XEN_EMUL_EARLY_UNSHADOW) {
int k;
for(k=0; k<5; k++) {
PRINT_SUMMARY(h->summary.pf_xen_emul_early_unshadow[k],
" +[%d] ", k);
}
}
}
break;
case PF_XEN_FIXUP:
for(j=0; j<PF_XEN_FIXUP_MAX; j++) {
PRINT_SUMMARY(h->summary.pf_xen_fixup[j],
" *%-13s ", pf_xen_fixup_name[j]);
if(j == PF_XEN_FIXUP_UNSYNC ) {
for(k=0; k<PF_XEN_FIXUP_UNSYNC_RESYNC_MAX; k++) {
PRINT_SUMMARY(h->summary.pf_xen_fixup_unsync_resync[k],
" +[%3d] ", k);
}
PRINT_SUMMARY(h->summary.pf_xen_fixup_unsync_resync[k],
" +[max] ");
}
}
break;
}
}
}
void pf_preprocess(struct pf_xen_extra *e, int guest_paging_levels)
{
switch(guest_paging_levels) {
/* Select a subfield of _bits bits starting at bit _shift from _x */
#define _SUBFIELD(_bits, _shift, _x) \
(((_x)>>(_shift)) & ((1ULL<<(_bits))-1))
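/* e.g. with 4-level paging a 48-bit va decomposes into four 9-bit
 * table indexes (bits 47:39, 38:30, 29:21, 20:12) above the 12-bit
 * page offset. */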
case 4:
/* Verify sign-extension */
if((HIGH_BIT(e->va)
&&((e->va & SIGN_EXTENDED_BITS) != SIGN_EXTENDED_BITS))
|| (!HIGH_BIT(e->va)
&& ((e->va & SIGN_EXTENDED_BITS) != 0))) {
fprintf(warn, "Strange, va %llx not properly sign extended for 4-level pagetables\n",
e->va);
}
e->pt_index[4]=_SUBFIELD(9,39,e->va);
e->pt_index[3]=_SUBFIELD(9,30,e->va);
e->pt_index[2]=_SUBFIELD(9,21,e->va);
e->pt_index[1]=_SUBFIELD(9,12,e->va);
/* These are only useful for the linear-pagetable code */
e->pt_index[0]=_SUBFIELD(9,3,e->va);
if(e->va & 0x4)
e->pt_is_lo=0;
break;
case 3:
e->pt_index[3]=_SUBFIELD(2,30,e->va);
e->pt_index[2]=_SUBFIELD(9,21,e->va);
e->pt_index[1]=_SUBFIELD(9,12,e->va);
/* These are only useful for the linear-pagetable code */
e->pt_index[0]=_SUBFIELD(9,3,e->va);
if(e->va & 0x4)
e->pt_is_lo=0;
break;
case 2:
e->pt_index[2]=_SUBFIELD(10,22,e->va);
e->pt_index[1]=_SUBFIELD(10,12,e->va);
/* This is only useful for the linear pagetable code */
e->pt_index[0]=_SUBFIELD(10,2,e->va);
break;
case 0:
break;
default:
fprintf(warn, "Don't know how to handle %d-level pagetables\n",
guest_paging_levels);
}
e->corresponding_va = CORR_VA_INVALID;
e->pt_level = 0;
/* Detect accesses to Windows linear pagetables */
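/* The magic indexes below are pagetable self-map slots: pde slot 768
 * (va 0xc0000000) for 2-level guests, and PML4 slot 0x1ed (the
 * 0xfffff68000000000 self-map region Windows uses) for 4-level guests.
 * A fault va inside the self-map is really an access to a pagetable
 * page at some level; corresponding_va reconstructs the va it maps. */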
switch(guest_paging_levels)
{
case 2:
if(e->pt_index[2] == 768) {
if(e->pt_index[1] == 768)
{
e->pt_level = 2;
e->corresponding_va=((1UL<<22)-1)
| e->pt_index[0]<<22;
}
else
{
e->pt_level = 1;
e->corresponding_va = ((1UL<<12)-1)
| e->pt_index[1]<<22
| e->pt_index[0]<<12;
}
}
break;
case 3:
if(e->pt_index[3]==3 && (e->pt_index[2]>>2==0))
{
if(e->pt_index[2]==3 && e->pt_index[1]>>2==0)
{
if(e->pt_index[1] == 3 && e->pt_index[0]>>2==0)
{
e->pt_level = 3;
e->corresponding_va=((1UL<<30)-1)
| e->pt_index[0]<<30;
}
else
{
e->pt_level = 2;
e->corresponding_va=((1UL<<21)-1)
| e->pt_index[1]<<30
| e->pt_index[2]<<21;
}
}
else
{
e->pt_level = 1;
e->corresponding_va = ((1UL<<12)-1)
| e->pt_index[0]<<12
| e->pt_index[1]<<21
| e->pt_index[2]<<30;
}
}
break;
case 4:
if(e->pt_index[4] == 0x1ed)
{
if(e->pt_index[3] == 0x1ed)
{
if(e->pt_index[2] == 0x1ed)
{
if(e->pt_index[1] == 0x1ed)
{
e->pt_level = 4;
e->corresponding_va = ((1ULL<<39)-1)
| (unsigned long long)e->pt_index[0]<<39;
}
else
{
e->pt_level = 3;
e->corresponding_va = ((1ULL<<30)-1)
| (unsigned long long)e->pt_index[0]<<30
| (unsigned long long)e->pt_index[1]<<39;
}
}
else
{
e->pt_level = 2;
e->corresponding_va = ((1ULL<<21)-1)
| (unsigned long long)e->pt_index[0]<<21
| (unsigned long long)e->pt_index[1]<<30
| (unsigned long long)e->pt_index[2]<<39;
}
}
else
{
e->pt_level = 1;
e->corresponding_va = ((1ULL<<12)-1)
| (unsigned long long)e->pt_index[0]<<12
| (unsigned long long)e->pt_index[1]<<21
| (unsigned long long)e->pt_index[2]<<30
| (unsigned long long)e->pt_index[3]<<39;
}
if(HIGH_BIT(e->corresponding_va))
e->corresponding_va |= SIGN_EXTENDED_BITS;
}
break;
default:
break;
}
}
void hvm_pf_xen_preprocess(unsigned event, struct hvm_data *h) {
struct pf_xen_extra *e = &h->inflight.pf_xen;
struct mmio_info *m = &h->inflight.mmio;
struct hvm_pf_xen_record *r = (typeof(r))h->d;
if(event == TRC_HVM_PF_XEN64)
{
if(!is_valid_addr64(r->x64.va))
fprintf(warn, "%s: invalid va %llx",
__func__, r->x64.va);
e->va = r->x64.va;
e->error_code = r->x64.error_code;
}
else
{
e->va = r->x32.va;
e->error_code = r->x32.error_code;
}
if(m->data_valid)
e->pf_case = PF_XEN_MMIO;
else
{
pf_preprocess(e, h->v->guest_paging_levels);
/* On rio traces, we try to infer emulation by looking for accesses
in the linear pagetable */
if(e->pt_level > 0)
e->pf_case = PF_XEN_EMULATE;
else
e->pf_case = PF_XEN_NON_EMULATE;
}
}
static inline int is_kernel(int paging_levels, unsigned long long va) {
switch(paging_levels) {
case 2:
case 3:
if(va & 0x80000000)
return 1;
else
return 0;
break;
case 4:
if(HIGH_BIT(va))
return 1;
else return 0;
default:
return 0;
}
}
void hvm_pf_xen_postprocess(struct hvm_data *h) {
struct pf_xen_extra *e = &h->inflight.pf_xen;
if(opt.summary_info) {
if(e->pf_case)
update_cycles(&h->summary.pf_xen[e->pf_case],
h->arc_cycles);
else
fprintf(warn, "Strange, pf_case 0!\n");
switch(e->pf_case)
{
case PF_XEN_EMULATE:
update_eip(&h->v->d->emulate_eip_list,
h->rip,
h->arc_cycles,
0, NULL);
break;
case PF_XEN_NON_EMULATE:
if(is_kernel(h->v->guest_paging_levels, h->rip))
update_cycles(&h->summary.pf_xen_non_emul[PF_XEN_NON_EMUL_EIP_KERNEL],
h->arc_cycles);
else
update_cycles(&h->summary.pf_xen_non_emul[PF_XEN_NON_EMUL_EIP_USER],
h->arc_cycles);
if(is_kernel(h->v->guest_paging_levels, e->va))
update_cycles(&h->summary.pf_xen_non_emul[PF_XEN_NON_EMUL_VA_KERNEL],
h->arc_cycles);
else
update_cycles(&h->summary.pf_xen_non_emul[PF_XEN_NON_EMUL_VA_USER],
h->arc_cycles);
}
/* Set summary handler */
hvm_set_summary_handler(h, hvm_pf_xen_summary, NULL);
}
}
void hvm_pf_xen_process(struct record_info *ri, struct hvm_data *h) {
struct pf_xen_extra *e = &h->inflight.pf_xen;
if(ri->event == TRC_HVM_PF_XEN64
&& h->v->guest_paging_levels != 4)
fprintf(warn, "Strange, PF_XEN64 but guest_paging_levels %d!\n",
h->v->guest_paging_levels);
else if(ri->event == TRC_HVM_PF_XEN
&& h->v->guest_paging_levels == 4)
fprintf(warn, "Strange, PF_XEN but guest_paging_levels %d!\n",
h->v->guest_paging_levels);
hvm_pf_xen_preprocess(ri->event, h);
if(opt.dump_all)
{
if(e->pf_case == PF_XEN_EMULATE)
printf("]%s pf_xen:emulate va %llx ec %x level %d corr %llx e->pt_index[%d %d %d %d %d]\n",
ri->dump_header, e->va, e->error_code,
e->pt_level, e->corresponding_va,
e->pt_index[0], e->pt_index[1], e->pt_index[2],
e->pt_index[3],
e->pt_index[4]);
else
printf("]%s pf_xen va %llx ec %x e->pt_index[%d %d %d %d %d]\n",
ri->dump_header, e->va, e->error_code,
e->pt_index[0], e->pt_index[1], e->pt_index[2],
e->pt_index[3],
e->pt_index[4]);
}
if ( hvm_set_postprocess(h, hvm_pf_xen_postprocess) )
fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
}
char * hvm_vlapic_icr_dest_shorthand_name[4] = {
"dest_field", "self", "all-inc", "all-exc"
};
void hvm_vlapic_vmentry_cleanup(struct vcpu_data *v, tsc_t tsc)
{
int i;
struct vlapic_struct *vla = &v->vlapic;
for(i=0; i<MAX_VLAPIC_LIST; i++) {
unsigned long long lat=0;
struct outstanding_ipi *o = vla->outstanding.list + i;
if(!(o->valid && o->injected))
continue;
if(tsc >= o->first_tsc)
lat = tsc - o->first_tsc;
else
fprintf(warn, "Strange, vec %d first_tsc %lld > ri->tsc %lld!\n",
o->vec, o->first_tsc, tsc);
if(opt.dump_ipi_latency
|| (opt.dump_all && o->count > 1)) {
struct time_struct t;
cycles_to_time(lat, &t);
printf(" [vla] d%dv%d vec %d ipis %d, latency %lld (%u.%09u s)\n",
v->d->did, v->vid, o->vec, o->count, lat,
t.s, t.ns);
}
#if 0
/* FIXME: make general somehow */
if(opt.summary_info)
{
update_cycles(&h->summary.ipi_latency, lat);
h->summary.ipi_count[vla->outstanding_ipis]++;
}
#endif
o->vec = o->count = o->injected = o->valid = o->first_tsc = 0;
}
}
void hvm_vlapic_clear(struct vlapic_struct *vla)
{
bzero(vla, sizeof(*vla));
}
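/* Find the tracking slot for this vector, or claim the first free one;
 * returns NULL if all slots are busy. */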
struct outstanding_ipi *find_vec(struct vlapic_struct *vla, int vec)
{
struct outstanding_ipi *o = NULL;
int i;
/* Find the entry for this vector, or the first empty one. */
for(i=0; i<MAX_VLAPIC_LIST; i++) {
if(vla->outstanding.list[i].valid && vla->outstanding.list[i].vec == vec)
{
o = vla->outstanding.list + i;
break;
} else if(!vla->outstanding.list[i].valid && !o)
o = vla->outstanding.list + i;
}
if(o && !o->valid) {
o->vec = vec;
o->valid = 1;
}
return o;
}
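/* Count an IPI sent to vcpu ov: stamp the first send time so vmentry
 * cleanup can report wake latency, and count repeat sends of the same
 * vector. */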
void ipi_send(struct vcpu_data *ov, int vec)
{
struct vlapic_struct *vla;
struct outstanding_ipi *o = NULL;
if(ov->runstate.state == RUNSTATE_LOST) {
if(opt.dump_all)
fprintf(warn, "%s: v%d in state RUNSTATE_LOST, not counting ipi\n",
__func__, ov->vid);
return;
}
vla = &ov->vlapic;
o = find_vec(vla, vec);
if(!o)
{
fprintf(warn, "%s: Couldn't find an open slot!\n",
__func__);
return;
}
if(!o->first_tsc)
o->first_tsc = P.now;
if(opt.dump_all && o->count == 0 && o->injected)
printf(" [vla] Pre-injection\n");
o->count++;
if((opt.dump_all)
#if 0
&& (ov->runstate.state != RUNSTATE_RUNNING
|| ov->hvm.vmexit_valid)
#endif
)
printf(" [vla] d%dv%d vec %d state %s (outstanding ipis %d)\n",
ov->d->did, ov->vid,
o->vec,
runstate_name[ov->runstate.state],
o->count);
}
void hvm_vlapic_icr_handler(struct hvm_data *h)
{
struct mmio_info *m = &h->inflight.mmio;
union {
unsigned int val;
struct {
unsigned vec:8,
delivery_mode:3,
dest_mode:1,
delivery_status:1,
_res1:1,
level:1,
trigger:1,
_res2:2,
dest_shorthand:2;
};
} icr = { .val = m->data };
if(m->is_write) {
if(opt.dump_all) {
printf(" [vla] d%dv%d icr vec %d %s\n",
h->v->d->did, h->v->vid,
icr.vec,
hvm_vlapic_icr_dest_shorthand_name[icr.dest_shorthand]);
}
if(icr.dest_shorthand == 3)
{
struct vcpu_data *ov, *v = h->v;
struct domain_data *d = v->d;
int i;
for(i=0; i<MAX_CPUS; i++) {
ov = d->vcpu[i];
if(!ov || ov == v)
continue;
ipi_send(ov, icr.vec);
}
} else if(icr.dest_shorthand != 1) {
#if 0
fprintf(warn, "Strange, vlapic icr %s vec %d!\n",
hvm_vlapic_icr_dest_shorthand_name[icr.dest_shorthand],
icr.vec);
#endif
}
} else {
/* Read */
if(opt.dump_all) {
printf(" [vla] d%dv%d icr status %s\n",
h->v->d->did, h->v->vid,
icr.delivery_status?"pending":"idle");
}
}
}
void hvm_vlapic_inject(struct vcpu_data *v, int vec)
{
struct vlapic_struct *vla = &v->vlapic;
struct outstanding_ipi *o = NULL;
o = find_vec(vla, vec);
if(o) {
if(opt.dump_all)
printf(" [vla] d%dv%d vec %d injecting\n",
v->d->did, v->vid, vec);
o->injected=1;
} else {
fprintf(stderr, "%s: Couldn't find an open ipi slot!\n",
__func__);
}
}
void hvm_vlapic_eoi_handler(struct hvm_data *h) {
if(opt.dump_all)
printf(" [vla] d%dv%d eoi\n",
h->v->d->did, h->v->vid);
}
void hvm_vlapic_handler(struct hvm_data *h)
{
struct mmio_info *m = &h->inflight.mmio;
switch(m->gpa) {
case 0xfee00300:
hvm_vlapic_icr_handler(h);
break;
case 0xfee000b0:
hvm_vlapic_eoi_handler(h);
break;
}
}
/* Also called by shadow_mmio_postprocess */
#define MMIO_VGA_START (0xa0000)
#define MMIO_VGA_END (0xbffff)
void enumerate_mmio(struct hvm_data *h)
{
struct mmio_info *m = &h->inflight.mmio;
/* Skip vga area */
if ( opt.mmio_enumeration_skip_vga
&& m->gpa >= MMIO_VGA_START
&& m->gpa < MMIO_VGA_END)
{
warn_once("WARNING: Not enumerationg MMIO in VGA range. Use --mmio-enumeration-skip-vga=0 to override.\n");
return;
}
if ( m->data_valid )
update_io_address(&h->summary.io.mmio, m->gpa, m->is_write, h->arc_cycles, m->va);
}
void hvm_mmio_summary(struct hvm_data *h, void *data)
{
long reason=(long)data;
PRINT_SUMMARY(h->summary.mmio[reason],
" mmio ");
}
void hvm_mmio_assist_postprocess(struct hvm_data *h)
{
long reason;
switch(h->exit_reason)
{
case VMEXIT_NPF:
case EXIT_REASON_EPT_VIOLATION:
reason=NONPF_MMIO_NPF;
hvm_set_summary_handler(h, hvm_mmio_summary, (void *)reason);
break;
case EXIT_REASON_APIC_ACCESS:
reason=NONPF_MMIO_APIC;
hvm_set_summary_handler(h, hvm_mmio_summary, (void *)reason);
break;
default:
{
static int warned = 0;
if (!warned)
{
fprintf(warn, "%s: Strange, MMIO with unexpected exit reason %d\n",
__func__, h->exit_reason);
warned=1;
}
reason=NONPF_MMIO_UNKNOWN;
hvm_set_summary_handler(h, hvm_mmio_summary, (void *)reason);
break;
}
}
if(opt.summary_info)
{
update_cycles(&h->summary.mmio[reason],
h->arc_cycles);
}
if ( opt.with_mmio_enumeration )
enumerate_mmio(h);
}
#define HVM_IO_ASSIST_WRITE 0x200
void hvm_mmio_assist_process(struct record_info *ri, struct hvm_data *h)
{
struct mmio_info *m = &h->inflight.mmio;
union {
struct {
unsigned int gpa;
unsigned int data;
} x32;
struct {
unsigned long long gpa;
unsigned int data;
} x64;
} *r = (typeof(r))h->d;
union {
unsigned event;
struct {
unsigned minor:8,
x64:1,
write:2;
};
} mevt = { .event = ri->event };
if(mevt.x64) {
m->gpa = r->x64.gpa;
m->data = r->x64.data;
if(ri->extra_words*(sizeof(unsigned int))==sizeof(r->x64))
m->data_valid=1;
} else {
m->gpa = r->x32.gpa;
m->data = r->x32.data;
if(ri->extra_words*(sizeof(unsigned int))==sizeof(r->x32))
m->data_valid=1;
}
m->is_write = mevt.write;
if(opt.dump_all)
{
if(m->data_valid)
printf("]%s mmio_assist %c gpa %llx data %x\n",
ri->dump_header,
mevt.write?'w':'r',
m->gpa, m->data);
else
printf("]%s mmio_assist %c gpa %llx (no data)\n", ri->dump_header,
mevt.write?'w':'r', m->gpa);
}
if((m->gpa & 0xfffff000) == 0xfee00000)
hvm_vlapic_handler(h);
/* Catch MMIOs that don't go through the shadow code; tolerate
* failures to set (probably shadow_mmio) */
hvm_set_postprocess(h, hvm_mmio_assist_postprocess);
}
void hvm_inj_virq_process(struct record_info *ri, struct hvm_data *h) {
struct {
int vector, fake;
} *r = (typeof(r))h->d;
if(opt.dump_all) {
printf(" %s inj_virq vec %u %s\n",
ri->dump_header,
r->vector, r->fake?"fake":"real");
}
if(opt.summary_info)
{
int vector = r->vector;
if(vector >= GUEST_INTERRUPT_MAX)
vector = GUEST_INTERRUPT_MAX;
h->summary.guest_interrupt[vector].count++;
if(opt.interval.output == INTERVAL_DOMAIN_GUEST_INTERRUPT)
interval_domain_guest_interrupt(h, vector);
}
/* If we're waking, make this the wake vector */
if(r->vector < GUEST_INTERRUPT_MAX ) {
int vector = r->vector;
if ( h->w2h.waking && h->w2h.vector == 0 ) {
if(h->summary.guest_interrupt[vector].start_tsc) {
fprintf(warn, "Strange, d%dv%d waking && wake_vector 0 but vec %d start_tsc %lld!\n",
h->v->d->did, h->v->vid,
vector,
h->summary.guest_interrupt[vector].start_tsc);
error(ERR_WARN, NULL);
}
if(h->w2h.interrupts)
fprintf(warn, "Strange, waking && wake_vector 0 but interrupts_this_wait_to_halt %d!\n",
h->w2h.interrupts);
if(opt.dump_all)
printf(" [w2h] d%dv%d Setting wake_vector %d\n",
h->v->d->did, h->v->vid, vector);
/* In svm mode, vector information is invalid */
if ( opt.svm_mode && r->fake )
h->w2h.vector = FAKE_VECTOR;
else
h->w2h.vector = vector;
h->summary.guest_interrupt[vector].is_wake = 1;
}
if( h->summary.guest_interrupt[vector].start_tsc == 0 ) {
/* Note that we want start_tsc set at the next vmentry */
h->summary.guest_interrupt[vector].start_tsc = 1;
h->w2h.interrupts_wanting_tsc++;
h->w2h.interrupts++;
if(opt.dump_all)
printf(" [w2h] d%dv%d Starting vec %d\n",
h->v->d->did, h->v->vid, vector);
}
}
hvm_vlapic_inject(h->v, r->vector);
}
/* I/O Handling */
struct io_address {
struct io_address *next;
unsigned int pa;
unsigned int va;
struct cycle_summary summary[2];
};
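/* Per-address read/write cycle summaries, kept as a singly-linked list
 * sorted by physical address. */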
void update_io_address(struct io_address ** list, unsigned int pa, int dir,
tsc_t arc_cycles, unsigned int va) {
struct io_address *p, *q=NULL;
/* Keep list in order */
for(p=*list; p && (p->pa != pa) && (p->pa < pa); q=p, p=p->next);
/* If we didn't find it, make a new element. */
if(!p || (p->pa != pa)) {
if((p=malloc(sizeof(*p)))==NULL) {
fprintf(stderr, "malloc() failed.\n");
error(ERR_SYSTEM, NULL);
}
bzero(p, sizeof(*p));
p->pa=pa;
p->va=va;
/* If we stopped in the middle or at the end, add it in */
if(q) {
p->next=q->next;
q->next=p;
} else {
/* Otherwise, we stopped after the first element; put it at the beginning */
p->next = *list;
*list = p;
}
}
update_cycles(&p->summary[dir], arc_cycles);
}
void hvm_io_address_summary(struct io_address *list, char * s) {
if(!list)
return;
printf("%s\n", s);
for(; list; list=list->next) {
if ( list->va )
{
PRINT_SUMMARY(list->summary[0], "%8x@%8x:[r] ", list->pa, list->va);
PRINT_SUMMARY(list->summary[1], "%8x@%8x:[w] ", list->pa, list->va);
}
else
{
PRINT_SUMMARY(list->summary[0], "%8x:[r] ", list->pa);
PRINT_SUMMARY(list->summary[1], "%8x:[w] ", list->pa);
}
}
}
void hvm_io_write_postprocess(struct hvm_data *h)
{
if(opt.with_pio_enumeration)
update_io_address(&h->summary.io.pio, h->inflight.io.port, 1, h->arc_cycles, 0);
}
void hvm_io_read_postprocess(struct hvm_data *h)
{
if(opt.with_pio_enumeration)
update_io_address(&h->summary.io.pio, h->inflight.io.port, 0, h->arc_cycles, 0);
if(opt.scatterplot_io && h->inflight.io.port == opt.scatterplot_io_port)
scatterplot_vs_time(h->exit_tsc, P.now - h->exit_tsc);
}
void hvm_io_assist_process(struct record_info *ri, struct hvm_data *h)
{
union {
struct {
unsigned int port;
unsigned int data;
} x32;
} *r = (typeof(r))h->d;
union {
unsigned event;
struct {
unsigned minor:8,
x64:1,
write:2;
};
} mevt = { .event = ri->event };
if(mevt.x64) {
fprintf(stderr, "FATAL: Unexpected 64-bit PIO\n");
error(ERR_RECORD, ri);
return;
}
h->inflight.io.port = r->x32.port;
h->inflight.io.val = r->x32.data;
if(mevt.write) {
h->inflight.io.is_write = 1;
if ( hvm_set_postprocess(h, hvm_io_write_postprocess) )
fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
} else {
h->inflight.io.is_write = 0;
if ( hvm_set_postprocess(h, hvm_io_read_postprocess) )
fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
}
if(opt.dump_all)
{
printf(" %s io %s port %x val %x\n",
ri->dump_header,
mevt.write?"write":"read",
r->x32.port,
r->x32.data);
}
}
/* cr_write */
/* CR3 list */
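/* Close the books on the old toplevel pagetable (accumulating the time
 * it was loaded) and start accounting the new one, allocating a
 * tracking structure the first time each gmfn is seen. */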
void cr3_switch(unsigned long long val, struct hvm_data *h) {
struct vcpu_data *v = h->v;
/* Really only need absolute tsc here. Later change to global time. */
unsigned long long now = P.now;
unsigned long long gmfn = val >> 12;
if ( !h->init )
return;
if(opt.with_cr3_enumeration) {
if(v->cr3.data) {
struct cr3_value_struct *cur = v->cr3.data;
unsigned long long cycles = now - v->cr3.start_time;
if(opt.summary_info)
update_cycles(&cur->total_time, cycles);
cur->last_time = now;
}
if(gmfn) {
struct cr3_value_struct *p, **last=&v->d->cr3_value_head;
/* Always add to tail, so that we get consistent interval
output as the number of cr3s grows */
for(p=*last; p; last = (&p->next), p=p->next)
if(p->gmfn == gmfn)
break;
if(!p) {
if((p=malloc(sizeof(*p)))==NULL) {
fprintf(stderr, "malloc() failed.\n");
error(ERR_SYSTEM, NULL);
}
bzero(p, sizeof(*p));
p->gmfn = gmfn;
p->cr3_id = P.cr3.id;
p->first_time = now;
p->next=*last;
*last=p;
p->gnext = NULL;
if(P.cr3.head)
*P.cr3.tail = p;
else
P.cr3.head = p;
P.cr3.tail = &p->gnext;
P.cr3.id++;
/* Add to the interval list if appropriate */
if(opt.interval.check == INTERVAL_CHECK_CR3
&& v->d->did != DEFAULT_DOMAIN)
interval_cr3_value_check(p);
}
if(p->prealloc_unpin.now) {
fprintf(warn, "Re-promoting previously unpinned cr3 %llx!\n",
p->gmfn);
p->prealloc_unpin.now = 0;
h->inflight.cr_write.repromote = 1;
}
/* Accounting for new toplevel */
v->cr3.start_time = now;
p->switch_count++;
if(p->destroy.callback)
p->destroy.switch_count++;
v->cr3.data = p;
} else {
v->cr3.data = NULL;
}
if (opt.scatterplot_cr3_switch) {
scatterplot_vs_time(h->exit_tsc,
v->cr3.data ? (v->cr3.data->cr3_id) : 0);
}
} else {
if (opt.scatterplot_cr3_switch)
scatterplot_vs_time(h->exit_tsc, gmfn);
}
v->cr3.val = val;
}
void cr3_prealloc_unpin(struct vcpu_data *v, unsigned long long gmfn) {
struct cr3_value_struct *cr3;
/* Look for it in the list */
for(cr3 = v->d->cr3_value_head; cr3; cr3=cr3->next)
if(cr3->gmfn == gmfn)
break;
if(!cr3)
return;
if(cr3->prealloc_unpin.now)
fprintf(warn, "Strange, gmfn %llx multiple unpins w/o access!\n",
gmfn);
cr3->prealloc_unpin.now = 1;
cr3->prealloc_unpin.count++;
if(opt.dump_all)
printf(" cr3 %llx unpinned %d times\n",
gmfn, cr3->prealloc_unpin.count);
}
int cr3_compare_start(const void *_a, const void *_b) {
struct cr3_value_struct *a=*(typeof(&a))_a;
struct cr3_value_struct *b=*(typeof(&a))_b;
if(a->first_time > b->first_time)
return 1;
else if(b->first_time == a->first_time)
return 0;
else
return -1;
}
void cr3_dump_list(struct cr3_value_struct *head){
struct cr3_value_struct *p;
struct cr3_value_struct **qsort_array;
int i, N=0;
if(!head)
return;
/* Count the number of elements */
for(p=head; p; p=p->next)
N++;
/* Alloc a struct of the right size */
qsort_array = malloc(N * sizeof(struct cr3_value_struct *));
/* Point the array into it */
for(i=0, p=head; p; p=p->next, i++)
qsort_array[i]=p;
/* Sort the array by time */
qsort(qsort_array, N, sizeof(struct cr3_value_struct *),
cr3_compare_start);
/* WARNING: don't use N after this point unless you copy this variable */
#if 0
if(opt.summary_eip_limit && opt.summary_eip_limit < N)
N=opt.summary_eip_limit;
#endif
/* Now print the results */
printf(" cr3 values:\n");
for(i=0; i<N; i++) {
struct time_struct first, last;
char desc[32];
p = qsort_array[i];
abs_cycles_to_time(p->first_time, &first);
abs_cycles_to_time(p->last_time, &last);
snprintf(desc, 30, " %8llx (id %d)", p->gmfn, p->cr3_id);
print_cycle_summary(&p->total_time, desc);
snprintf(desc, 30, " guest");
print_cycle_percent_summary(&p->guest_time, p->run_time, desc);
snprintf(desc, 30, " hv ");
print_cycle_percent_summary(&p->hv_time, p->run_time, desc);
hvm_short_summary(&p->hvm, p->run_time, " + ");
printf(" Seen: %4u.%09u-%4u.%09u switch %d flush %d\n",
first.s, first.ns,
last.s, last.ns,
p->switch_count, p->flush_count);
if(p->destroy.callback)
printf(" destroy: flush %u switch %u fixup %u emulate %u\n",
p->destroy.flush_count,
p->destroy.switch_count,
p->destroy.fixup_user,
p->destroy.emulate_corr_user);
}
free(qsort_array);
}
void hvm_cr3_write_summary(struct hvm_data *h) {
int j;
for(j=0; j<RESYNCS_MAX; j++)
PRINT_SUMMARY(h->summary.cr3_write_resyncs[j],
" *[%3d] ", j);
PRINT_SUMMARY(h->summary.cr3_write_resyncs[j],
" *[MAX] ");
}
void hvm_cr_write_summary(struct hvm_data *h, void *data)
{
long cr=(long)data;
PRINT_SUMMARY(h->summary.cr_write[cr],
" cr%ld ", cr);
if ( cr==3 )
hvm_cr3_write_summary(h);
}
void hvm_cr_write_postprocess(struct hvm_data *h)
{
if(h->inflight.cr_write.cr == 3) {
struct vcpu_data *v = h->v;
unsigned long long new_val = h->inflight.cr_write.val;
unsigned long long oval;
int flush=0;
if(v->cr3.val) {
oval = v->cr3.val;
if(new_val == oval) {
if(v->cr3.data) {
v->cr3.data->flush_count++;
if(v->cr3.data->destroy.callback)
v->cr3.data->destroy.flush_count++;
}
flush=1;
}
}
if(opt.summary_info) {
int resyncs = h->resyncs;
if(resyncs > RESYNCS_MAX)
resyncs = RESYNCS_MAX;
update_cycles(&h->summary.cr3_write_resyncs[resyncs],
h->arc_cycles);
update_cycles(&h->summary.cr_write[3],
h->arc_cycles);
hvm_update_short_summary(h, HVM_SHORT_SUMMARY_CR3);
}
if(!flush)
cr3_switch(new_val, h);
} else {
if(opt.summary_info)
{
if(h->inflight.cr_write.cr < CR_MAX)
update_cycles(&h->summary.cr_write[h->inflight.cr_write.cr],
h->arc_cycles);
}
}
/* Set summary handler */
/* FIXME - deal with cr_read_summary */
if(h->exit_reason < h->exit_reason_max)
{
/* Want a different "set" for each cr */
switch(h->inflight.cr_write.cr)
{
#define case_cr(_x) \
case (_x): \
hvm_set_summary_handler(h, hvm_cr_write_summary, (void *)(_x)); \
break
case_cr(0);
case_cr(1);
case_cr(2);
case_cr(3);
case_cr(4);
case_cr(5);
case_cr(6);
case_cr(7);
case_cr(8);
case_cr(9);
case_cr(10);
case_cr(11);
case_cr(12);
case_cr(13);
case_cr(14);
case_cr(15);
#undef case_cr
default:
fprintf(stderr, "Unexpected cr: %d\n", h->inflight.cr_write.cr);
error(ERR_SANITY, NULL);
break;
}
}
}
void hvm_cr_write_process(struct record_info *ri, struct hvm_data *h)
{
union {
struct {
unsigned cr;
unsigned int val;
} x32;
struct {
unsigned cr;
unsigned long long val;
} __attribute__((packed)) x64;
} *r = (typeof(r))h->d;
unsigned cr;
unsigned long long val;
if(ri->event & TRC_64_FLAG) {
h->inflight.cr_write.cr = cr = r->x64.cr;
h->inflight.cr_write.val = val = r->x64.val;
} else {
h->inflight.cr_write.cr = cr = r->x32.cr;
h->inflight.cr_write.val = val = r->x32.val;
}
/* In vmx, in real mode, cr accesses may cause EXNMI vmexits.
* Account them under that heading; otherwise, complain */
if ( hvm_set_postprocess(h, hvm_cr_write_postprocess) )
fprintf(warn, "%s: Strange, h->postprocess already set!\n",
__func__);
if(opt.dump_all)
{
if(cr == 3 && h->v->cr3.val) {
printf("]%s cr_write cr3 val %llx oval %llx %s\n",
ri->dump_header,
val,
h->v->cr3.val,
(h->v->cr3.val == val)?"flush":"switch");
} else {
printf(" %s cr_write cr%d val %llx\n",
ri->dump_header,
cr, val);
}
}
}
/* msr_write */
void hvm_msr_write_summary(struct hvm_data *h, void *d)
{
}
void hvm_msr_write_postprocess(struct hvm_data *h)
{
if(opt.summary_info) {
}
/* Set summary handler */
hvm_set_summary_handler(h, hvm_msr_write_summary, NULL);
}
void hvm_msr_write_process(struct record_info *ri, struct hvm_data *h)
{
struct {
unsigned int addr;
unsigned long long val;
} __attribute__((packed)) *r = (typeof(r))h->d;
if(check_extra_words(ri, sizeof(*r), "msr_write"))
return;
h->inflight.msr.addr = r->addr;
h->inflight.msr.val = r->val;
if(opt.dump_all)
{
printf(" %s msr_write addr %x val %llx\n",
ri->dump_header,
r->addr, r->val);
}
if ( hvm_set_postprocess(h, hvm_msr_write_postprocess) )
fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
}
/* msr_read */
void hvm_msr_read_summary(struct hvm_data *h, void *d)
{
}
void hvm_msr_read_postprocess(struct hvm_data *h)
{
if(opt.summary_info) {
}
/* Set summary handler */
hvm_set_summary_handler(h, hvm_msr_read_summary, NULL);
}
void hvm_msr_read_process(struct record_info *ri, struct hvm_data *h)
{
struct {
unsigned int addr;
unsigned long long val;
} __attribute__((packed)) *r = (typeof(r))h->d;
if(check_extra_words(ri, sizeof(*r), "msr_read"))
return;
h->inflight.msr.addr = r->addr;
h->inflight.msr.val = r->val;
if(opt.dump_all)
{
printf(" %s msr_read addr %x val %llx\n",
ri->dump_header,
r->addr, r->val);
}
if ( hvm_set_postprocess(h, hvm_msr_read_postprocess) )
fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
}
void hvm_vmcall_summary(struct hvm_data *h, void *d)
{
int i;
for ( i=0; i<HYPERCALL_MAX; i++ )
{
PRINT_SUMMARY(h->summary.vmcall[i],
" [%10s] ", hypercall_name[i]);
}
PRINT_SUMMARY(h->summary.vmcall[HYPERCALL_MAX],
" [%10s] ", "max");
}
void hvm_vmcall_postprocess(struct hvm_data *h)
{
unsigned eax = h->inflight.vmcall.eax ;
if(opt.summary)
{
if ( eax < HYPERCALL_MAX )
update_cycles(&h->summary.vmcall[eax],
h->arc_cycles);
else
update_cycles(&h->summary.vmcall[HYPERCALL_MAX],
h->arc_cycles);
hvm_set_summary_handler(h, hvm_vmcall_summary, NULL);
}
}
void hvm_vmcall_process(struct record_info *ri, struct hvm_data *h)
{
struct {
unsigned int eax;
} *r = (typeof(r))h->d;
if(opt.dump_all) {
if(r->eax < HYPERCALL_MAX)
printf(" %s vmcall %2x (%s)\n",
ri->dump_header,
r->eax,
hypercall_name[r->eax]);
else
printf(" %s vmcall %2x\n",
ri->dump_header,
r->eax);
}
h->inflight.vmcall.eax = r->eax;
if ( hvm_set_postprocess(h, hvm_vmcall_postprocess) )
fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
}
void hvm_inj_exc_process(struct record_info *ri, struct hvm_data *h)
{
struct {
unsigned vec, ec;
} *r = (typeof(r))h->d;
if ( opt.dump_all )
{
if(r->vec < HVM_TRAP_MAX)
printf(" %3u.%09u %s inj_exc trap %s ec %x\n",
ri->t.s, ri->t.ns, pcpu_string(ri->cpu),
hvm_trap_name[r->vec], r->ec);
else
printf(" %3u.%09u %s inj_exc trap %u ec %x\n",
ri->t.s, ri->t.ns, pcpu_string(ri->cpu),
r->vec, r->ec);
}
}
void hvm_intr_summary(struct hvm_data *h, void *d)
{
int i;
for(i=0; i<EXTERNAL_INTERRUPT_MAX; i++)
if(h->summary.extint[i])
{
if(hvm_extint_vector_name[i])
printf(" %10s(%3d): %d\n",
hvm_extint_vector_name[i],
i,
h->summary.extint[i]);
else
printf(" [%3d]: %d\n",
i,
h->summary.extint[i]);
}
if(h->summary.extint[EXTERNAL_INTERRUPT_MAX])
printf(" Other: : %d\n",
h->summary.extint[EXTERNAL_INTERRUPT_MAX]);
}
void hvm_intr_process(struct record_info *ri, struct hvm_data *h)
{
unsigned vec = *(unsigned *)h->d;
/* Vector is difficult to get in SVM mode */
if ( opt.svm_mode )
vec = 0;
if( (h->rip >> ADDR_SPACE_BITS) != 00
&& (h->rip >> ADDR_SPACE_BITS) != ((0ULL-1)>> ADDR_SPACE_BITS) ) {
fprintf(stderr, "Unexpected rip %llx (shift %llx)\n",
h->rip,
h->rip >> ADDR_SPACE_BITS);
error(ERR_RECORD, NULL);
/* Can process with strange rip */
}
h->inflight.intr.vec = vec;
if ( opt.dump_all )
{
if ( vec < EXTERNAL_INTERRUPT_MAX &&
hvm_extint_vector_name[vec] )
printf(" %s intr vec %s(%x)\n",
ri->dump_header,
hvm_extint_vector_name[vec],
vec);
else
printf(" %s intr vec %x\n",
ri->dump_header, vec);
}
if(opt.scatterplot_interrupt_eip
&& vec == opt.scatterplot_interrupt_vector)
{
struct time_struct t;
/* Truncate to 40 bits */
unsigned long long rip = h->rip & ((1ULL << ADDR_SPACE_BITS)-1);
/* Want absolute tsc to global tsc */
abs_cycles_to_time(h->exit_tsc, &t);
printf("d%dv%d %u.%09u %lld\n",
h->v->d->did, h->v->vid,
t.s, t.ns,
rip);
}
if(opt.histogram_interrupt_eip
&& vec == opt.histogram_interrupt_vector)
{
/* Truncate to 40 bits */
unsigned long long rip = h->rip & ((1ULL << ADDR_SPACE_BITS)-1);
unsigned index = rip / opt.histogram_interrupt_increment;
h->summary.extint_histogram[index]++;
}
if(opt.with_interrupt_eip_enumeration
&& vec == opt.interrupt_eip_enumeration_vector)
{
/* Truncate to 40 bits */
unsigned long long rip = h->rip & ((1ULL << ADDR_SPACE_BITS)-1);
/* Want absolute tsc to global tsc */
update_eip(&h->v->d->interrupt_eip_list, rip, 0, 0, NULL);
}
/* Disable generic postprocessing */
/* FIXME: Do the summary stuff in a post-processor */
h->post_process = NULL;
if(opt.summary_info) {
if(opt.summary)
hvm_set_summary_handler(h, hvm_intr_summary, NULL);
if(vec < EXTERNAL_INTERRUPT_MAX)
h->summary.extint[vec]++;
else
h->summary.extint[EXTERNAL_INTERRUPT_MAX]++;
}
}
void hvm_intr_window_process(struct record_info *ri, struct hvm_data *h)
{
struct {
uint32_t vector;
uint32_t source;
int32_t intr;
} *r = (typeof(r))h->d;
char *intsrc_name[] = {
"none",
"pic",
"lapic",
"nmi",
"mce",
"vector"
};
if ( opt.dump_all )
{
printf(" %s intr_window vec %u src %u(%s) ",
ri->dump_header,
(unsigned)r->vector,
(unsigned)r->source,
r->source < 6 ? intsrc_name[r->source]: "?");
if ( r->intr > 0 )
printf("intr %x\n",
(unsigned)r->intr);
else
printf("intr #\n");
}
}
void hvm_pf_inject_process(struct record_info *ri, struct hvm_data *h)
{
union {
struct {
unsigned ec;
unsigned int cr2;
} x32;
struct {
unsigned ec;
unsigned long long cr2;
} __attribute__((packed)) x64;
} *r = (typeof(r))h->d;
unsigned int ec;
unsigned long long cr2;
int is_64 = 0;
if(ri->event & TRC_64_FLAG) {
is_64 = 1;
cr2 = r->x64.cr2;
ec = r->x64.ec;
} else {
cr2 = r->x32.cr2;
ec = r->x32.ec;
}
if ( opt.dump_all )
{
printf(" %3u.%09u %s pf_inject%s guest_cr2 %llx guest_ec %x\n",
ri->t.s, ri->t.ns, pcpu_string(ri->cpu),
is_64?"64":"",
cr2, ec);
}
}
void hvm_generic_postprocess_init(struct record_info *ri, struct hvm_data *h);
void hvm_npf_process(struct record_info *ri, struct hvm_data *h)
{
struct {
uint64_t gpa;
uint64_t mfn;
uint32_t qualification;
uint32_t p2mt;
} *r = (typeof(r))h->d;
if ( opt.dump_all )
printf(" %s npf gpa %llx q %x mfn %llx t %d\n",
ri->dump_header,
(unsigned long long)r->gpa, r->qualification,
(unsigned long long)r->mfn, r->p2mt);
if ( opt.summary_info )
hvm_generic_postprocess_init(ri, h);
}
void hvm_rdtsc_process(struct record_info *ri, struct hvm_data *h)
{
struct {
unsigned long long tsc;
} *r = (typeof(r))h->d;
if ( opt.dump_all )
printf(" %s rdtsc %llx %lld %s\n",
ri->dump_header,
(unsigned long long)r->tsc,
(unsigned long long)r->tsc,
h->last_rdtsc > r->tsc ? "BACKWARDS" : "");
if ( opt.scatterplot_rdtsc )
{
struct time_struct t;
abs_cycles_to_time(ri->tsc, &t);
printf("%dv%d %u.%09u %llu\n",
h->v->d->did, h->v->vid,
t.s, t.ns,
r->tsc);
}
h->last_rdtsc = r->tsc;
}
void hvm_generic_summary(struct hvm_data *h, void *data)
{
long evt = (long)data;
assert(evt < HVM_EVENT_HANDLER_MAX);
PRINT_SUMMARY(h->summary.generic[evt],
" %s ", hvm_event_handler_name[evt]);
}
void hvm_generic_postprocess_init(struct record_info *ri, struct hvm_data *h)
{
if ( h->post_process != hvm_generic_postprocess )
fprintf(warn, "%s: Strange, h->postprocess set!\n",
__func__);
h->inflight.generic.event = ri->event;
bcopy(h->d, h->inflight.generic.d, sizeof(unsigned int) * 4);
}
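/* Generic accounting for handler records with no custom postprocessor.
 * registered[] remembers which exit reason first produced each event
 * (offset by 1 so that 0 means "unregistered"), letting us warn if the
 * same event later shows up under a different exit reason. */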
void hvm_generic_postprocess(struct hvm_data *h)
{
long evt = 0;
static unsigned registered[HVM_EVENT_HANDLER_MAX] = { 0 };
if ( h->inflight.generic.event )
evt = (h->inflight.generic.event - TRC_HVM_HANDLER)
& ~(TRC_64_FLAG|HVM_IO_ASSIST_WRITE);
else {
static unsigned warned[HVM_EXIT_REASON_MAX] = { 0 };
/* Some exits we don't expect a handler; just return */
if(opt.svm_mode)
{
}
else
{
switch(h->exit_reason)
{
/* These just need us to go through the return path */
case EXIT_REASON_PENDING_INTERRUPT:
case EXIT_REASON_TPR_BELOW_THRESHOLD:
/* Not much to log now; may need later */
case EXIT_REASON_WBINVD:
return;
default:
break;
}
}
if ( !warned[h->exit_reason] )
{
/* If we aren't a known exception, warn and log results */
fprintf(warn, "%s: Strange, exit %x(%s) missing a handler\n",
__func__, h->exit_reason,
(h->exit_reason > h->exit_reason_max)
? "[clipped]"
: h->exit_reason_name[h->exit_reason]);
warned[h->exit_reason]=1;
}
}
if ( evt >= HVM_EVENT_HANDLER_MAX || evt < 0)
{
fprintf(warn, "%s: invalid hvm event %lx(%x)\n",
__func__, evt, h->inflight.generic.event);
error(ERR_RECORD, NULL);
return;
}
if(opt.summary_info) {
update_cycles(&h->summary.generic[evt],
h->arc_cycles);
/* NB that h->exit_reason may be 0, so we offset by 1 */
if ( registered[evt] )
{
static unsigned warned[HVM_EXIT_REASON_MAX] = { 0 };
if ( registered[evt] != h->exit_reason+1 && !warned[h->exit_reason])
{
fprintf(warn, "%s: HVM evt %lx in %x and %x!\n",
__func__, evt, registered[evt]-1, h->exit_reason);
warned[h->exit_reason]=1;
}
}
else
{
int ret;
if((ret=__hvm_set_summary_handler(h, hvm_generic_summary, (void *)evt)))
fprintf(stderr, "%s: hvm_set_summary_handler returned %d\n",
__func__, ret);
registered[evt]=h->exit_reason+1;
}
/* HLT checked at hvm_vmexit_close() */
}
}
void hvm_generic_dump(struct record_info *ri, char * prefix)
{
struct {
unsigned vcpu:16, domain:16;
unsigned d[4];
} *cr = (typeof(cr))ri->d;
char *evt_string, evt_number[256];
int i, evt, is_64 = 0;
evt = ri->event - TRC_HVM_HANDLER;
if(evt & TRC_64_FLAG) {
evt &= ~(TRC_64_FLAG);
is_64=1;
}
if(evt < HVM_EVENT_HANDLER_MAX)
{
evt_string = hvm_event_handler_name[evt];
}
else
{
snprintf(evt_number, 256, "hvm_handler %d", evt);
evt_string = evt_number;
}
printf("%s%s %s%s [",
prefix,
ri->dump_header,
evt_string,
is_64?"64":"");
for(i=0; i<ri->extra_words; i++) {
printf(" %x", ri->d[i]);
}
printf(" ]\n");
}
void hvm_handler_process(struct record_info *ri, struct hvm_data *h) {
/* Wait for first vmexit to initialize */
if(!h->init)
{
if(opt.dump_all)
hvm_generic_dump(ri,"!");
return;
}
h->d = ri->d;
/* Handle things that don't need a vmexit */
switch(ri->event) {
default:
goto needs_vmexit;
/* Records about changing guest state */
case TRC_HVM_PF_INJECT:
case TRC_HVM_PF_INJECT64:
hvm_pf_inject_process(ri, h);
break;
case TRC_HVM_REINJ_VIRQ:
if ( opt.dump_all )
{
printf(" %3u.%09u %s inj_virq vec %u\n",
ri->t.s, ri->t.ns, pcpu_string(ri->cpu),
*(unsigned*)h->d);
}
break;
case TRC_HVM_INJ_EXC:
hvm_inj_exc_process(ri, h);
break;
case TRC_HVM_INJ_VIRQ:
hvm_inj_virq_process(ri, h);
break;
case TRC_HVM_INTR_WINDOW:
hvm_intr_window_process(ri, h);
break;
case TRC_HVM_OP_DESTROY_PROC:
if(h->v->cr3.data) {
struct cr3_value_struct *cur = h->v->cr3.data;
if(cur->destroy.callback)
fprintf(warn, "Strange, double callback for cr3 gmfn %llx!\n",
cur->gmfn);
cur->destroy.callback = 1;
} else if(opt.with_cr3_enumeration) {
fprintf(warn, "Warning: destroy_proc: don't know current cr3\n");
}
if ( opt.dump_all )
{
printf(" %3u.%09u %s destroy_proc cur_cr3 %llx\n",
ri->t.s, ri->t.ns, pcpu_string(ri->cpu), h->v->cr3.val);
}
break;
}
return;
needs_vmexit:
/* Wait for the next vmexit */
if(!h->vmexit_valid)
{
if(opt.dump_all)
hvm_generic_dump(ri,"!");
return;
}
/* Keep generic "event handler" info */
h->event_handler = ri->event - TRC_HVM_HANDLER;
switch(ri->event) {
/* Records adding to the vmexit reason */
case TRC_HVM_INTR:
hvm_intr_process(ri, h);
break;
case TRC_HVM_PF_XEN:
case TRC_HVM_PF_XEN64:
hvm_pf_xen_process(ri, h);
break;
case TRC_HVM_IOPORT_READ:
case TRC_HVM_IOPORT_WRITE:
hvm_io_assist_process(ri, h);
break;
case TRC_HVM_IOMEM_READ:
case TRC_HVM_IOMEM_WRITE:
case TRC_HVM_IOMEM_READ|TRC_64_FLAG:
case TRC_HVM_IOMEM_WRITE|TRC_64_FLAG:
hvm_mmio_assist_process(ri, h);
break;
case TRC_HVM_CR_WRITE:
case TRC_HVM_CR_WRITE64:
hvm_cr_write_process(ri, h);
break;
case TRC_HVM_MSR_WRITE:
hvm_msr_write_process(ri, h);
break;
case TRC_HVM_MSR_READ:
hvm_msr_read_process(ri, h);
break;
case TRC_HVM_VMMCALL:
hvm_vmcall_process(ri, h);
break;
case TRC_HVM_NPF:
hvm_npf_process(ri, h);
break;
case TRC_HVM_RDTSC:
hvm_rdtsc_process(ri, h);
break;
case TRC_HVM_DR_READ:
case TRC_HVM_DR_WRITE:
case TRC_HVM_CPUID:
case TRC_HVM_SMI:
case TRC_HVM_HLT:
case TRC_HVM_INVLPG:
case TRC_HVM_INVLPG64:
case TRC_HVM_MCE:
case TRC_HVM_CLTS:
case TRC_HVM_LMSW:
case TRC_HVM_LMSW64:
case TRC_HVM_NMI:
case TRC_HVM_REALMODE_EMULATE:
case TRC_HVM_TRAP:
case TRC_HVM_TRAP_DEBUG:
case TRC_HVM_CR_READ:
case TRC_HVM_CR_READ64:
default:
if(opt.dump_all)
hvm_generic_dump(ri, "]");
if(opt.summary_info)
hvm_generic_postprocess_init(ri, h);
break;
}
}
void vcpu_next_update(struct pcpu_info *p, struct vcpu_data *next, tsc_t tsc);
void vcpu_prev_update(struct pcpu_info *p, struct vcpu_data *prev,
tsc_t tsc, int new_runstate);
struct vcpu_data * vcpu_find(int did, int vid);
void lose_vcpu(struct vcpu_data *v, tsc_t tsc);
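/* Classify the domain's overall state from its vcpus' runstates:
 * running+runnable is a concurrency hazard, runnable-only is
 * contention, blocked/offline-only is blocked, and so on. */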
int domain_runstate(struct domain_data *d) {
int i;
int runstates[RUNSTATE_MAX];
int ret=-1;
int max_vcpus = 0;
if(d->did == DEFAULT_DOMAIN)
return 0;
for(i=0; i<RUNSTATE_MAX; i++)
runstates[i]=0;
for(i=0; i<=d->max_vid; i++)
if(d->vcpu[i] && d->vcpu[i]->runstate.state != RUNSTATE_INIT) {
max_vcpus++;
runstates[d->vcpu[i]->runstate.state]++;
}
if(runstates[RUNSTATE_LOST] == max_vcpus)
ret=DOMAIN_RUNSTATE_LOST;
else if(runstates[RUNSTATE_RUNNING])
{
if(runstates[RUNSTATE_RUNNABLE])
ret=DOMAIN_RUNSTATE_CONCURRENCY_HAZARD;
else if(runstates[RUNSTATE_BLOCKED]||runstates[RUNSTATE_OFFLINE])
ret= DOMAIN_RUNSTATE_PARTIAL_RUN;
else
ret= DOMAIN_RUNSTATE_FULL_RUN;
}
else if(runstates[RUNSTATE_RUNNABLE])
{
if(runstates[RUNSTATE_BLOCKED]||runstates[RUNSTATE_OFFLINE])
ret= DOMAIN_RUNSTATE_PARTIAL_CONTENTION;
else
ret= DOMAIN_RUNSTATE_FULL_CONTENTION;
}
else if(runstates[RUNSTATE_BLOCKED]||runstates[RUNSTATE_OFFLINE])
{
ret= DOMAIN_RUNSTATE_BLOCKED;
} else {
fprintf(warn, "Strange, no meaningful runstates for d%d!\n",
d->did);
}
if ( ret < 0 )
{
printf(" Max vid: %d (max_vcpus %d)\n", d->max_vid, max_vcpus);
for(i=0; i<=d->max_vid; i++)
if(d->vcpu[i])
fprintf(warn, " v%d: %s\n",
i, runstate_name[d->vcpu[i]->runstate.state]);
for(i=0; i<RUNSTATE_MAX; i++)
fprintf(warn, " %s: %d\n",
runstate_name[i], runstates[i]);
}
if(ret >= 0)
return ret;
error(ERR_ASSERT, NULL);
return -1; /* Never happens */
}
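/* Close out the time v spent in its current runstate, credit it to the
 * per-vcpu (and, for dom0, per-pcpu volume) counters, then move v to
 * new_runstate and recompute the domain-level runstate. */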
static inline void runstate_update(struct vcpu_data *v, int new_runstate,
tsc_t tsc)
{
struct domain_data *d = v->d;
if ( opt.scatterplot_runstate )
{
struct time_struct t;
abs_cycles_to_time(tsc, &t);
printf("%dv%d %u.%09u %d\n",
d->did, v->vid,
t.s, t.ns,
runstate_graph[v->runstate.state]);
printf("%dv%d %u.%09u %d\n",
d->did, v->vid,
t.s, t.ns,
runstate_graph[new_runstate]);
}
if(v->runstate.tsc > 0 && v->runstate.tsc < tsc) {
update_cycles(v->runstates + v->runstate.state, tsc - v->runstate.tsc);
if ( opt.scatterplot_runstate_time )
{
struct time_struct t, dt;
abs_cycles_to_time(tsc, &t);
cycles_to_time(tsc - v->runstate.tsc, &dt);
printf("%dv%d %u.%09u %u.%09u\n",
d->did, v->vid,
t.s, t.ns,
dt.s, dt.ns);
}
if(v->runstate.state == RUNSTATE_RUNNING)
update_cycles(&v->d->total_time, tsc - v->runstate.tsc);
if(v->runstate.state == RUNSTATE_RUNNABLE)
update_cycles(v->runnable_states + v->runstate.runnable_state, tsc - v->runstate.tsc);
/* How much did dom0 run this buffer? */
if(v->d->did == 0) {
int i;
for(i=0; i<MAX_CPUS; i++) {
struct pcpu_info *p = P.pcpu + i;
tsc_t start_tsc;
if(!p->active)
continue;
start_tsc = (p->volume.buffer_first_tsc > v->runstate.tsc) ?
p->volume.buffer_first_tsc :
v->runstate.tsc;
p->volume.buffer_dom0_runstate_cycles[v->runstate.state]
+= tsc - start_tsc;
#if 0
printf(" - updated p%d dom0_runstate %s to %lld cycles (+%lld)\n",
p->pid, runstate_name[v->runstate.state],
p->volume.buffer_dom0_runstate_cycles[v->runstate.state],
tsc - start_tsc);
#endif
p->volume.buffer_dom0_runstate = new_runstate;
p->volume.buffer_dom0_runstate_tsc = tsc;
}
}
}
/* Detect "runnable" states */
if ( new_runstate == RUNSTATE_RUNNABLE )
{
switch(v->runstate.state)
{
case RUNSTATE_RUNNING:
v->runstate.runnable_state=RUNNABLE_STATE_PREEMPT;
break;
case RUNSTATE_BLOCKED:
case RUNSTATE_OFFLINE:
v->runstate.runnable_state=RUNNABLE_STATE_WAKE;
break;
default:
v->runstate.runnable_state=RUNNABLE_STATE_OTHER;
break;
}
} else
v->runstate.runnable_state=RUNNABLE_STATE_INVALID;
v->runstate.state = new_runstate;
v->runstate.tsc = tsc;
/* Determine the domain runstate */
if(d->runstate_tsc > 0 && d->runstate_tsc < tsc)
update_cycles(d->runstates + d->runstate, tsc - d->runstate_tsc);
d->runstate = domain_runstate(d);
d->runstate_tsc = tsc;
}
void hvm_vmexit_process(struct record_info *ri, struct hvm_data *h,
struct vcpu_data *v) {
struct {
union {
struct {
unsigned int exit_reason;
unsigned long long rip;
} __attribute__((packed)) x64;
struct {
unsigned int exit_reason;
unsigned int eip;
} x32;
};
} *r;
if ( ri->event & TRC_64_FLAG )
{
if (check_extra_words(ri, sizeof(r->x64), "vmexit"))
return;
}
else
{
if (check_extra_words(ri, sizeof(r->x32), "vmexit"))
return;
}
r = (typeof(r))ri->d;
if(!h->init)
init_hvm_data(h, v);
h->vmexit_valid=1;
bzero(&h->inflight, sizeof(h->inflight));
if(ri->event == TRC_HVM_VMEXIT64) {
if(v->guest_paging_levels != 4)
{
if ( verbosity >= 6 )
fprintf(warn, "%s: VMEXIT64, but guest_paging_levels %d. Switching to 4.\n",
__func__, v->guest_paging_levels);
v->guest_paging_levels = 4;
}
if(!is_valid_addr64(r->x64.rip))
fprintf(warn, "%s: invalid va %llx\n",
__func__, r->x64.rip);
h->rip = r->x64.rip;
h->exit_reason = r->x64.exit_reason;
} else {
if(v->guest_paging_levels == 4)
{
int new_paging_levels = opt.default_guest_paging_levels;
if(new_paging_levels == 4)
new_paging_levels = 2; /* Wild guess */
if ( verbosity >= 6 )
fprintf(warn, "%s: VMEXIT, but guest_paging_levels %d. Switching to %d(default).\n",
__func__, v->guest_paging_levels, new_paging_levels);
v->guest_paging_levels = new_paging_levels;
}
h->rip = r->x32.eip;
h->exit_reason = r->x32.exit_reason;
}
if(opt.scatterplot_vmexit_eip)
scatterplot_vs_time(ri->tsc, h->rip);
if(h->exit_reason > h->exit_reason_max)
{
fprintf(warn, "h->exit_reason %x > exit_reason_max %x!\n",
(unsigned int)h->exit_reason,
(unsigned int)h->exit_reason_max);
error(ERR_RECORD, ri);
return;
}
if(opt.dump_all) {
if ( h->exit_reason < h->exit_reason_max
&& h->exit_reason_name[h->exit_reason] != NULL)
printf("]%s vmexit exit_reason %s eip %llx%s\n",
ri->dump_header,
h->exit_reason_name[h->exit_reason],
h->rip,
find_symbol(h->rip));
else
printf("]%s vmexit exit_reason %x eip %llx%s\n",
ri->dump_header,
h->exit_reason,
h->rip,
find_symbol(h->rip));
}
if(h->v->cr3.data && h->entry_tsc) {
update_cycles(&h->v->cr3.data->guest_time,
ri->tsc - h->entry_tsc);
h->v->cr3.data->run_time += (ri->tsc - h->entry_tsc);
}
h->exit_tsc = ri->tsc;
h->entry_tsc = 0;
h->resyncs = 0;
h->prealloc_unpin = 0;
h->wrmap_bf = 0;
h->short_summary_done = 0;
h->post_process = hvm_generic_postprocess;
h->inflight.generic.event = 0;
}
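/* Finish the currently-open exit arc: compute exit-to-entry cycles,
 * run the per-exit postprocessor, and attribute the time to summary
 * and cr3 accounting. Called from the vmentry path. */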
void hvm_close_vmexit(struct hvm_data *h, tsc_t tsc) {
if(h->exit_tsc) {
if(h->exit_tsc > tsc)
h->arc_cycles = 0;
else {
h->arc_cycles = tsc - h->exit_tsc;
if(opt.summary_info) {
update_cycles(&h->summary.exit_reason[h->exit_reason],
h->arc_cycles);
h->summary_info = 1;
}
if ( opt.scatterplot_extint_cycles
&& h->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT
&& h->inflight.intr.vec == opt.scatterplot_extint_cycles_vector )
{
struct time_struct t;
abs_cycles_to_time(tsc, &t);
printf("d%dv%d %u.%09u %lld\n",
h->v->d->did,
h->v->vid,
t.s, t.ns,
h->arc_cycles);
}
}
}
if(h->post_process)
(h->post_process)(h);
if(h->arc_cycles) {
if(opt.summary_info && !h->short_summary_done) {
switch(h->event_handler) {
case HVM_EVENT_HANDLER_VMCALL:
hvm_update_short_summary(h, HVM_SHORT_SUMMARY_VMCALL);
break;
case HVM_EVENT_HANDLER_INTR:
hvm_update_short_summary(h, HVM_SHORT_SUMMARY_INTERRUPT);
break;
case HVM_EVENT_HANDLER_HLT:
hvm_update_short_summary(h, HVM_SHORT_SUMMARY_HLT);
break;
default:
hvm_update_short_summary(h, HVM_SHORT_SUMMARY_OTHER);
break;
}
}
if(h->v->cr3.data) {
h->v->cr3.data->run_time += h->arc_cycles;
if(opt.summary_info)
update_cycles(&h->v->cr3.data->hv_time,
h->arc_cycles);
}
}
h->exit_tsc = 0;
h->vmexit_valid = 0;
h->post_process = NULL;
}
void hvm_vmentry_process(struct record_info *ri, struct hvm_data *h) {
if(!h->init)
{
if(opt.dump_all)
printf("!%s vmentry\n",
ri->dump_header);
return;
}
/* Vista bug
* This has to be done here because irqs are injected on the path out
* to vmexit. */
hvm_vlapic_vmentry_cleanup(h->v, ri->tsc);
if(h->w2h.waking && opt.dump_all)
printf(" [w2h] d%dv%d Finishing waking\n",
h->v->d->did, h->v->vid);
h->w2h.waking = 0;
if ( h->w2h.interrupts_wanting_tsc ) {
int i;
for(i=0; i<GUEST_INTERRUPT_MAX; i++)
{
if ( h->summary.guest_interrupt[i].start_tsc == 1 )
{
if(opt.dump_all)
printf(" [w2h] d%dv%d Setting vec %d tsc to %lld\n",
h->v->d->did, h->v->vid, i, ri->tsc);
h->summary.guest_interrupt[i].start_tsc = ri->tsc;
h->w2h.interrupts_wanting_tsc--;
if ( h->w2h.interrupts_wanting_tsc == 0 )
break;
}
}
}
if(!h->vmexit_valid)
{
if(opt.dump_all)
printf("!%s vmentry\n",
ri->dump_header);
return;
}
if(opt.dump_all) {
unsigned long long arc_cycles = ri->tsc - h->exit_tsc;
printf("]%s vmentry cycles %lld %s\n",
ri->dump_header, arc_cycles, (arc_cycles>10000)?"!":"");
}
hvm_close_vmexit(h, ri->tsc);
h->entry_tsc = ri->tsc;
}
void hvm_process(struct pcpu_info *p)
{
struct record_info *ri = &p->ri;
struct vcpu_data *v = p->current;
struct hvm_data *h = &v->hvm;
assert(p->current);
if(vcpu_set_data_type(p->current, VCPU_DATA_HVM))
return;
if(ri->evt.sub == 2)
{
UPDATE_VOLUME(p, hvm[HVM_VOL_HANDLER], ri->size);
hvm_handler_process(ri, h);
}
else
{
switch(ri->event) {
/* HVM */
case TRC_HVM_VMEXIT:
case TRC_HVM_VMEXIT64:
UPDATE_VOLUME(p, hvm[HVM_VOL_VMEXIT], ri->size);
hvm_vmexit_process(ri, h, v);
break;
case TRC_HVM_VMENTRY:
UPDATE_VOLUME(p, hvm[HVM_VOL_VMENTRY], ri->size);
hvm_vmentry_process(ri, &p->current->hvm);
break;
default:
fprintf(warn, "Unknown hvm event: %x\n", ri->event);
}
}
}
void hvm_summary(struct hvm_data *h) {
int i;
if(!h->summary_info)
return;
printf("Exit reasons:\n");
for(i=0; i<h->exit_reason_max; i++) {
struct hvm_summary_handler_node *p;
if ( h->exit_reason_name[i] )
PRINT_SUMMARY(h->summary.exit_reason[i],
" %-20s ", h->exit_reason_name[i]);
else
PRINT_SUMMARY(h->summary.exit_reason[i],
" %20d ", i);
p=h->exit_reason_summary_handler_list[i];
while(p)
{
p->handler(h, p->data);
p=p->next;
}
}
printf("Guest interrupt counts:\n");
for(i=0; i<GUEST_INTERRUPT_MAX; i++)
if(h->summary.guest_interrupt[i].count) {
int j;
printf(" [%3d] %d\n",
i, h->summary.guest_interrupt[i].count);
for(j=1; j<GUEST_INTERRUPT_CASE_MAX; j++) {
char desc[32];
snprintf(desc, 32, " * %s", guest_interrupt_case_name[j]);
print_cycle_summary(h->summary.guest_interrupt[i].runtime+j, desc);
}
}
if(h->summary.guest_interrupt[i].count)
printf(" [%d+] %d\n",
i, h->summary.guest_interrupt[i].count);
if(opt.histogram_interrupt_eip)
{
unsigned max = ((1ULL<summary.extint_histogram[i])
{
printf("[%llx-%llx]: %d\n",
opt.histogram_interrupt_increment * i,
(opt.histogram_interrupt_increment * (i+1)) - 1,
h->summary.extint_histogram[i]);
}
}
PRINT_SUMMARY(h->summary.ipi_latency,
"IPI latency \n");
for(i=0; i<256; i++)
if(h->summary.ipi_count[i])
printf(" [%3d] %10d\n",
i, h->summary.ipi_count[i]);
hvm_io_address_summary(h->summary.io.pio, "IO address summary:");
hvm_io_address_summary(h->summary.io.mmio, "MMIO address summary:");
}
/* ---- Shadow records ---- */
union shadow_event
{
unsigned event;
struct {
unsigned minor:8,
paging_levels:4;
};
};
/* WARNING - not thread safe */
#define FLAGSTRING(_name, _char) \
if(e->flag_ ## _name) \
flagstring[i] = _char; \
i++;
char * flag_string(struct pf_xen_extra *e)
{
static char flagstring[32];
int i=0;
for(i=0; i<32; i++)
flagstring[i]='-';
i=0;
if(e->flag_set_ad)
flagstring[i]='d';
else if(e->flag_set_a)
flagstring[i]='a';
i++;
FLAGSTRING(shadow_l1_get_ref, 'g');
FLAGSTRING(shadow_l1_put_ref, 'p');
//FLAGSTRING(l2_propagate, '2');
FLAGSTRING(demote, 'D');
FLAGSTRING(promote, 'P');
FLAGSTRING(wrmap, 'w');
FLAGSTRING(wrmap_guess_found, 'G');
//FLAGSTRING(wrmap_brute_force, 'b');
FLAGSTRING(early_unshadow, 'e');
FLAGSTRING(prealloc_unhook, 'H');
FLAGSTRING(unsync, 'u');
FLAGSTRING(oos_fixup_add, 'a');
FLAGSTRING(oos_fixup_evict, 'x');
flagstring[i]=0;
return flagstring;
}
void shadow_emulate_postprocess(struct hvm_data *h)
{
struct pf_xen_extra *e = &h->inflight.pf_xen;
if ( opt.summary_info )
{
update_eip(&h->v->d->emulate_eip_list,
h->rip,
h->arc_cycles,
0, NULL);
update_cycles(&h->summary.pf_xen[PF_XEN_EMULATE], h->arc_cycles);
update_cycles(&h->summary.pf_xen_emul[e->pt_level], h->arc_cycles);
if(h->prealloc_unpin)
update_cycles(&h->summary.pf_xen_emul[PF_XEN_EMUL_PREALLOC_UNPIN], h->arc_cycles);
if(e->flag_prealloc_unhook)
update_cycles(&h->summary.pf_xen_emul[PF_XEN_EMUL_PREALLOC_UNHOOK], h->arc_cycles);
if(e->flag_early_unshadow)
update_cycles(&h->summary.pf_xen_emul[PF_XEN_EMUL_EARLY_UNSHADOW], h->arc_cycles);
if(e->flag_set_changed)
update_cycles(&h->summary.pf_xen_emul[PF_XEN_EMUL_SET_CHANGED], h->arc_cycles);
else
update_cycles(&h->summary.pf_xen_emul[PF_XEN_EMUL_SET_UNCHANGED], h->arc_cycles);
if(e->flag_set_flush)
update_cycles(&h->summary.pf_xen_emul[PF_XEN_EMUL_SET_FLUSH], h->arc_cycles);
if(e->flag_set_error)
update_cycles(&h->summary.pf_xen_emul[PF_XEN_EMUL_SET_ERROR], h->arc_cycles);
if(e->flag_promote)
update_cycles(&h->summary.pf_xen_emul[PF_XEN_EMUL_PROMOTE], h->arc_cycles);
if(e->flag_demote)
update_cycles(&h->summary.pf_xen_emul[PF_XEN_EMUL_DEMOTE], h->arc_cycles);
/* more summary info */
hvm_update_short_summary(h, HVM_SHORT_SUMMARY_EMULATE);
}
if(opt.scatterplot_unpin_promote) {
if(e->flag_early_unshadow)
scatterplot_vs_time(h->exit_tsc, -10);
if(h->prealloc_unpin)
scatterplot_vs_time(h->exit_tsc, 0);
if(e->flag_promote) {
if(opt.with_cr3_enumeration) {
if(h->v->cr3.data)
scatterplot_vs_time(h->exit_tsc, h->v->cr3.data->cr3_id);
} else
scatterplot_vs_time(h->exit_tsc, 2);
}
}
}
void shadow_emulate_process(struct record_info *ri, struct hvm_data *h)
{
struct pf_xen_extra *e = &h->inflight.pf_xen;
union {
/* for PAE, guest_l1e may be 64 while guest_va may be 32;
so put it first for alignment sake. */
struct {
unsigned gl1e, write_val;
unsigned va;
unsigned flags:29, emulation_count:3;
} gpl2;
struct {
unsigned long long gl1e, write_val;
unsigned va;
unsigned flags:29, emulation_count:3;
} gpl3;
struct {
unsigned long long gl1e, write_val;
unsigned long long va;
unsigned flags:29, emulation_count:3;
} gpl4;
} *r = (typeof(r))ri->d;
union shadow_event sevt = { .event = ri->event };
int rec_gpl = sevt.paging_levels + 2;
if ( rec_gpl != h->v->guest_paging_levels )
{
fprintf(warn, "%s: record paging levels %d, guest paging levels %d. Switching.\n",
__func__, rec_gpl, h->v->guest_paging_levels);
h->v->guest_paging_levels = rec_gpl;
}
/* Fill in extended information */
switch(rec_gpl)
{
case 2:
if(sizeof(r->gpl2) != ri->extra_words * 4)
{
fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
__func__, sizeof(r->gpl2), h->v->guest_paging_levels,
ri->extra_words * 4);
error(ERR_RECORD, ri);
return;
}
e->va = r->gpl2.va;
e->flags = r->gpl2.flags;
e->gl1e = r->gpl2.gl1e;
e->wval = r->gpl2.write_val;
break;
case 3:
if(sizeof(r->gpl3) != ri->extra_words * 4)
{
fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
__func__, sizeof(r->gpl3), h->v->guest_paging_levels,
ri->extra_words * 4);
error(ERR_RECORD, ri);
return;
}
e->va = r->gpl3.va;
e->flags = r->gpl3.flags;
e->gl1e = r->gpl3.gl1e;
e->wval = r->gpl3.write_val;
break;
case 4:
if(sizeof(r->gpl4) != ri->extra_words * 4)
{
fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
__func__, sizeof(r->gpl4), h->v->guest_paging_levels,
ri->extra_words * 4);
error(ERR_RECORD, ri);
return;
}
e->va = r->gpl4.va;
e->flags = r->gpl4.flags;
e->gl1e = r->gpl4.gl1e;
e->wval = r->gpl4.write_val;
break;
}
pf_preprocess(e,rec_gpl);
if(opt.dump_all)
printf("]%s emulate va %llx gl1e %8llx wval %8llx flags %s(%x) pt_level %d corr %8llx\n",
ri->dump_header,
e->va,
e->gl1e, e->wval,
flag_string(e), e->flags,
e->pt_level, e->corresponding_va);
if ( hvm_set_postprocess(h, shadow_emulate_postprocess) )
fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
}
struct shadow_emulate_other {
unsigned long long gfn, va;
};
#define SHADOW_OTHER_LOGS_GFN_NOT_GMFN 1
void shadow_parse_other(struct record_info *ri,
struct shadow_emulate_other *o,
struct hvm_data *h) {
union {
/* For PAE, the gfn may be 64 bits while the va is 32; so put
the gfn first for alignment's sake. */
#if SHADOW_OTHER_LOGS_GFN_NOT_GMFN
/* D'OH! Accidentally used mfn_t in the struct, so gmfns are always
64-bit... :-/ */
struct {
unsigned int gfn, va;
} gpl2;
#endif
struct {
unsigned long long gfn;
unsigned int va;
} gpl3;
struct {
unsigned long long gfn, va;
} gpl4;
} *r = (typeof(r))ri->d;
union shadow_event sevt = { .event = ri->event };
int rec_gpl = sevt.paging_levels + 2;
if ( rec_gpl != h->v->guest_paging_levels )
{
fprintf(warn, "%s: record paging levels %d, guest paging levels %d. Switching.\n",
__func__, rec_gpl, h->v->guest_paging_levels);
h->v->guest_paging_levels = rec_gpl;
}
switch(rec_gpl)
{
#if SHADOW_OTHER_LOGS_GFN_NOT_GMFN
case 2:
if(sizeof(r->gpl2) != ri->extra_words * 4)
{
fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
__func__, sizeof(r->gpl2), rec_gpl,
ri->extra_words * 4);
error(ERR_RECORD, ri);
return;
}
o->va = r->gpl2.va;
o->gfn = r->gpl2.gfn;
break;
#else
case 2:
/* FALLTHRU */
#endif
case 3:
if(sizeof(r->gpl3) != ri->extra_words * 4)
{
fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
__func__, sizeof(r->gpl3), rec_gpl,
ri->extra_words * 4);
error(ERR_RECORD, ri);
return;
}
o->va = r->gpl3.va;
o->gfn = r->gpl3.gfn;
break;
case 4:
if(sizeof(r->gpl4) != ri->extra_words * 4)
{
fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
__func__, sizeof(r->gpl4), rec_gpl,
ri->extra_words * 4);
error(ERR_RECORD, ri);
return;
}
o->va = r->gpl4.va;
o->gfn = r->gpl4.gfn;
break;
}
}
#if 0
void shadow_unsync_postprocess(struct hvm_data *h)
{
struct pf_xen_extra *e = &h->inflight.pf_xen;
if(h->resyncs > 1)
fprintf(warn, "Strange, %d resyncs for an unsync!\n",
h->resyncs);
if(opt.summary_info) {
update_cycles(&h->summary.pf_xen[PF_XEN_EMULATE_UNSYNC],
h->arc_cycles);
if(h->resyncs <= 1)
update_cycles(&h->summary.pf_xen_unsync[h->resyncs],
h->arc_cycles);
}
}
void shadow_unsync_process(struct record_info *ri, struct hvm_data *h)
{
struct pf_xen_extra *e = &h->inflight.pf_xen;
struct shadow_emulate_other r;
shadow_parse_other(ri, &r, h);
e->gmfn = r.gmfn;
e->va = r.va;
pf_preprocess(e, h->v->guest_paging_levels);
if(opt.dump_all)
printf("]%s shadow unsync gmfn %llx va %llx pt_level %d corr %llx\n",
ri->dump_header,
e->gmfn,
e->va,
e->pt_level,
e->corresponding_va);
if ( hvm_set_postprocess(h, shadow_unsync_postprocess) )
fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
}
#endif
void shadow_fault_generic_postprocess(struct hvm_data *h);
void shadow_emulate_other_process(struct record_info *ri, struct hvm_data *h)
{
struct pf_xen_extra *e = &h->inflight.pf_xen;
struct shadow_emulate_other r;
union shadow_event sevt = { .event = ri->event };
shadow_parse_other(ri, &r, h);
e->gfn = r.gfn;
e->va = r.va;
e->pf_case = sevt.minor;
pf_preprocess(e, h->v->guest_paging_levels);
if(opt.dump_all)
printf("]%s shadow %s gfn %llx va %llx\n",
ri->dump_header,
pf_xen_name[sevt.minor],
e->gfn,
e->va);
if ( hvm_set_postprocess(h, shadow_fault_generic_postprocess) )
fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
}
void shadow_fixup_postprocess(struct hvm_data *h)
{
struct pf_xen_extra *e = &h->inflight.pf_xen;
if ( opt.summary_info )
{
update_cycles(&h->summary.pf_xen[PF_XEN_FIXUP], h->arc_cycles);
if(h->prealloc_unpin) {
update_cycles(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_PREALLOC_UNPIN], h->arc_cycles);
}
if(e->flag_unsync) {
update_cycles(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_UNSYNC], h->arc_cycles);
if(h->resyncs < PF_XEN_FIXUP_UNSYNC_RESYNC_MAX)
update_cycles(&h->summary.pf_xen_fixup_unsync_resync[h->resyncs],
h->arc_cycles);
else
update_cycles(&h->summary.pf_xen_fixup_unsync_resync[PF_XEN_FIXUP_UNSYNC_RESYNC_MAX],
h->arc_cycles);
}
if(e->flag_oos_fixup_add)
update_cycles(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_OOS_ADD], h->arc_cycles);
if(e->flag_oos_fixup_evict)
update_cycles(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_OOS_EVICT], h->arc_cycles);
if(e->flag_promote)
update_cycles(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_PROMOTE], h->arc_cycles);
if(e->flag_wrmap) {
update_cycles(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_WRMAP], h->arc_cycles);
if(e->flag_wrmap_brute_force || h->wrmap_bf)
update_cycles(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_BRUTE_FORCE], h->arc_cycles);
} else if(e->flag_wrmap_brute_force || h->wrmap_bf) {
fprintf(warn, "Strange: wrmap_bf but not wrmap!\n");
}
if(!(e->flag_promote || h->prealloc_unpin || e->flag_unsync))
update_cycles(&h->summary.pf_xen_fixup[PF_XEN_FIXUP_UPDATE_ONLY], h->arc_cycles);
/* more summary info */
if(e->flag_unsync)
hvm_update_short_summary(h, HVM_SHORT_SUMMARY_UNSYNC);
else
hvm_update_short_summary(h, HVM_SHORT_SUMMARY_FIXUP);
}
if(opt.scatterplot_unpin_promote) {
if(h->prealloc_unpin)
scatterplot_vs_time(h->exit_tsc, 0);
if(e->flag_promote) {
if(opt.with_cr3_enumeration) {
if(h->v->cr3.data)
scatterplot_vs_time(h->exit_tsc, h->v->cr3.data->cr3_id);
} else
scatterplot_vs_time(h->exit_tsc, 2);
}
}
}
void shadow_fixup_process(struct record_info *ri, struct hvm_data *h)
{
struct pf_xen_extra *e = &h->inflight.pf_xen;
union {
/* For PAE, guest_l1e may be 64 bits while guest_va may be 32;
so put gl1e first for alignment's sake. */
struct {
unsigned int gl1e, va, flags;
} gpl2;
struct {
unsigned long long gl1e;
unsigned int va, flags;
} gpl3;
struct {
unsigned long long gl1e, va;
unsigned int flags;
} gpl4;
} *r = (typeof(r))ri->d;
union shadow_event sevt = { .event = ri->event };
int rec_gpl = sevt.paging_levels + 2;
if ( rec_gpl != h->v->guest_paging_levels )
{
fprintf(warn, "%s: record paging levels %d, guest paging levels %d. Switching.\n",
__func__, rec_gpl, h->v->guest_paging_levels);
h->v->guest_paging_levels = rec_gpl;
}
switch(rec_gpl)
{
case 2:
if(sizeof(r->gpl2) != ri->extra_words * 4)
{
fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
__func__, sizeof(r->gpl2), h->v->guest_paging_levels,
ri->extra_words * 4);
error(ERR_RECORD, ri);
return;
}
e->va = r->gpl2.va;
e->flags = r->gpl2.flags;
e->gl1e = r->gpl2.gl1e;
break;
case 3:
if(sizeof(r->gpl3) != ri->extra_words * 4)
{
fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
__func__, sizeof(r->gpl3), h->v->guest_paging_levels,
ri->extra_words * 4);
error(ERR_RECORD, ri);
return;
}
e->va = r->gpl3.va;
e->flags = r->gpl3.flags;
e->gl1e = r->gpl3.gl1e;
break;
case 4:
if(sizeof(r->gpl4) != ri->extra_words * 4)
{
fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
__func__, sizeof(r->gpl4), h->v->guest_paging_levels,
ri->extra_words * 4);
error(ERR_RECORD, ri);
return;
}
e->va = r->gpl4.va;
e->flags = r->gpl4.flags;
e->gl1e = r->gpl4.gl1e;
break;
}
pf_preprocess(e,rec_gpl);
if(opt.dump_all)
{
if ( e->flag_unsync )
printf("]%s fixup:unsync va %llx gl1e %llx corr %llx flags (%x)%s\n",
ri->dump_header,
e->va, e->gl1e,
e->corresponding_va,
e->flags,
flag_string(e));
else
printf("]%s fixup va %llx gl1e %llx flags (%x)%s\n",
ri->dump_header,
e->va, e->gl1e, e->flags,
flag_string(e));
}
if ( hvm_set_postprocess(h, shadow_fixup_postprocess) )
fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
}
void shadow_mmio_postprocess(struct hvm_data *h)
{
struct pf_xen_extra *e = &h->inflight.pf_xen;
if ( opt.summary_info )
{
if(e->pf_case)
update_cycles(&h->summary.pf_xen[e->pf_case],
h->arc_cycles);
else
fprintf(warn, "Strange, pf_case 0!\n");
hvm_update_short_summary(h, HVM_SHORT_SUMMARY_MMIO);
}
if(opt.with_mmio_enumeration)
enumerate_mmio(h);
}
void shadow_mmio_process(struct record_info *ri, struct hvm_data *h)
{
struct pf_xen_extra *e = &h->inflight.pf_xen;
struct mmio_info *m = &h->inflight.mmio;
union {
/* For PAE, the va is still 32 bits, so 2- and 3-level guests
share the gpl2 layout below. */
struct {
unsigned int va;
} gpl2;
struct {
unsigned long long va;
} gpl4;
} *r = (typeof(r))ri->d;
union shadow_event sevt = { .event = ri->event };
int rec_gpl = sevt.paging_levels + 2;
if ( rec_gpl != h->v->guest_paging_levels )
{
fprintf(warn, "%s: record paging levels %d, guest paging levels %d. Switching.\n",
__func__, rec_gpl, h->v->guest_paging_levels);
h->v->guest_paging_levels = rec_gpl;
}
switch(rec_gpl)
{
case 2:
case 3:
if(sizeof(r->gpl2) != ri->extra_words * 4)
{
fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
__func__, sizeof(r->gpl2), h->v->guest_paging_levels,
ri->extra_words * 4);
error(ERR_RECORD, ri);
return;
}
e->va = m->va = r->gpl2.va;
break;
case 4:
if(sizeof(r->gpl4) != ri->extra_words * 4)
{
fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
__func__, sizeof(r->gpl4), h->v->guest_paging_levels,
ri->extra_words * 4);
error(ERR_RECORD, ri);
return;
}
e->va = m->va = r->gpl4.va;
break;
}
if(opt.dump_all)
printf("]%s %smmio va %llx\n",
ri->dump_header,
(e->pf_case==PF_XEN_FAST_MMIO)?"fast ":"",
e->va);
if ( hvm_set_postprocess(h, shadow_mmio_postprocess) )
fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
}
void shadow_propagate_postprocess(struct hvm_data *h)
{
struct pf_xen_extra *e = &h->inflight.pf_xen;
if ( opt.summary_info )
{
if(e->pf_case)
update_cycles(&h->summary.pf_xen[e->pf_case],
h->arc_cycles);
else
fprintf(warn, "Strange, pf_case 0!\n");
hvm_update_short_summary(h, HVM_SHORT_SUMMARY_PROPAGATE);
}
}
void shadow_propagate_process(struct record_info *ri, struct hvm_data *h)
{
struct pf_xen_extra *e = &h->inflight.pf_xen;
union {
/* For PAE, guest_l1e may be 64 bits while guest_va may be 32;
so put gl1e first for alignment's sake. */
struct {
unsigned int gl1e, va, flags;
} gpl2;
struct {
unsigned long long gl1e;
unsigned int va, flags;
} gpl3;
struct {
unsigned long long gl1e, va;
unsigned int flags;
} gpl4;
} *r = (typeof(r))ri->d;
union shadow_event sevt = { .event = ri->event };
int rec_gpl = sevt.paging_levels + 2;
if ( rec_gpl != h->v->guest_paging_levels )
{
fprintf(warn, "%s: record paging levels %d, guest paging levels %d. Switching.\n",
__func__, rec_gpl, h->v->guest_paging_levels);
h->v->guest_paging_levels = rec_gpl;
}
switch(rec_gpl)
{
case 2:
if(sizeof(r->gpl2) != ri->extra_words * 4)
{
fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
__func__, sizeof(r->gpl2), h->v->guest_paging_levels,
ri->extra_words * 4);
error(ERR_RECORD, ri);
return;
}
e->va = r->gpl2.va;
e->flags = r->gpl2.flags;
e->gl1e = r->gpl2.gl1e;
break;
case 3:
if(sizeof(r->gpl3) != ri->extra_words * 4)
{
fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
__func__, sizeof(r->gpl3), h->v->guest_paging_levels,
ri->extra_words * 4);
error(ERR_RECORD, ri);
return;
}
e->va = r->gpl3.va;
e->flags = r->gpl3.flags;
e->gl1e = r->gpl3.gl1e;
break;
case 4:
if(sizeof(r->gpl4) != ri->extra_words * 4)
{
fprintf(warn, "%s: expected %zd bytes for %d-level guest, got %d!\n",
__func__, sizeof(r->gpl4), h->v->guest_paging_levels,
ri->extra_words * 4);
error(ERR_RECORD, ri);
return;
}
e->va = r->gpl4.va;
e->flags = r->gpl4.flags;
e->gl1e = r->gpl4.gl1e;
break;
}
if(opt.dump_all)
printf("]%s propagate va %llx gl1e %llx flags (%x)%s\n",
ri->dump_header,
e->va, e->gl1e, e->flags,
flag_string(e));
if ( hvm_set_postprocess(h, shadow_propagate_postprocess) )
fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
}
void shadow_fault_generic_dump(unsigned int event, uint32_t *d, char *prefix,
char * dump_header)
{
char *evt_string, evt_number[10];
union shadow_event sevt = { .event = event };
int i;
if(sevt.minor < PF_XEN_MAX && pf_xen_name[sevt.minor])
{
evt_string = pf_xen_name[sevt.minor];
}
else
{
snprintf(evt_number, 10, "%d", sevt.minor);
evt_string = evt_number;
}
printf("%s%s shadow %s gl %d [",
prefix,
dump_header,
evt_string,
sevt.paging_levels);
for(i=0; i<4; i++)
{
printf(" %x", d[i]);
}
printf(" ]\n");
}
void shadow_fault_generic_postprocess(struct hvm_data *h)
{
struct pf_xen_extra *e = &h->inflight.pf_xen;
if ( e->pf_case < PF_XEN_NOT_SHADOW || e->pf_case > PF_XEN_LAST_FAULT )
{
fprintf(warn, "%s: Strange, unexpected case %d\n",
__func__, e->pf_case);
return;
}
if(opt.summary_info) {
update_cycles(&h->summary.pf_xen[e->pf_case],
h->arc_cycles);
hvm_update_short_summary(h, HVM_SHORT_SUMMARY_PROPAGATE);
}
}
void shadow_fault_generic_process(struct record_info *ri, struct hvm_data *h)
{
union shadow_event sevt = { .event = ri->event };
/* pf-case traces, vs others */
h->inflight.generic.event = ri->event;
bcopy(ri->d, h->inflight.generic.d, sizeof(unsigned int) * 4);
if(opt.dump_all)
shadow_fault_generic_dump(h->inflight.generic.event,
h->inflight.generic.d,
"]", ri->dump_header);
h->inflight.pf_xen.pf_case = sevt.minor;
if ( hvm_set_postprocess(h, shadow_fault_generic_postprocess) )
fprintf(warn, "%s: Strange, postprocess already set\n", __func__);
}
void shadow_resync_process(struct record_info *ri, struct hvm_data *h)
{
struct {
unsigned long long gfn;
} *r = (typeof(r))ri->d;
if(opt.dump_all)
printf(" %s oos resync %s gfn %llx\n",
ri->dump_header,
(ri->event == TRC_SHADOW_RESYNC_FULL)?"full":"only",
r->gfn);
h->resyncs++;
}
void shadow_prealloc_unpin_process(struct record_info *ri, struct hvm_data *h) {
struct {
unsigned long long gfn;
} *r = (typeof(r))ri->d;
if(opt.dump_all)
printf(" %s prealloc-unpin gfn %llx\n",
ri->dump_header, r->gfn);
if(h->prealloc_unpin)
fprintf(warn, "Strange, more than one prealloc_unpin per arc!\n");
h->prealloc_unpin = 1;
if(opt.with_cr3_enumeration)
cr3_prealloc_unpin(h->v, r->gfn);
}
void shadow_wrmap_bf_process(struct record_info *ri, struct hvm_data *h) {
struct {
unsigned long long gfn;
} *r = (typeof(r))ri->d;
if(opt.dump_all)
printf(" %s wrmap-bf gfn %llx\n",
ri->dump_header, r->gfn);
h->wrmap_bf = 1;
}
void shadow_process(struct pcpu_info *p)
{
struct record_info *ri = &p->ri;
struct hvm_data *h;
union shadow_event sevt = { .event = ri->event };
int gpl = sevt.paging_levels + 2;
assert(p->current);
if(vcpu_set_data_type(p->current, VCPU_DATA_HVM))
return;
h = &p->current->hvm;
if(!h->init || !h->vmexit_valid)
{
if(opt.dump_all)
shadow_fault_generic_dump(ri->event,
ri->d,
"!", ri->dump_header);
return;
}
if(sevt.minor <= PF_XEN_NOT_SHADOW) {
if(p->current->guest_paging_levels != gpl)
{
fprintf(warn, "%s: Changing guest paging levels to %d\n",
__func__, gpl);
p->current->guest_paging_levels = gpl;
}
}
if(sevt.minor <= PF_XEN_LAST_FAULT) {
h->inflight.pf_xen.pf_case = sevt.minor;
if(opt.summary) {
hvm_set_summary_handler(h, hvm_pf_xen_summary, NULL);
}
}
/* FIXME - mask out paging levels */
switch(sevt.minor)
{
case PF_XEN_NOT_SHADOW:
shadow_propagate_process(ri, h);
break;
case PF_XEN_EMULATE:
shadow_emulate_process(ri, h);
break;
case PF_XEN_FIXUP:
shadow_fixup_process(ri, h);
break;
case PF_XEN_MMIO:
case PF_XEN_FAST_MMIO:
shadow_mmio_process(ri, h);
break;
case PF_XEN_EMULATE_UNSHADOW_USER:
case PF_XEN_EMULATE_UNSHADOW_EVTINJ:
case PF_XEN_EMULATE_UNSHADOW_UNHANDLED:
shadow_emulate_other_process(ri, h);
break;
#if 0
case PF_XEN_EMULATE_UNSYNC:
shadow_unsync_process(ri, h);
break;
#endif
case SHADOW_RESYNC_FULL:
case SHADOW_RESYNC_ONLY:
shadow_resync_process(ri, h);
break;
case SHADOW_PREALLOC_UNPIN:
shadow_prealloc_unpin_process(ri, h);
break;
case SHADOW_WRMAP_BF:
shadow_wrmap_bf_process(ri, h);
break;
default:
if(sevt.minor <= PF_XEN_LAST_FAULT) {
shadow_fault_generic_process(ri, h);
} else {
warn_once("Warning: processing shadow as generic\n");
process_generic(ri);
}
break;
}
}
/* ---- PV guests ---- */
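/* PV event codes carry the minor event type in the low byte plus an
x64 flag indicating the record was logged from 64-bit context, which
widens the eip/addr fields in the record layouts below. */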
union pv_event {
unsigned event;
struct {
unsigned minor:8,
x64:1,
unused1:3,
sub:4,
main:12,
unused:4;
};
};
void pv_hypercall_process(struct record_info *ri, struct pv_data *pv) {
union {
struct {
uint32_t eip, eax;
} x32;
struct {
uint64_t eip;
uint32_t eax;
} x64;
} * r = (typeof(r)) ri->d;
union pv_event pevt = { .event = ri->event };
unsigned long long eip;
unsigned int eax;
if(pevt.x64) {
eip = r->x64.eip;
eax = r->x64.eax;
} else {
eip = r->x32.eip;
eax = r->x32.eax;
}
if(opt.summary_info) {
if(eax < PV_HYPERCALL_MAX)
pv->hypercall_count[eax]++;
}
if(opt.dump_all) {
if(eax < HYPERCALL_MAX)
printf(" %s hypercall %2x (%s) eip %llx\n",
ri->dump_header, eax,
hypercall_name[eax], eip);
else
printf(" %s hypercall %x eip %llx\n",
ri->dump_header, eax, eip);
}
}
void pv_trap_process(struct record_info *ri, struct pv_data *pv) {
union {
struct {
unsigned int eip;
unsigned trapnr:15,
use_error_code:1,
error_code:16;
} x32;
struct {
unsigned long long eip;
unsigned trapnr:15,
use_error_code:1,
error_code:16;
} x64;
} * r = (typeof(r)) ri->d;
union pv_event pevt = { .event = ri->event };
unsigned long long eip;
unsigned trapnr, use_ec, ec;
if(pevt.x64) {
eip = r->x64.eip;
trapnr = r->x64.trapnr;
use_ec = r->x64.use_error_code;
ec = r->x64.error_code;
} else {
eip = r->x32.eip;
trapnr = r->x32.trapnr;
use_ec = r->x32.use_error_code;
ec = r->x32.error_code;
}
if(opt.summary_info) {
if(trapnr < PV_TRAP_MAX)
pv->trap_count[trapnr]++;
}
if(opt.dump_all) {
printf(" %s trap %x eip %llx",
ri->dump_header, trapnr, eip);
if(use_ec)
printf(" ec %x\n", ec);
else
printf("\n");
}
}
void pv_ptwr_emulation_process(struct record_info *ri, struct pv_data *pv) {
union pv_event pevt = { .event = ri->event };
union {
/* gpl2 is deprecated */
struct {
unsigned long long pte;
unsigned int addr, eip;
} gpl3;
struct {
unsigned long long pte;
unsigned long long addr, eip;
} gpl4;
} *r = (typeof(r))ri->d;
struct {
unsigned long long pte, addr, eip;
} e;
switch ( pevt.minor ) {
case PV_PTWR_EMULATION_PAE:
if ( pevt.x64 )
{
fprintf(warn, "Strange: PV_PTWR_EMULATION, but x64! %x\n",
ri->event);
error(ERR_RECORD, ri);
}
e.pte = r->gpl3.pte;
e.addr = r->gpl3.addr;
e.eip = r->gpl3.eip;
break;
case PV_PTWR_EMULATION:
if ( !pevt.x64 )
{
fprintf(warn, "Strange: PV_PTWR_EMULATION, but !x64! %x\n",
ri->event);
error(ERR_RECORD, ri);
}
e.pte = r->gpl4.pte;
e.addr = r->gpl4.addr;
e.eip = r->gpl4.eip;
break;
default:
fprintf(warn, "ERROR: Unknown PV_PTRW minor type %d!\n",
pevt.minor);
error(ERR_RECORD, ri);
return;
}
if ( opt.dump_all )
{
printf(" %s ptwr l1e %llx eip %llx addr %llx\n",
ri->dump_header,
e.pte, e.eip, e.addr);
}
}
void pv_generic_process(struct record_info *ri, struct pv_data *pv) {
union pv_event pevt = { .event = ri->event };
if ( opt.dump_all ) {
if(pevt.minor < PV_MAX && pv_name[pevt.minor])
printf(" %s %s",
ri->dump_header,
pv_name[pevt.minor]);
else
printf(" %s PV-%d ",
ri->dump_header, pevt.minor);
if (ri->extra_words) {
int i;
printf("[ ");
for(i=0; i<ri->extra_words; i++) {
printf("%x ", (unsigned)ri->d[i]);
}
printf("]");
}
printf("\n");
}
}
void pv_summary(struct pv_data *pv) {
int i, j;
if(!pv->summary_info)
return;
printf("PV events:\n");
for(i=0; i<PV_MAX; i++) {
int count = pv->count[i];
if (i == PV_HYPERCALL_V2)
count += pv->count[PV_HYPERCALL_SUBCALL];
if (count == 0)
continue;
printf(" %s %d\n", pv_name[i], count);
switch(i) {
case PV_HYPERCALL:
case PV_HYPERCALL_V2:
for(j=0; j<PV_HYPERCALL_MAX; j++) {
if(pv->hypercall_count[j])
printf(" %-29s[%2d]: %6d\n",
hypercall_name[j],
j,
pv->hypercall_count[j]);
}
break;
case PV_TRAP:
for(j=0; j<PV_TRAP_MAX; j++) {
if(pv->trap_count[j])
printf(" [%d] %d\n",
j, pv->trap_count[j]);
}
break;
}
}
}
static const char *grant_table_op_str[] = {
"map_grant_ref", "unmap_grant_ref", "setup_table", "dump_table",
"transfer", "copy", "query_size", "unmap_and_replace",
"set_version", "get_status_frames", "get_version", "swap_grant_ref",
};
static const char *vcpu_op_str[] = {
"initialise", "up", "down", "is_up", "get_runstate_info",
"register_runstate_memory_area", "set_periodic_timer",
"stop_periodic_timer", "set_singleshot_timer", "stop_singleshot_timer",
"register_vcpu_info", "send_nmi", "get_physid",
"register_vcpu_time_memory_area",
};
static const char *sched_op_str[] = {
"yield", "block", "shutdown", "poll", "remote_shutdown", "shutdown_code",
"watchdog",
};
static const char *cmd_to_str(const char *strings[], size_t n, uint32_t cmd)
{
static char buf[32];
if (cmd < n)
return strings[cmd];
snprintf(buf, sizeof(buf), "unknown (%d)", cmd);
return buf;
}
#define CMD_TO_STR(op) \
static const char * op ## _to_str(uint32_t cmd) { \
return cmd_to_str(op ## _str, ARRAY_SIZE(op ## _str), cmd); \
}
CMD_TO_STR(grant_table_op);
CMD_TO_STR(vcpu_op);
CMD_TO_STR(sched_op);
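/* V2 hypercall records pack a variable number of argument words after
the first (op) word. pv_hypercall_arg_present() is assumed to return
ARG_MISSING (0), ARG_32BIT (1) or ARG_64BIT (2) -- i.e. exactly the
number of 32-bit words the argument occupies -- which is why
"word += present" below steps through the record correctly. For
example, a record whose args are (64-bit, 32-bit) lays out as:
d[0]=op, d[1]=arg0.lo, d[2]=arg0.hi, d[3]=arg1. */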
void pv_hypercall_gather_args(const struct record_info *ri, uint64_t *args)
{
int i, word;
/* Missing arguments are zeroed. */
memset(args, 0, 6 * sizeof(uint64_t));
for (i = 0, word = 1; i < 6 && word < ri->extra_words; i++) {
int present = pv_hypercall_arg_present(ri, i);
switch (present) {
case ARG_32BIT:
args[i] = ri->d[word];
break;
case ARG_64BIT:
args[i] = ((uint64_t)ri->d[word + 1] << 32) | ri->d[word];
break;
}
/* Skip over any words for this argument. */
word += present;
}
}
static void pv_hypercall_print_args(const struct record_info *ri)
{
int i, word;
for (i = 0, word = 1; i < 6 && word < ri->extra_words; i++) {
int present = pv_hypercall_arg_present(ri, i);
switch (present) {
case ARG_MISSING:
printf(" ??");
break;
case ARG_32BIT:
printf(" %08x", ri->d[word]);
break;
case ARG_64BIT:
printf(" %016"PRIu64"", ((uint64_t)ri->d[word + 1] << 32) | ri->d[word]);
break;
}
word += present;
}
}
void pv_hypercall_v2_process(struct record_info *ri, struct pv_data *pv,
const char *indent)
{
int op = pv_hypercall_op(ri);
if(opt.summary_info) {
if(op < PV_HYPERCALL_MAX)
pv->hypercall_count[op]++;
}
if(opt.dump_all) {
uint64_t args[6];
if(op < HYPERCALL_MAX)
printf(" %s%s hypercall %2x (%s)",
ri->dump_header, indent, op, hypercall_name[op]);
else
printf(" %s%s hypercall %2x",
ri->dump_header, indent, op);
switch(op) {
case HYPERCALL_mmu_update:
pv_hypercall_gather_args(ri, args);
printf(" %d updates%s", (uint32_t)args[1] & ~MMU_UPDATE_PREEMPTED,
(args[1] & MMU_UPDATE_PREEMPTED) ? " (preempted)" : "");
break;
case HYPERCALL_multicall:
pv_hypercall_gather_args(ri, args);
printf(" %d calls", (uint32_t)args[1]);
break;
case HYPERCALL_grant_table_op:
pv_hypercall_gather_args(ri, args);
printf(" %s %d ops", grant_table_op_to_str(args[0]), (uint32_t)args[2]);
break;
case HYPERCALL_vcpu_op:
pv_hypercall_gather_args(ri, args);
printf(" %s vcpu %d", vcpu_op_to_str(args[0]), (uint32_t)args[1]);
break;
case HYPERCALL_mmuext_op:
pv_hypercall_gather_args(ri, args);
printf(" %d ops", (uint32_t)args[1]);
break;
case HYPERCALL_sched_op:
pv_hypercall_gather_args(ri, args);
printf(" %s", sched_op_to_str(args[0]));
break;
default:
pv_hypercall_print_args(ri);
break;
}
printf("\n");
}
}
void pv_process(struct pcpu_info *p)
{
struct record_info *ri = &p->ri;
struct vcpu_data *v = p->current;
struct pv_data *pv = &v->pv;
union pv_event pevt = { .event = ri->event };
if(vcpu_set_data_type(p->current, VCPU_DATA_PV))
return;
if(opt.summary_info) {
pv->summary_info=1;
if(pevt.minor == PV_PTWR_EMULATION_PAE)
pv->count[PV_PTWR_EMULATION]++;
else
pv->count[pevt.minor]++;
}
switch(pevt.minor)
{
case PV_HYPERCALL:
pv_hypercall_process(ri, pv);
break;
case PV_TRAP:
pv_trap_process(ri, pv);
break;
case PV_PTWR_EMULATION:
case PV_PTWR_EMULATION_PAE:
pv_ptwr_emulation_process(ri, pv);
break;
case PV_HYPERCALL_V2:
pv_hypercall_v2_process(ri, pv, "");
break;
case PV_HYPERCALL_SUBCALL:
pv_hypercall_v2_process(ri, pv, " ");
break;
default:
pv_generic_process(ri, pv);
break;
}
}
/* ---- Schedule ---- */
struct vcpu_data * vcpu_create(struct domain_data *d, int vid)
{
struct vcpu_data *v;
assert(d->vcpu[vid] == NULL);
fprintf(warn, "Creating vcpu %d for dom %d\n", vid, d->did);
if((v=malloc(sizeof(*v)))==NULL)
{
fprintf(stderr, "%s: malloc %zd failed!\n", __func__, sizeof(*d));
error(ERR_SYSTEM, NULL);
}
bzero(v, sizeof(*v));
v->vid = vid;
v->d = d;
v->p = NULL;
v->runstate.state = RUNSTATE_INIT;
v->runstate.last_oldstate.wrong = RUNSTATE_INIT;
d->vcpu[vid] = v;
assert(v == v->d->vcpu[v->vid]);
if(vid > d->max_vid)
d->max_vid = vid;
return v;
}
/* Called by both domain_create and sched_default_domain_init */
void domain_init(struct domain_data *d, int did)
{
bzero(d, sizeof(*d));
d->did = did;
d->next = NULL;
if(opt.interval.check == INTERVAL_CHECK_DOMAIN)
interval_domain_value_check(d);
}
struct domain_data * domain_create(int did)
{
struct domain_data *d;
fprintf(warn, "Creating domain %d\n", did);
if((d=malloc(sizeof(*d)))==NULL)
{
fprintf(stderr, "%s: malloc %zd failed!\n", __func__, sizeof(*d));
error(ERR_SYSTEM, NULL);
}
/* Initialize domain & vcpus */
domain_init(d, did);
return d;
}
struct domain_data * domain_find(int did)
{
struct domain_data *d, *n, **q;
/* Look for domain, keeping track of the last pointer so we can add
a domain if we need to. */
for ( d = domain_list, q=&domain_list ;
d && (d->did < did) ;
q = &d->next, d=d->next ) ;
if(d && d->did == did)
return d;
/* Make a new domain */
n = domain_create(did);
/* Insert it into the list */
n->next = d;
*q = n;
return n;
}
struct vcpu_data * vcpu_find(int did, int vid)
{
struct domain_data *d;
struct vcpu_data *v;
d = domain_find(did);
v = d->vcpu[vid];
if(!v)
v = vcpu_create(d, vid);
return v;
}
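/* Close out the current accounting period on this pcpu: everything
since p->time.tsc was either idle time (idle domain) or run time. */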
void pcpu_runstate_update(struct pcpu_info *p, tsc_t tsc)
{
if ( p->time.tsc )
{
if ( p->current->d->did == IDLE_DOMAIN )
update_cycles(&p->time.idle, tsc - p->time.tsc);
else
update_cycles(&p->time.running, tsc - p->time.tsc);
p->time.tsc = 0;
}
}
void vcpu_prev_update(struct pcpu_info *p, struct vcpu_data *prev,
tsc_t tsc, int new_runstate)
{
assert(prev == prev->d->vcpu[prev->vid]);
if(prev->p != p)
{
fprintf(warn, "Strange, sched_switch on pcpu %d, prev->pcpu %d!\n",
p->pid, prev->p->pid);
prev->runstate.tsc = 0;
goto set;
}
//assert(p->current);
if ( !p->current )
{
fprintf(warn, "%s: FATAL: p->current NULL!\n", __func__);
error(ERR_ASSERT, NULL);
}
if(p->current != prev)
{
fprintf(warn, "Strange, sched_switch prev d%dv%d, pcpu %d current d%dv%d!\n",
prev->d->did, prev->vid,
p->pid, p->current->d->did, p->current->vid);
prev->runstate.tsc = 0;
goto set;
}
if(prev->runstate.state != RUNSTATE_RUNNING)
{
fprintf(warn, "Strange, prev d%dv%d not running!\n",
prev->d->did, prev->vid);
prev->runstate.tsc = 0;
goto set;
}
set:
pcpu_runstate_update(p, tsc);
p->current = NULL;
pcpu_string_draw(p);
runstate_update(prev, new_runstate, tsc);
}
void vcpu_next_update(struct pcpu_info *p, struct vcpu_data *next, tsc_t tsc)
{
assert(next == next->d->vcpu[next->vid]);
//assert(p->current == NULL);
if ( p->current != NULL )
{
if ( p->lost_record.seen_valid_schedule == 0 )
{
fprintf(warn, "%s: p->current non-NULL, but seen_valid_schedule 0. Ignoring.\n",
__func__);
runstate_update(p->current, RUNSTATE_LOST, tsc);
p->current = NULL;
}
else
{
fprintf(warn, "%s: FATAL: p->current not NULL! (d%dv%d, runstate %s)\n",
__func__,
p->current->d->did,
p->current->vid,
runstate_name[p->current->runstate.state]);
error(ERR_ASSERT, NULL);
}
}
if(next->activated)
{
/* We may get lost records at start-of-day, so ignore
setting runstate of default vcpus */
if(next->runstate.state == RUNSTATE_RUNNING
&& next->d->did != DEFAULT_DOMAIN)
{
fprintf(warn, "Strange, next d%dv%d already running on proc %d!\n",
next->d->did, next->vid,
next->p->pid);
next->runstate.tsc = 0;
}
/* If we're moving from one pcpu to another, record change & update tsc */
if(next->p != p) {
if(next->pcpu_tsc)
{
update_cycles(&next->cpu_affinity_all, tsc - next->pcpu_tsc);
update_cycles(&next->cpu_affinity_pcpu[p->pid], tsc - next->pcpu_tsc);
}
next->pcpu_tsc = tsc;
}
}
else
{
next->guest_paging_levels = opt.default_guest_paging_levels;
next->activated = 1;
next->pcpu_tsc = tsc;
}
runstate_update(next, RUNSTATE_RUNNING, tsc);
if ( opt.scatterplot_pcpu
&& next->d->did != IDLE_DOMAIN
&& next->d->did != DEFAULT_DOMAIN )
{
struct time_struct t;
abs_cycles_to_time(tsc, &t);
if ( next->p )
printf("%dv%d %u.%09u %d\n",
next->d->did, next->vid,
t.s, t.ns,
next->p->pid);
printf("%dv%d %u.%09u %d\n",
next->d->did, next->vid,
t.s, t.ns,
p->pid);
}
next->p = p;
p->current = next;
pcpu_string_draw(p);
p->time.tsc = tsc;
p->lost_record.seen_valid_schedule = 1;
}
/* If current is the default domain, we're fixing up from something
* like start-of-day. Update what we can. */
void vcpu_start(struct pcpu_info *p, struct vcpu_data *v) {
/* If vcpus are created, or first show up, in a "dead zone", this will
* fail. */
if( !p->current || p->current->d->did != DEFAULT_DOMAIN) {
fprintf(stderr, "Strange, p->current not default domain!\n");
error(ERR_FILE, NULL);
return;
}
if(!p->first_tsc) {
fprintf(stderr, "Strange, p%d first_tsc 0!\n", p->pid);
error(ERR_FILE, NULL);
}
if(p->first_tsc <= p->current->runstate.tsc) {
fprintf(stderr, "Strange, first_tsc %llx < default_domain runstate tsc %llx!\n",
p->first_tsc,
p->current->runstate.tsc);
error(ERR_FILE, NULL);
}
/* Change default domain to 'queued' */
runstate_update(p->current, RUNSTATE_QUEUED, p->first_tsc);
/* FIXME: Copy over data from the default domain this interval */
fprintf(warn, "Using first_tsc for d%dv%d (%lld cycles)\n",
v->d->did, v->vid, p->last_tsc - p->first_tsc);
/* Simulate the time since the first tsc */
runstate_update(v, RUNSTATE_RUNNING, p->first_tsc);
p->time.tsc = p->first_tsc;
p->current = v;
pcpu_string_draw(p);
v->p = p;
}
void sched_runstate_process(struct pcpu_info *p)
{
enum {
CHANGE=0,
CONTINUE
} type;
struct vcpu_data *v;
struct record_info *ri = &p->ri;
struct {
unsigned vcpu:16, dom:16;
unsigned long long p1, p2;
} __attribute__((packed)) * r = (typeof(r))ri->d;
union {
unsigned int event;
struct {
unsigned lo:4,
new_runstate:4,
old_runstate:4,
sub:4,
main:12,
unused:4;
};
} _sevt = { .event = ri->event };
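/* The low nibble of the event distinguishes a runstate change (1) from
a runstate continue (2); for a change, the next two nibbles carry the
new and old runstates respectively. */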
struct {
int new_runstate, old_runstate;
} sevt;
int perfctrs;
struct last_oldstate_struct last_oldstate;
switch(_sevt.lo)
{
case 1:
type = CHANGE;
sevt.new_runstate = _sevt.new_runstate;
sevt.old_runstate = _sevt.old_runstate;
break;
case 2:
type = CONTINUE;
sevt.new_runstate = sevt.old_runstate = RUNSTATE_RUNNING;
break;
default:
fprintf(warn, "FATAL: Unexpected runstate change type %d!\n",
_sevt.lo);
error(ERR_RECORD, NULL);
return;
}
perfctrs = (ri->extra_words == 5);
if(opt.dump_all) {
if( perfctrs ) {
printf(" %s %s {%lld,%lld} d%uv%u %s->%s\n",
ri->dump_header,
type?"runstate_continue":"runstate_change",
r->p1, r->p2,
r->dom, r->vcpu,
runstate_name[sevt.old_runstate],
runstate_name[sevt.new_runstate]);
} else {
printf(" %s %s d%uv%u %s->%s\n",
ri->dump_header,
type?"runstate_continue":"runstate_change",
r->dom, r->vcpu,
runstate_name[sevt.old_runstate],
runstate_name[sevt.new_runstate]);
}
}
/* Sanity check: expected transitions */
if ( type == CHANGE )
{
if( (sevt.new_runstate == RUNSTATE_RUNNING
&& sevt.old_runstate != RUNSTATE_RUNNABLE)
|| (sevt.new_runstate == RUNSTATE_BLOCKED
&& sevt.old_runstate == RUNSTATE_RUNNABLE ) )
{
fprintf(warn, "Strange, d%dv%d unexpected runstate transition %s->%s\n",
r->dom, r->vcpu,
runstate_name[sevt.old_runstate],
runstate_name[sevt.new_runstate]);
}
}
if(r->vcpu > MAX_CPUS)
{
fprintf(warn, "%s: vcpu %u > MAX_VCPUS %d!\n",
__func__, r->vcpu, MAX_CPUS);
return;
}
v = vcpu_find(r->dom, r->vcpu);
/* We want last_oldstate reset every time; so copy the last one and use
* that locally, clobbering the one in the vcpu struct. If it needs to
* be reset, it will be reset below. */
last_oldstate = v->runstate.last_oldstate;
v->runstate.last_oldstate.wrong = RUNSTATE_INIT;
/* Close vmexits when the putative reason for blocking / &c stops.
* This way, we don't account cpu contention to some other overhead. */
if(sevt.new_runstate == RUNSTATE_RUNNABLE
&& v->data_type == VCPU_DATA_HVM
&& v->hvm.vmexit_valid) {
hvm_close_vmexit(&v->hvm, ri->tsc);
}
/* Track waking state */
if ( v->data_type == VCPU_DATA_HVM && v->runstate.state != RUNSTATE_LOST ) {
if ( sevt.new_runstate == RUNSTATE_RUNNABLE
&& sevt.old_runstate == RUNSTATE_BLOCKED )
{
/* Hmm... want to make sure we're not in some weird
vmexit state... have to look later. */
if(opt.dump_all)
printf(" [w2h] d%dv%d Setting waking\n", v->d->did, v->vid);
v->hvm.w2h.waking = 1;
}
else if ( sevt.new_runstate != RUNSTATE_RUNNING
|| sevt.old_runstate != RUNSTATE_RUNNABLE )
{
if( v->hvm.w2h.waking
&& sevt.old_runstate == RUNSTATE_RUNNING
&& sevt.new_runstate != RUNSTATE_OFFLINE )
{
/* NB: This is printed a lot unnecessarily when there is TSC skew */
if ( sevt.old_runstate != v->runstate.state )
fprintf(warn, "Strange, unexpected waking transition for d%dv%d: %s -> %s\n",
v->d->did, v->vid,
runstate_name[sevt.old_runstate],
runstate_name[sevt.new_runstate]);
v->hvm.w2h.waking = 0;
}
/* Close wake-to-halt summary */
/* FIXME: Need to think about handling preemption. */
if (sevt.new_runstate == RUNSTATE_BLOCKED
&& sevt.old_runstate == RUNSTATE_RUNNING
&& v->hvm.w2h.interrupts ) {
int i;
for(i=0; i<GUEST_INTERRUPT_MAX+1; i++) {
struct hvm_gi_struct *g=v->hvm.summary.guest_interrupt + i;
tsc_t start_tsc = g->start_tsc;
if(start_tsc) {
tsc_t t = (start_tsc == 1) ? 0 : ri->tsc - start_tsc;
if(opt.dump_all)
printf(" [w2h] Halting vec %d is_wake %d time %lld\n",
i,
g->is_wake,
t);
if(opt.scatterplot_wake_to_halt
&& t
&& g->is_wake)
scatterplot_vs_time(ri->tsc, t);
if(opt.summary && t) {
if(g->is_wake) {
if(v->hvm.w2h.interrupts==1)
update_cycles(&g->runtime[GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ALONE],
t);
update_cycles(&g->runtime[GUEST_INTERRUPT_CASE_WAKE_TO_HALT_ANY],
t);
} else {
update_cycles(&g->runtime[GUEST_INTERRUPT_CASE_INTERRUPT_TO_HALT],
t);
}
}
g->start_tsc = 0;
g->is_wake = 0;
}
}
v->hvm.w2h.interrupts = 0;
v->hvm.w2h.vector = 0;
}
}
}
/* Sanity checks / tsc skew detection */
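/* Skew detection is two-pass: the first time a record's old_runstate
disagrees with our view of the vcpu, we stash (wrong, actual, tsc,
pid) in last_oldstate. If the very next record for this vcpu shows
the mirrored disagreement, we assume the earlier record's pcpu clock
is lagging and push its tsc_skew.offset forward by twice the observed
lag; the downstream cpumask records which pcpus have been adjusted
relative to which, so that a dependency loop can be detected. */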
if( v->runstate.state != sevt.old_runstate
&& v->runstate.state != RUNSTATE_INIT )
{
if(v->runstate.state == RUNSTATE_LOST) {
if( sevt.new_runstate == RUNSTATE_RUNNING )
goto update;
else if(opt.dump_all)
fprintf(warn, "%s: d%dv%d in runstate lost, not updating to %s\n",
__func__, v->d->did, v->vid,
runstate_name[sevt.new_runstate]);
goto no_update;
} else if (last_oldstate.wrong == sevt.new_runstate
&& last_oldstate.actual == sevt.old_runstate) {
tsc_t lag, old_offset;
struct pcpu_info *p2;
if(ri->tsc < last_oldstate.tsc) {
fprintf(warn, "WARNING: new tsc %lld < detected runstate tsc %lld! Not updating\n",
ri->tsc, last_oldstate.tsc);
goto no_update;
}
p2 = P.pcpu + last_oldstate.pid;
lag = ri->tsc
- last_oldstate.tsc;
old_offset = p2->tsc_skew.offset;
cpumask_union(&p2->tsc_skew.downstream, &p->tsc_skew.downstream);
cpumask_set(&p2->tsc_skew.downstream, p->pid);
if(cpumask_isset(&p2->tsc_skew.downstream, p2->pid)) {
if ( opt.tsc_loop_fatal )
{
fprintf(stderr, "FATAL: tsc skew dependency loop detected!\n");
error(ERR_FILE, NULL);
}
else
{
int i;
fprintf(warn, "Tsc skew dependency loop detected! Resetting...\n");
for ( i=0; i<=P.max_active_pcpu; i++)
{
struct pcpu_info *p = P.pcpu + i;
p->tsc_skew.offset = 0;
cpumask_init(&p->tsc_skew.downstream);
}
goto no_update;
}
}
p2->tsc_skew.offset += lag * 2;
fprintf(warn, "TSC skew detected p%d->p%d, %lld cycles. Changing p%d offset from %lld to %lld\n",
p->pid, p2->pid, lag,
p2->pid,
old_offset,
p2->tsc_skew.offset);
goto no_update;
} else {
fprintf(warn, "runstate_change old_runstate %s, d%dv%d runstate %s. Possible tsc skew.\n",
runstate_name[sevt.old_runstate],
v->d->did, v->vid,
runstate_name[v->runstate.state]);
v->runstate.last_oldstate.wrong = sevt.old_runstate;
v->runstate.last_oldstate.actual = v->runstate.state;
v->runstate.last_oldstate.tsc = ri->tsc;
v->runstate.last_oldstate.pid = p->pid;
if ( v->runstate.state == RUNSTATE_RUNNING )
{
fprintf(warn, " Not updating.\n");
goto no_update;
}
goto update;
}
fprintf(stderr, "FATAL: Logic hole in %s\n", __func__);
error(ERR_ASSERT, NULL);
}
update:
/* Actually update the runstate. Special things to do if we're starting
* or stopping actually running on a physical cpu. */
if ( type == CONTINUE )
{
if( v->runstate.state == RUNSTATE_INIT ) {
/* Start-of-day; account first tsc -> now to v */
vcpu_start(p, v);
} else {
/* Continue running. First, do some sanity checks */
if ( v->runstate.state == RUNSTATE_LOST ) {
fprintf(warn, "WARNING: continue with d%dv%d in RUNSTATE_LOST. Resetting current.\n",
v->d->did, v->vid);
if ( p->current )
vcpu_prev_update(p, p->current, ri->tsc, RUNSTATE_LOST);
vcpu_next_update(p, v, ri->tsc);
}
else if( v->runstate.state != RUNSTATE_RUNNING ) {
/* This should never happen. */
fprintf(warn, "FATAL: sevt.old_runstate running, but d%dv%d runstate %s!\n",
v->d->did, v->vid, runstate_name[v->runstate.state]);
error(ERR_FILE, NULL);
} else if ( v->p != p ) {
fprintf(warn, "FATAL: continue on p%d, but d%dv%d p%d!\n",
p->pid, v->d->did, v->vid,
v->p ? v->p->pid : -1);
error(ERR_FILE, NULL);
}
runstate_update(v, RUNSTATE_RUNNING, ri->tsc);
}
}
else if ( sevt.old_runstate == RUNSTATE_RUNNING
|| v->runstate.state == RUNSTATE_RUNNING )
{
/*
* Cases:
* old running, v running:
* normal (prev update p, lost record check)
* v running, old ! running:
* tsc skew (prev update v->p, lost record check)
* old running, v init:
*    start-of-day (fake update, prev p, lost record)
* old running, v !{running,init}:
* # (should never happen)
*/
if( sevt.old_runstate == RUNSTATE_RUNNING ) {
if( v->runstate.state == RUNSTATE_INIT ) {
/* Start-of-day; account first tsc -> now to v */
vcpu_start(p, v);
} else if( v->runstate.state != RUNSTATE_RUNNING
&& v->runstate.state != RUNSTATE_LOST ) {
/* This should never happen. */
fprintf(warn, "FATAL: sevt.old_runstate running, but d%dv%d runstate %s!\n",
v->d->did, v->vid, runstate_name[v->runstate.state]);
error(ERR_FILE, NULL);
}
vcpu_prev_update(p, v, ri->tsc, sevt.new_runstate);
} else {
vcpu_prev_update(v->p, v, ri->tsc, sevt.new_runstate);
}
if(P.lost_cpus && v->d->did != IDLE_DOMAIN) {
if(opt.dump_all)
fprintf(warn, "%s: %d lost cpus, setting d%dv%d runstate to RUNSTATE_LOST\n",
__func__, P.lost_cpus, v->d->did, v->vid);
lose_vcpu(v, ri->tsc);
}
}
else if ( sevt.new_runstate == RUNSTATE_RUNNING )
{
if(perfctrs) {
v->runstate.p1_start = r->p1;
v->runstate.p2_start = r->p2;
}
vcpu_next_update(p, v, ri->tsc);
}
else if ( v->runstate.state != RUNSTATE_INIT )
{
/* TSC skew at start-of-day is hard to deal with. Don't
* bring a vcpu out of INIT until it's seen to be actually
* running somewhere. */
runstate_update(v, sevt.new_runstate, ri->tsc);
}
no_update:
return;
}
void dump_sched_switch(struct record_info *ri)
{
struct {
unsigned int prev_dom, prev_vcpu, next_dom, next_vcpu;
} *r = (typeof(r))ri->d;
printf(" %s sched_switch prev d%uv%u next d%uv%u\n",
ri->dump_header, r->prev_dom, r->prev_vcpu,
r->next_dom, r->next_vcpu);
}
void sched_switch_process(struct pcpu_info *p)
{
struct vcpu_data *prev, *next;
struct record_info *ri = &p->ri;
struct {
unsigned int prev_dom, prev_vcpu, next_dom, next_vcpu;
} * r = (typeof(r))ri->d;
if(opt.dump_all)
dump_sched_switch(ri);
if(r->prev_vcpu > MAX_CPUS)
{
fprintf(warn, "%s: prev_vcpu %u > MAX_VCPUS %d!\n",
__func__, r->prev_vcpu, MAX_CPUS);
return;
}
if(r->next_vcpu > MAX_CPUS)
{
fprintf(warn, "%s: next_vcpu %u > MAX_VCPUS %d!\n",
__func__, r->next_vcpu, MAX_CPUS);
return;
}
prev = vcpu_find(r->prev_dom, r->prev_vcpu);
next = vcpu_find(r->next_dom, r->next_vcpu);
vcpu_prev_update(p, prev, ri->tsc, RUNSTATE_QUEUED); /* FIXME */
vcpu_next_update(p, next, ri->tsc);
}
void sched_default_vcpu_activate(struct pcpu_info *p)
{
struct vcpu_data *v = default_domain.vcpu[p->pid];
if(!v)
v = vcpu_create(&default_domain, p->pid);
assert(v == v->d->vcpu[v->vid]);
v->activated = 1;
v->guest_paging_levels = opt.default_guest_paging_levels;
v->p = p;
v->runstate.state = RUNSTATE_RUNNING;
p->current = v;
pcpu_string_draw(p);
}
void sched_default_domain_init(void)
{
struct domain_data *d = &default_domain;
domain_init(d, DEFAULT_DOMAIN);
}
void runstate_clear(tsc_t * runstate_cycles)
{
int i;
for(i=0; i<RUNSTATE_MAX; i++)
runstate_cycles[i]=0;
}

void runstate_summary(tsc_t * runstate_cycles)
{
int i;
for(i=0; i<RUNSTATE_MAX; i++)
if(runstate_cycles[i]) {
struct time_struct t;
cycles_to_time(runstate_cycles[i], &t);
printf(" %s: %u.%09u s\n",
runstate_name[i], t.s, t.ns);
}
}

void sched_summary_vcpu(struct vcpu_data *v)
{
int i;
char desc[30];
if ( v->pcpu_tsc )
{
update_cycles(&v->cpu_affinity_all, P.f.last_tsc - v->pcpu_tsc);
update_cycles(&v->cpu_affinity_pcpu[v->p->pid], P.f.last_tsc - v->pcpu_tsc);
}
printf(" Runstates:\n");
for(i=0; i<RUNSTATE_MAX; i++) {
snprintf(desc,30, " %s", runstate_name[i]);
print_cycle_summary(v->runstates+i, desc);
if ( i==RUNSTATE_RUNNABLE )
{
int j;
for(j=0; j<RUNNABLE_STATE_MAX; j++) {
snprintf(desc,30, "  %s", runnable_state_name[j]);
print_cycle_summary(v->runnable_states+j, desc);
}
}
}
print_cpu_affinity(&v->cpu_affinity_all, " cpu affinity");
for ( i = 0; i < MAX_CPUS ; i++)
{
snprintf(desc,30, " [%d]", i);
print_cpu_affinity(v->cpu_affinity_pcpu+i, desc);
}
}
void sched_summary_domain(struct domain_data *d)
{
int i;
char desc[30];
printf(" Runstates:\n");
for(i=0; i<DOMAIN_RUNSTATE_MAX; i++) {
snprintf(desc,30, " %s", domain_runstate_name[i]);
print_cycle_summary(d->runstates+i, desc);
}
}
void dump_sched_vcpu_action(struct record_info *ri, const char *action)
{
struct {
unsigned int domid, vcpuid;
} *r = (typeof(r))ri->d;
printf(" %s %s d%uv%u\n", ri->dump_header, action, r->domid, r->vcpuid);
}
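/* Top-level scheduler dispatch, keyed on the trace subclass: runstate
records (sub 1) drive the accounting, sub 0xf carries sched_switch
(also accounted), while the verbose (sub 8) and scheduler-class
(sub 2) records are mostly just printed when dumping. */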
void sched_process(struct pcpu_info *p)
{
struct record_info *ri = &p->ri;
if(ri->evt.sub == 0xf) {
switch(ri->event)
{
case TRC_SCHED_SWITCH:
sched_switch_process(p);
break;
default:
process_generic(&p->ri);
}
return;
}
if(ri->evt.sub == 1) {
/* TRC_SCHED_MIN */
sched_runstate_process(p);
} else if (ri->evt.sub == 8) {
/* TRC_SCHED_VERBOSE */
switch(ri->event)
{
case TRC_SCHED_DOM_ADD:
if(opt.dump_all) {
struct {
unsigned int domid;
} *r = (typeof(r))ri->d;
printf(" %s sched_init_domain d%u\n", ri->dump_header, r->domid);
}
break;
case TRC_SCHED_DOM_REM:
if(opt.dump_all) {
struct {
unsigned int domid, vcpuid;
} *r = (typeof(r))ri->d;
printf(" %s sched_destroy_domain d%u\n", ri->dump_header, r->domid);
}
break;
case TRC_SCHED_SLEEP:
if(opt.dump_all)
dump_sched_vcpu_action(ri, "vcpu_sleep");
break;
case TRC_SCHED_WAKE:
if(opt.dump_all)
dump_sched_vcpu_action(ri, "vcpu_wake");
break;
case TRC_SCHED_YIELD:
if(opt.dump_all)
dump_sched_vcpu_action(ri, "vcpu_yield");
break;
case TRC_SCHED_BLOCK:
if(opt.dump_all)
dump_sched_vcpu_action(ri, "vcpu_block");
break;
case TRC_SCHED_SHUTDOWN:
case TRC_SCHED_SHUTDOWN_CODE:
if(opt.dump_all) {
struct {
unsigned int domid, vcpuid, reason;
} *r = (typeof(r))ri->d;
printf(" %s %s d%uv%u, reason = %u\n", ri->dump_header,
ri->event == TRC_SCHED_SHUTDOWN ? "sched_shutdown" :
"sched_shutdown_code", r->domid, r->vcpuid, r->reason);
}
break;
case TRC_SCHED_ADJDOM:
if(opt.dump_all) {
struct {
unsigned int domid;
} *r = (typeof(r))ri->d;
printf(" %s sched_adjust d%u\n", ri->dump_header, r->domid);
}
break;
case TRC_SCHED_SWITCH:
if(opt.dump_all)
dump_sched_switch(ri);
break;
case TRC_SCHED_SWITCH_INFPREV:
if(opt.dump_all) {
struct {
unsigned int domid, vcpuid, runtime;
} *r = (typeof(r))ri->d;
printf(" %s sched_switch prev d%uv%d, run for %u.%uus\n",
ri->dump_header, r->domid, r->vcpuid,
r->runtime / 1000, r->runtime % 1000);
}
break;
case TRC_SCHED_SWITCH_INFNEXT:
if(opt.dump_all)
{
struct {
unsigned int domid, vcpuid, rsince;
int slice;
} *r = (typeof(r))ri->d;
printf(" %s sched_switch next d%uv%u", ri->dump_header,
r->domid, r->vcpuid);
if ( r->rsince != 0 )
printf(", was runnable for %u.%uus", r->rsince / 1000,
r->rsince % 1000);
if ( r->slice > 0 )
printf(", next slice %u.%uus", r->slice / 1000,
r->slice % 1000);
printf("\n");
}
break;
case TRC_SCHED_SWITCH_INFCONT:
if(opt.dump_all)
{
struct {
unsigned int domid, vcpuid, rsince;
int slice;
} *r = (typeof(r))ri->d;
printf(" %s sched_switch continue d%uv%u, run for %u.%uus",
ri->dump_header, r->domid, r->vcpuid,
r->rsince / 1000, r->rsince % 1000);
if ( r->slice > 0 )
printf(", next slice %u.%uus", r->slice / 1000,
r->slice % 1000);
printf("\n");
}
break;
case TRC_SCHED_CTL:
case TRC_SCHED_S_TIMER_FN:
case TRC_SCHED_T_TIMER_FN:
case TRC_SCHED_DOM_TIMER_FN:
break;
default:
process_generic(&p->ri);
}
} else if(ri->evt.sub == 2) {
/* TRC_SCHED_CLASS */
switch(ri->event)
{
/* CREDIT (TRC_CSCHED_xxx) */
case TRC_SCHED_CLASS_EVT(CSCHED, 1): /* SCHED_TASKLET */
if(opt.dump_all)
printf(" %s csched:sched_tasklet\n", ri->dump_header);
break;
case TRC_SCHED_CLASS_EVT(CSCHED, 2): /* ACCOUNT_START */
case TRC_SCHED_CLASS_EVT(CSCHED, 3): /* ACCOUNT_STOP */
if(opt.dump_all) {
struct {
unsigned int domid, vcpuid, actv_cnt;
} *r = (typeof(r))ri->d;
printf(" %s csched:acct_%s d%uv%u, active_vcpus %u\n",
ri->dump_header,
ri->event == TRC_SCHED_CLASS_EVT(CSCHED, 2) ?
"start" : "stop",
r->domid, r->vcpuid,
r->actv_cnt);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED, 4): /* STOLEN_VCPU */
if(opt.dump_all) {
struct {
unsigned int peer_cpu, domid, vcpuid;
} *r = (typeof(r))ri->d;
printf(" %s csched:stolen_vcpu d%uv%u from cpu %u\n",
ri->dump_header, r->domid, r->vcpuid, r->peer_cpu);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED, 5): /* PICKED_CPU */
if(opt.dump_all) {
struct {
unsigned int domid, vcpuid, cpu;
} *r = (typeof(r))ri->d;
printf(" %s csched:pick_cpu %u for d%uv%u\n",
ri->dump_header, r->cpu, r->domid, r->vcpuid);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED, 6): /* TICKLE */
if(opt.dump_all) {
struct {
unsigned int cpu;
} *r = (typeof(r))ri->d;
printf(" %s csched:runq_tickle, cpu %u\n",
ri->dump_header, r->cpu);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED, 7): /* BOOST_START */
if(opt.dump_all) {
struct {
unsigned int domid, vcpuid;
} *r = (typeof(r))ri->d;
printf(" %s csched: d%uv%u boosted\n",
ri->dump_header, r->domid, r->vcpuid);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED, 8): /* BOOST_END */
if(opt.dump_all) {
struct {
unsigned int domid, vcpuid;
} *r = (typeof(r))ri->d;
printf(" %s csched: d%uv%u unboosted\n",
ri->dump_header, r->domid, r->vcpuid);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED, 9): /* SCHEDULE */
if(opt.dump_all) {
struct {
unsigned int cpu:16, tasklet:8, idle:8;
} *r = (typeof(r))ri->d;
printf(" %s csched:schedule cpu %u%s%s\n",
ri->dump_header, r->cpu,
r->tasklet ? ", tasklet scheduled" : "",
r->idle ? ", idle" : ", busy");
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED, 10): /* RATELIMIT */
if(opt.dump_all) {
struct {
unsigned int vcpuid:16, domid:16;
unsigned int runtime;
} *r = (typeof(r))ri->d;
printf(" %s csched:ratelimit, d%uv%u run only %u.%uus\n",
ri->dump_header, r->domid, r->vcpuid,
r->runtime / 1000, r->runtime % 1000);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED, 11): /* STEAL_CHECK */
if(opt.dump_all) {
struct {
unsigned int peer_cpu, check;
} *r = (typeof(r))ri->d;
printf(" %s csched:load_balance %s %u\n",
ri->dump_header, r->check ? "checking" : "skipping",
r->peer_cpu);
}
break;
/* CREDIT 2 (TRC_CSCHED2_xxx) */
case TRC_SCHED_CLASS_EVT(CSCHED2, 1): /* TICK */
case TRC_SCHED_CLASS_EVT(CSCHED2, 4): /* CREDIT_ADD */
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 2): /* RUNQ_POS */
if(opt.dump_all) {
struct {
unsigned int vcpuid:16, domid:16, pos;
} *r = (typeof(r))ri->d;
printf(" %s csched2:runq_insert d%uv%u, position %u\n",
ri->dump_header, r->domid, r->vcpuid, r->pos);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 3): /* CREDIT_BURN */
if(opt.dump_all) {
struct {
unsigned int vcpuid:16, domid:16;
int credit, budget, delta;
} *r = (typeof(r))ri->d;
printf(" %s csched2:burn_credits d%uv%u, credit = %d, ",
ri->dump_header, r->domid, r->vcpuid, r->credit);
if ( r->budget != INT_MIN )
printf("budget = %d, ", r->budget);
printf("delta = %d\n", r->delta);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 5): /* TICKLE_CHECK */
if(opt.dump_all) {
struct {
unsigned int vcpuid:16, domid:16;
int credit, score;
} *r = (typeof(r))ri->d;
printf(" %s csched2:tickle_check d%uv%u, credit = %d, score = %d\n\n",
ri->dump_header, r->domid, r->vcpuid,
r->credit, r->score);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 6): /* TICKLE */
if(opt.dump_all) {
struct {
unsigned int cpu:16;
} *r = (typeof(r))ri->d;
printf(" %s csched2:runq_tickle cpu %u\n",
ri->dump_header, r->cpu);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 7): /* CREDIT_RESET */
if(opt.dump_all) {
struct {
unsigned int vcpuid:16, domid:16;
int credit_start, credit_end;
} *r = (typeof(r))ri->d;
printf(" %s csched2:reset_credits d%uv%u, "
"credit_start = %d, credit_end = %d\n",
ri->dump_header, r->domid, r->vcpuid,
r->credit_start, r->credit_end);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 8): /* SCHED_TASKLET */
if(opt.dump_all)
printf(" %s csched2:sched_tasklet\n", ri->dump_header);
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 9): /* UPDATE_LOAD */
if(opt.dump_all)
printf(" %s csched2:update_load\n", ri->dump_header);
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 10): /* RUNQ_ASSIGN */
if(opt.dump_all) {
struct {
unsigned int vcpuid:16, domid:16;
unsigned int rqi:16;
} *r = (typeof(r))ri->d;
printf(" %s csched2:runq_assign d%uv%u on rq# %u\n",
ri->dump_header, r->domid, r->vcpuid, r->rqi);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 11): /* UPDATE_VCPU_LOAD */
if(opt.dump_all) {
struct {
uint64_t vcpuload;
unsigned int vcpuid:16, domid:16;
unsigned int shift;
} *r = (typeof(r))ri->d;
double vcpuload;
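/* Loads are fixed-point with 'shift' fractional bits; scale to percent
(e.g. raw 384 with shift 9 is 384/512 = 75%). */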
vcpuload = (r->vcpuload * 100.0) / (1ULL << r->shift);
printf(" %s csched2:update_vcpu_load d%uv%u, "
"vcpu_load = %4.3f%% (%"PRIu64")\n",
ri->dump_header, r->domid, r->vcpuid, vcpuload,
r->vcpuload);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 12): /* UPDATE_RUNQ_LOAD */
if(opt.dump_all) {
struct {
uint64_t rq_avgload, b_avgload;
unsigned int rq_load:16, rq_id:8, shift:8;
} *r = (typeof(r))ri->d;
double avgload, b_avgload;
avgload = (r->rq_avgload * 100.0) / (1ULL << r->shift);
b_avgload = (r->b_avgload * 100.0) / (1ULL << r->shift);
printf(" %s csched2:update_rq_load rq# %u, load = %u, "
"avgload = %4.3f%% (%"PRIu64"), "
"b_avgload = %4.3f%% (%"PRIu64")\n",
ri->dump_header, r->rq_id, r->rq_load,
avgload, r->rq_avgload, b_avgload, r->b_avgload);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 13): /* TICKLE_NEW */
if (opt.dump_all) {
struct {
unsigned int vcpuid:16, domid:16, processor;
int credit;
} *r = (typeof(r))ri->d;
printf(" %s csched2:runq_tickle_new d%uv%u, "
"processor = %u, credit = %d\n",
ri->dump_header, r->domid, r->vcpuid,
r->processor, r->credit);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 14): /* RUNQ_MAX_WEIGHT */
if (opt.dump_all) {
struct {
unsigned rqi:16, max_weight:16;
} *r = (typeof(r))ri->d;
printf(" %s csched2:update_max_weight rq# %u, max_weight = %u\n",
ri->dump_header, r->rqi, r->max_weight);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 15): /* MIGRATE */
if (opt.dump_all) {
struct {
unsigned vcpuid:16, domid:16;
unsigned rqi:16, trqi:16;
} *r = (typeof(r))ri->d;
printf(" %s csched2:migrate d%uv%u rq# %u --> rq# %u\n",
ri->dump_header, r->domid, r->vcpuid, r->rqi, r->trqi);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 16): /* LOAD_CHECK */
if (opt.dump_all) {
struct {
unsigned lrqi:16, orqi:16;
unsigned load_delta;
} *r = (typeof(r))ri->d;
printf(" %s csched2:load_balance_check lrq# %u, orq# %u, "
"delta = %u\n",
ri->dump_header, r->lrqi, r->orqi, r->load_delta);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 17): /* LOAD_BALANCE */
if (opt.dump_all) {
struct {
uint64_t lb_avgload, ob_avgload;
unsigned lrqi:16, orqi:16;
} *r = (typeof(r))ri->d;
printf(" %s csched2:load_balance_begin lrq# %u, "
"avg_load = %"PRIu64" -- orq# %u, avg_load = %"PRIu64"\n",
ri->dump_header, r->lrqi, r->lb_avgload,
r->orqi, r->ob_avgload);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 19): /* PICKED_CPU */
if (opt.dump_all) {
struct {
uint64_t b_avgload;
unsigned vcpuid:16, domid:16;
unsigned rqi:16, cpu:16;
} *r = (typeof(r))ri->d;
printf(" %s csched2:picked_cpu d%uv%u, rq# %u, cpu %u\n",
ri->dump_header, r->domid, r->vcpuid, r->rqi, r->cpu);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 20): /* RUNQ_CANDIDATE */
if (opt.dump_all) {
struct {
unsigned vcpuid:16, domid:16;
unsigned tickled_cpu;
int credit;
} *r = (typeof(r))ri->d;
printf(" %s csched2:runq_candidate d%uv%u, credit = %d, ",
ri->dump_header, r->domid, r->vcpuid, r->credit);
if (r->tickled_cpu == (unsigned)-1)
printf("no cpu was tickled\n");
else
printf("cpu %u was tickled\n", r->tickled_cpu);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 21): /* SCHEDULE */
if (opt.dump_all) {
struct {
unsigned cpu:16, rqi:16;
unsigned tasklet:8, idle:8, smt_idle:8, tickled:8;
} *r = (typeof(r))ri->d;
printf(" %s csched2:schedule cpu %u, rq# %u%s%s%s%s\n",
ri->dump_header, r->cpu, r->rqi,
r->tasklet ? ", tasklet scheduled" : "",
r->idle ? ", idle" : ", busy",
r->idle ? (r->smt_idle ? ", SMT idle" : ", SMT busy") : "",
r->tickled ? ", tickled" : ", not tickled");
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 22): /* RATELIMIT */
if (opt.dump_all) {
struct {
unsigned int vcpuid:16, domid:16;
unsigned int runtime;
} *r = (typeof(r))ri->d;
printf(" %s csched2:ratelimit, d%uv%u run only %u.%uus\n",
ri->dump_header, r->domid, r->vcpuid,
r->runtime / 1000, r->runtime % 1000);
}
break;
case TRC_SCHED_CLASS_EVT(CSCHED2, 23): /* RUNQ_CAND_CHECK */
if(opt.dump_all) {
struct {
unsigned int vcpuid:16, domid:16;
} *r = (typeof(r))ri->d;
printf(" %s csched2:runq_cand_check d%uv%u\n",
ri->dump_header, r->domid, r->vcpuid);
}
break;
/* RTDS (TRC_RTDS_xxx) */
case TRC_SCHED_CLASS_EVT(RTDS, 1): /* TICKLE */
if(opt.dump_all) {
struct {
unsigned int cpu:16;
} *r = (typeof(r))ri->d;
printf(" %s rtds:runq_tickle cpu %u\n",
ri->dump_header, r->cpu);
}
break;
case TRC_SCHED_CLASS_EVT(RTDS, 2): /* RUNQ_PICK */
if(opt.dump_all) {
struct {
unsigned int vcpuid:16, domid:16;
uint64_t cur_dl, cur_bg;
} __attribute__((packed)) *r = (typeof(r))ri->d;
printf(" %s rtds:runq_pick d%uv%u, deadline = %"PRIu64", "
"budget = %"PRIu64"\n", ri->dump_header,
r->domid, r->vcpuid, r->cur_dl, r->cur_bg);
}
break;
case TRC_SCHED_CLASS_EVT(RTDS, 3): /* BUDGET_BURN */
if(opt.dump_all) {
struct {
unsigned int vcpuid:16, domid:16;
uint64_t cur_bg;
int delta;
} __attribute__((packed)) *r = (typeof(r))ri->d;
printf(" %s rtds:burn_budget d%uv%u, budget = %"PRIu64", "
"delta = %d\n", ri->dump_header, r->domid,
r->vcpuid, r->cur_bg, r->delta);
}
break;
case TRC_SCHED_CLASS_EVT(RTDS, 4): /* BUDGET_REPLENISH */
if(opt.dump_all) {
struct {
unsigned int vcpuid:16, domid:16;
uint64_t cur_dl, cur_bg;
} __attribute__((packed)) *r = (typeof(r))ri->d;
printf(" %s rtds:repl_budget d%uv%u, deadline = %"PRIu64", "
"budget = %"PRIu64"\n", ri->dump_header,
r->domid, r->vcpuid, r->cur_dl, r->cur_bg);
}
break;
case TRC_SCHED_CLASS_EVT(RTDS, 5): /* SCHED_TASKLET */
if(opt.dump_all)
printf(" %s rtds:sched_tasklet\n", ri->dump_header);
break;
case TRC_SCHED_CLASS_EVT(RTDS, 6): /* SCHEDULE */
if (opt.dump_all) {
struct {
unsigned cpu:16, tasklet:8, idle:4, tickled:4;
} __attribute__((packed)) *r = (typeof(r))ri->d;
printf(" %s rtds:schedule cpu %u, %s%s%s\n",
ri->dump_header, r->cpu,
r->tasklet ? ", tasklet scheduled" : "",
r->idle ? ", idle" : ", busy",
r->tickled ? ", tickled" : ", not tickled");
}
break;
case TRC_SCHED_CLASS_EVT(SNULL, 1): /* PICKED_CPU */
if (opt.dump_all) {
struct {
uint16_t vcpuid, domid;
uint32_t new_cpu;
} *r = (typeof(r))ri->d;
printf(" %s null:picked_cpu d%uv%u, cpu %u\n",
ri->dump_header, r->domid, r->vcpuid, r->new_cpu);
}
break;
case TRC_SCHED_CLASS_EVT(SNULL, 2): /* VCPU_ASSIGN */
if (opt.dump_all) {
struct {
uint16_t vcpuid, domid;
uint32_t cpu;
} *r = (typeof(r))ri->d;
printf(" %s null:vcpu_assign d%uv%u to cpu %u\n",
ri->dump_header, r->domid, r->vcpuid, r->cpu);
}
break;
case TRC_SCHED_CLASS_EVT(SNULL, 3): /* VCPU_DEASSIGN */
if (opt.dump_all) {
struct {
uint16_t vcpuid, domid;
uint32_t cpu;
} *r = (typeof(r))ri->d;
printf(" %s null:vcpu_deassign d%uv%u from cpu %u\n",
ri->dump_header, r->domid, r->vcpuid, r->cpu);
}
break;
case TRC_SCHED_CLASS_EVT(SNULL, 4): /* MIGRATE */
if (opt.dump_all) {
struct {
uint16_t vcpuid, domid;
uint16_t cpu, new_cpu;
} *r = (typeof(r))ri->d;
printf(" %s null:migrate d%uv%u, cpu %u, new_cpu %u\n",
ri->dump_header, r->domid, r->vcpuid,
r->cpu, r->new_cpu);
}
break;
case TRC_SCHED_CLASS_EVT(SNULL, 5): /* SCHEDULE */
if (opt.dump_all) {
struct {
uint16_t tasklet, cpu;
int16_t vcpuid, domid;
} *r = (typeof(r))ri->d;
printf(" %s null:schedule cpu %u%s",
ri->dump_header, r->cpu,
r->tasklet ? ", tasklet scheduled" : "");
if (r->vcpuid != -1)
printf(", vcpu d%uv%d\n", r->domid, r->vcpuid);
else
printf(", no vcpu\n");
}
break;
case TRC_SCHED_CLASS_EVT(SNULL, 6): /* TASKLET */
if (opt.dump_all)
printf(" %s null:sched_tasklet\n", ri->dump_header);
break;
default:
process_generic(ri);
}
} else {
UPDATE_VOLUME(p, sched_verbose, ri->size);
process_generic(&p->ri);
}
}
/* ---- Memory ---- */
void mem_summary_domain(struct domain_data *d) {
int i, j;
printf(" Grant table ops:\n");
printf(" Done by:\n");
for(i=0; i<MEM_MAX; i++)
if(d->memops.done[i])
printf(" %-14s: %d\n",
mem_name[i],
d->memops.done[i]);
printf(" Done for:\n");
for(i=0; i<MEM_MAX; i++)
if(d->memops.done_for[i])
printf(" %-14s: %d\n",
mem_name[i],
d->memops.done_for[i]);
printf(" Populate-on-demand:\n");
printf(" Populated:\n");
for(i=0; i<4; i++)
{
if ( d->pod.populate_order[i] )
printf(" [%d] %d\n", i,
d->pod.populate_order[i]);
}
printf(" Reclaim order:\n");
for(i=0; i<4; i++)
{
if ( d->pod.reclaim_order[i] )
printf(" [%d] %d\n", i,
d->pod.reclaim_order[i]);
}
printf(" Reclaim contexts:\n");
for(j=0; j<POD_RECLAIM_CONTEXT_MAX; j++)
{
if ( d->pod.reclaim_context[j] )
{
printf(" * [%s] %d\n",
pod_reclaim_context_name[j],
d->pod.reclaim_context[j]);
for(i=0; i<4; i++)
{
if ( d->pod.reclaim_context_order[j][i] )
printf(" [%d] %d\n", i,
d->pod.reclaim_context_order[j][i]);
}
}
}
}
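/* PoD operations come in x86 page-table orders: 0 (4k), 9 (2M) or
18 (1G). Compress those to array indices 0-2 (e.g. order 9 maps to
index 1); anything else is flagged and lumped into a catch-all
bucket. */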
int p2m_canonical_order(int order)
{
if ( order % 9
|| (order / 9) > 2 )
{
fprintf(warn, "%s: Strange, non-canonical order %d\n",
__func__, order);
order = 4;
} else {
order /= 9;
}
return order;
}
void mem_pod_zero_reclaim_process(struct pcpu_info *p)
{
struct record_info *ri = &p->ri;
int context = POD_RECLAIM_CONTEXT_UNKNOWN;
struct vcpu_data *v = p->current;
struct {
uint64_t gfn, mfn;
int d:16,order:16;
} *r = (typeof(r))ri->d;
if ( v && v->hvm.vmexit_valid )
{
switch(v->hvm.exit_reason)
{
case EXIT_REASON_EPT_VIOLATION:
case EXIT_REASON_EXCEPTION_NMI:
context = POD_RECLAIM_CONTEXT_FAULT;
break;
case EXIT_REASON_VMCALL:
context = POD_RECLAIM_CONTEXT_BALLOON;
break;
}
}
if ( opt.dump_all )
{
printf(" %s pod_zero_reclaim d%d o%d g %llx m %llx ctx %s\n",
ri->dump_header,
r->d, r->order,
(unsigned long long)r->gfn, (unsigned long long)r->mfn,
pod_reclaim_context_name[context]);
}
if ( opt.summary_info )
{
struct domain_data *d;
if ( v && (d=v->d) )
{
int order;
order = p2m_canonical_order(r->order);
d->pod.reclaim_order[order]++;
d->pod.reclaim_context[context]++;
d->pod.reclaim_context_order[context][order]++;
}
}
}
void mem_pod_populate_process(struct pcpu_info *p)
{
struct record_info *ri = &p->ri;
struct {
uint64_t gfn, mfn;
int d:16,order:16;
} *r = (typeof(r))ri->d;
if ( opt.dump_all )
{
printf(" %s pod_populate d%d o%d g %llx m %llx\n",
ri->dump_header,
r->d, r->order,
(unsigned long long)r->gfn, (unsigned long long)r->mfn);
}
if ( opt.summary_info )
{
struct vcpu_data *v = p->current;
struct domain_data *d;
if ( v && (d=v->d) )
{
int order;
order = p2m_canonical_order(r->order);
d->pod.populate_order[order]++;
}
}
}
void mem_pod_superpage_splinter_process(struct pcpu_info *p)
{
struct record_info *ri = &p->ri;
struct {
uint64_t gfn;
int d:16;
} *r = (typeof(r))ri->d;
if ( opt.dump_all )
{
printf(" %s pod_spage_splinter d%d g %llx\n",
ri->dump_header,
r->d, (unsigned long long)r->gfn);
}
}
void mem_page_grant(struct pcpu_info *p)
{
struct record_info *ri = &p->ri;
struct {
unsigned domain;
} *r = (typeof(r))ri->d;
union pv_event pevt = { .event = ri->event };
if ( opt.dump_all )
{
printf(" %s %s domain %u\n", ri->dump_header, mem_name[pevt.minor], r->domain);
}
}
void mem_set_p2m_entry_process(struct pcpu_info *p)
{
struct record_info *ri = &p->ri;
struct {
uint64_t gfn, mfn;
int p2mt;
int d:16,order:16;
} *r = (typeof(r))ri->d;
if ( opt.dump_all )
{
printf(" %s set_p2m_entry d%d o%d t %d g %llx m %llx\n",
ri->dump_header,
r->d, r->order,
r->p2mt,
(unsigned long long)r->gfn, (unsigned long long)r->mfn);
}
}
void mem_decrease_reservation_process(struct pcpu_info *p)
{
struct record_info *ri = &p->ri;
struct {
uint64_t gfn;
int d:16,order:16;
} *r = (typeof(r))ri->d;
if ( opt.dump_all )
{
printf(" %s decrease_reservation d%d o%d g %llx\n",
ri->dump_header,
r->d, r->order,
(unsigned long long)r->gfn);
}
}
void mem_process(struct pcpu_info *p) {
struct record_info *ri = &p->ri;
struct {
int dom;
} *r = (typeof(r))ri->d;
int minor = ri->evt.minor;
switch ( minor )
{
case MEM_PAGE_GRANT_MAP:
case MEM_PAGE_GRANT_UNMAP:
case MEM_PAGE_GRANT_TRANSFER:
mem_page_grant(p);
break;
case MEM_SET_P2M_ENTRY:
mem_set_p2m_entry_process(p);
break;
case MEM_DECREASE_RESERVATION:
mem_decrease_reservation_process(p);
break;
case MEM_POD_POPULATE:
mem_pod_populate_process(p);
break;
case MEM_POD_ZERO_RECLAIM:
mem_pod_zero_reclaim_process(p);
break;
case MEM_POD_SUPERPAGE_SPLINTER:
mem_pod_superpage_splinter_process(p);
break;
default:
if(opt.dump_all) {
dump_generic(stdout, ri);
}
if(opt.summary_info && minor < MEM_MAX) {
struct domain_data *d;
if(p->current) {
if (p->current->d) {
p->current->d->memops.done[minor]++;
p->current->d->memops.done_interval[minor]++;
}
if((d=domain_find(r->dom))) {
d->memops.done_for[minor]++;
d->memops.done_for_interval[minor]++;
}
}
}
break;
}
}
/* ---- PM ---- */
#define CSTATE_MAX 5
#define CSTATE_INVALID ((CSTATE_MAX)+1)
void pm_process(struct pcpu_info *p) {
struct record_info *ri = &p->ri;
switch ( ri->event )
{
case TRC_PM_FREQ_CHANGE:
if (opt.dump_all )
printf(" %s pm_freq_change o%d n%d\n",
ri->dump_header,
ri->d[0],
ri->d[1]);
break;
case TRC_PM_IDLE_ENTRY:
if (opt.dump_all )
printf(" %s pm_idle_start c%d\n",
ri->dump_header,
ri->d[0]);
if ( ri->d[0] <= CSTATE_MAX )
{
p->power_state=ri->d[0];
pcpu_string_draw(p);
}
break;
case TRC_PM_IDLE_EXIT:
if (opt.dump_all )
printf(" %s pm_idle_end c%d\n",
ri->dump_header,
ri->d[0]);
if ( p->power_state != ri->d[0]
&& p->power_state != CSTATE_INVALID )
printf("Strange, pm_idle_end %d, power_state %d!\n",
ri->d[0], p->power_state);
p->power_state = 0;
pcpu_string_draw(p);
break;
default:
if(opt.dump_all) {
dump_generic(stdout, ri);
}
break;
}
}
/*
* IRQ related stuff
*/
#define MAX_VECTOR 256
int global_vector_used[MAX_VECTOR] = {0};
struct pci_dev {
uint8_t bus;
uint8_t devfn;
int vector_used[MAX_VECTOR];
struct pci_dev *next;
} *pdev_list;
#define MAX_IRQ 512
struct irq_desc {
enum {
IRQ_NONE,
IRQ_MSI,
IRQ_GSI
} type;
struct pci_dev *dev;
} irq_table[MAX_IRQ];
struct pci_dev * pdev_find(uint8_t bus, uint8_t devfn)
{
struct pci_dev *d, *n, **q;
/* Look for the device, keeping track of the last pointer so we can add
a device if we need to. */
for ( d = pdev_list, q=&pdev_list ;
d && ( (d->bus < bus)
|| (d->bus == bus && d->devfn < devfn) ) ;
q = &d->next, d=d->next ) ;
if(d && d->bus == bus && d->devfn == devfn)
return d;
/* Make a new device */
fprintf(warn, "Creating pdev %02x:%02x.%x\n", bus, devfn>>3, devfn&7);
if((n=malloc(sizeof(*n)))==NULL)
{
fprintf(stderr, "%s: malloc %zd failed!\n", __func__, sizeof(*n));
error(ERR_SYSTEM, NULL);
}
bzero(n, sizeof(*n));
n->bus=bus;
n->devfn=devfn;
/* Insert it into the list */
n->next = d;
*q = n;
return n;
}
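/* Note on the devfn decode used in the warning above: PCI encodes
* devfn = (dev << 3) | fn, with a 5-bit device and a 3-bit function,
* so dev = devfn >> 3 and fn = devfn & 7. A minimal sketch with
* hypothetical values, never compiled or called:
*/
#if 0
uint8_t bus = 0x00, devfn = (0x03 << 3) | 1; /* device 3, function 1 */
printf("%02x:%02x.%x\n", bus, devfn >> 3, devfn & 7); /* "00:03.1" */
#endif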
void irq_process(struct pcpu_info *p) {
struct record_info *ri = &p->ri;
switch ( ri->event )
{
case TRC_HW_IRQ_BIND_VECTOR:
{
struct {
int irq, vec;
unsigned mask[4];
} *r = (typeof(r))ri->d;
if ( opt.dump_all )
{
printf(" %s irq_bind_vector irq %x vec %x mask %04x %04x %04x %04x\n",
ri->dump_header,
r->irq, r->vec,
r->mask[3],
r->mask[2],
r->mask[1],
r->mask[0]);
}
break;
}
case TRC_HW_IRQ_HANDLED:
{
struct {
int irq, start_tsc, end_tsc;
} *r = (typeof(r))ri->d;
int arctime;
arctime = r->end_tsc - r->start_tsc;
if ( opt.dump_all )
{
printf(" %s irq_handled irq %x %d (%d,%d)\n",
ri->dump_header,
r->irq, arctime, r->start_tsc, r->end_tsc);
}
if ( opt.scatterplot_irq )
{
struct time_struct t;
abs_cycles_to_time(ri->tsc, &t);
printf("i%x %u.%09u %d\n",
(unsigned)r->irq,
t.s, t.ns,
p->pid);
}
break;
}
case TRC_HW_IRQ_ASSIGN_VECTOR:
{
struct {
int irq, vec;
unsigned mask[4];
} *r = (typeof(r))ri->d;
if ( opt.dump_all )
{
printf(" %s irq_assign_vector irq %x vec %x mask %04x %04x %04x %04x\n",
ri->dump_header,
r->irq, r->vec,
r->mask[3],
r->mask[2],
r->mask[1],
r->mask[0]);
}
if ( r->irq < MAX_IRQ
&& r->vec < MAX_VECTOR )
{
if ( irq_table[r->irq].type == IRQ_MSI )
{
if(global_vector_used[r->vec])
fprintf(warn, " Vector collision on global table!\n");
global_vector_used[r->vec]=1;
}
if( irq_table[r->irq].dev )
{
struct pci_dev * pdev=irq_table[r->irq].dev;
if(pdev->vector_used[r->vec])
fprintf(warn, " Vector collision on %02x.%02x!\n",
pdev->bus, pdev->devfn);
pdev->vector_used[r->vec]=1;
}
}
break;
}
case TRC_HW_IRQ_MOVE_CLEANUP_DELAY:
{
struct {
int irq, vec, cpu;
} *r = (typeof(r))ri->d;
if ( opt.dump_all )
{
printf(" %s irq_move_cleanup_delay irq %x vec %x cpu %d\n",
ri->dump_header,
r->irq, r->vec, r->cpu);
}
break;
}
case TRC_HW_IRQ_MOVE_CLEANUP:
{
struct {
int irq;
int vec;
int cpu;
} *r = (typeof(r))ri->d;
if ( opt.dump_all )
{
printf(" %s irq_move_cleanup irq %x vec %x cpu %d\n",
ri->dump_header,
r->irq, r->vec, r->cpu);
}
if ( r->irq < MAX_IRQ
&& r->vec < MAX_VECTOR )
{
if ( irq_table[r->irq].type == IRQ_MSI )
{
if(!global_vector_used[r->vec])
fprintf(warn," Strange, cleanup on non-used vector\n");
global_vector_used[r->vec]=0;
}
if ( irq_table[r->irq].dev )
{
struct pci_dev * pdev=irq_table[r->irq].dev;
if(!pdev->vector_used[r->vec])
fprintf(warn," Strange, cleanup on non-used vector\n");
pdev->vector_used[r->vec]=0;
}
}
break;
}
case TRC_HW_IRQ_UNMAPPED_VECTOR:
{
struct {
int vec;
} *r = (typeof(r))ri->d;
if ( opt.dump_all )
{
printf(" %s irq_unmapped_vector vec %x\n",
ri->dump_header,
r->vec);
}
break;
}
case TRC_HW_IRQ_CLEAR_VECTOR:
case TRC_HW_IRQ_MOVE_FINISH :
default:
if(opt.dump_all) {
dump_generic(stdout, ri);
}
break;
}
}
#define TRC_HW_SUB_PM 1
#define TRC_HW_SUB_IRQ 2
void hw_process(struct pcpu_info *p)
{
struct record_info *ri = &p->ri;
switch(ri->evt.sub)
{
case TRC_HW_SUB_PM:
pm_process(p);
break;
case TRC_HW_SUB_IRQ:
irq_process(p);
break;
}
}
#define TRC_DOM0_SUB_DOMOPS 1
void dom0_process(struct pcpu_info *p)
{
struct record_info *ri = &p->ri;
switch(ri->evt.sub)
{
case TRC_DOM0_SUB_DOMOPS:
if(opt.dump_all) {
struct {
unsigned int domid;
} *r = (typeof(r))ri->d;
printf(" %s %s domain d%u\n", ri->dump_header,
ri->event == TRC_DOM0_DOM_ADD ? "creating" : "destroying",
r->domid);
}
break;
default:
process_generic(&p->ri);
}
}
/* ---- Base ----- */
void dump_generic(FILE * f, struct record_info *ri)
{
int i;
fprintf(f, "]%s %7x(%x:%x:%x) %u [",
ri->dump_header,
ri->event,
ri->evt.main,
ri->evt.sub,
ri->evt.minor,
ri->extra_words);
for(i=0; i<ri->extra_words; i++) {
fprintf(f, " %x", ri->d[i]);
}
fprintf(f, " ]\n");
}
void dump_raw(char * s, struct record_info *ri)
{
int i;
if(ri->rec.cycle_flag)
printf("%s %7x %d %14lld [",
s, ri->event, ri->extra_words, ri->tsc);
else
printf("%s %7x %d %14s [",
s, ri->event, ri->extra_words, "-");
for(i=0; i<7; i++) {
if ( i < ri->extra_words )
printf(" %8x", ri->d[i]);
else
printf(" ");
}
printf(" ] | ");
for (i=0; i<8; i++) {
printf(" %08x", ri->rec.raw[i]);
}
printf(" |\n");
}
void error(enum error_level l, struct record_info *ri)
{
if ( l > opt.tolerance )
{
if ( ri )
dump_generic(warn, ri);
exit(1);
}
}
int check_extra_words(struct record_info *ri,
int expected_size,
const char *record)
{
static int off_by_one = 0;
int expected_extra = expected_size / sizeof(unsigned int);
if(ri->extra_words != expected_extra
&& !(off_by_one && ri->extra_words == expected_extra + 1) )
{
if ( !off_by_one && ri->extra_words == expected_extra + 1 )
{
fprintf(warn, "Detected off-by-one bug; relaxing expectations\n");
off_by_one=1;
}
else {
fprintf(warn, "ERROR: %s extra_words %d, expected %d!\n",
record,
ri->extra_words, expected_extra);
error(ERR_RECORD, ri);
return 1;
}
}
return 0;
}
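/* Example of the arithmetic above: a record body declared as
* struct { uint64_t gfn, mfn; int d:16, order:16; }
* is 20 bytes, so expected_extra = 20 / sizeof(unsigned int) = 5;
* 5 extra words pass, 6 pass only once the off-by-one workaround has
* been triggered, and anything else raises ERR_RECORD. */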
void process_generic(struct record_info *ri) {
error(ERR_STRICT, ri);
if(opt.dump_all) {
dump_generic(stdout, ri);
}
}
int vcpu_set_data_type(struct vcpu_data *v, int type)
{
if (v->data_type == VCPU_DATA_NONE )
{
v->data_type = type;
switch(type)
{
case VCPU_DATA_HVM:
init_hvm_data(&v->hvm, v);
break;
default:
break;
}
}
else
assert(v->data_type == type);
return 0;
}
void lose_vcpu(struct vcpu_data *v, tsc_t tsc)
{
if(v->data_type == VCPU_DATA_HVM)
v->hvm.vmexit_valid=0;
runstate_update(v, RUNSTATE_LOST, tsc);
hvm_vlapic_clear(&v->vlapic);
if(v->data_type == VCPU_DATA_HVM) {
int i;
if(opt.dump_all)
printf(" [w2h] Clearing w2h state for d%dv%d\n",
v->d->did, v->vid);
v->hvm.w2h.interrupts=0;
v->hvm.w2h.vector=0;
v->hvm.w2h.waking = 0;
for(i=0; i<GUEST_INTERRUPT_MAX+1; i++) {
if(v->hvm.summary.guest_interrupt[i].start_tsc) {
printf(" Interrupt %d clearing start_tsc %lld\n",
i, v->hvm.summary.guest_interrupt[i].start_tsc);
}
v->hvm.summary.guest_interrupt[i].start_tsc = 0;
}
}
}
struct lost_record_struct {
int lost_records;
unsigned did:16,vid:16;
tsc_t first_tsc;
};
void process_lost_records(struct pcpu_info *p)
{
struct record_info *ri = &p->ri;
struct lost_record_struct *r = (typeof(r))ri->d;
tsc_t first_tsc; /* TSC of first record that was lost */
/* Sanity checks */
if(ri->extra_words != 4)
{
fprintf(warn, "FATAL: Lost record has unexpected extra words %d!\n",
ri->extra_words);
error(ERR_RECORD, ri);
return;
}
first_tsc = r->first_tsc;
if(opt.dump_all)
{
if(p->current)
printf(" %s lost_records count %d d%uv%u (cur d%dv%d) first_tsc %lld\n",
ri->dump_header, r->lost_records,
r->did, r->vid,
p->current->d->did, p->current->vid,
r->first_tsc);
else
printf(" %s lost_records count %d d%uv%u (cur X) first_tsc %lld\n",
ri->dump_header, r->lost_records,
r->did, r->vid,
r->first_tsc);
}
#if 0
if(opt.dump_trace_volume_on_lost_record)
volume_summary(&p->volume.last_buffer);
#endif
if ( p->current ) {
hvm_vlapic_clear(&p->current->vlapic);
if(p->current->data_type == VCPU_DATA_HVM) {
p->current->hvm.vmexit_valid=0;
cr3_switch(0, &p->current->hvm);
}
/* We may lose scheduling records; so we need to:
* - Point all records until now to the next schedule in the
* "default" domain
* - Make sure there are no warnings / strangeness with the
* current vcpu (if it gets scheduled elsewhere).
*/
vcpu_prev_update(p, p->current, first_tsc, RUNSTATE_LOST);
}
#if 0
vcpu_next_update(p, default_domain.vcpu[p->pid], first_tsc);
if(p->current->data_type == VCPU_DATA_HVM) {
p->current->hvm.vmexit_valid=0;
}
#endif
/* The lost record trace is processed early -- i.e.,
* after the last good record, rather than when the next
* record is processed. Between the time it's processed and
* the time it actually went in, the vcpu may be scheduled on
* other processors. So we can't switch vcpus until the first
* TSC'd record after the lost record. */
if(!p->lost_record.active) {
P.lost_cpus++;
if(P.lost_cpus > P.max_active_pcpu + 1) {
fprintf(warn, "ERROR: P.lost_cpus %d > P.max_active_pcpu + 1 %d!\n",
P.lost_cpus, P.max_active_pcpu + 1);
error(ERR_ASSERT, NULL);
}
} else
fprintf(warn, "Strange, lost record for pcpu %d, but lost_record still active!\n",
p->pid);
p->lost_record.active = 1;
p->lost_record.tsc = first_tsc;
pcpu_string_draw(p);
{
/* Any vcpu which is not actively running may be scheduled on the
* lost cpu. To avoid mis-accounting, we need to reset */
struct domain_data *d;
int i;
for(d=domain_list ; d; d=d->next)
{
if(d->did != DEFAULT_DOMAIN) {
for(i=0; i<MAX_CPUS; i++)
if(d->vcpu[i] &&
d->vcpu[i]->runstate.state != RUNSTATE_RUNNING) {
if(opt.dump_all)
fprintf(warn, "%s: setting d%dv%d to RUNSTATE_LOST\n",
__func__, d->did, i);
lose_vcpu(d->vcpu[i], first_tsc);
}
}
}
}
p->lost_record.domain_valid=1;
p->lost_record.did=r->did;
p->lost_record.vid=r->vid;
}
void process_lost_records_end(struct pcpu_info *p)
{
struct record_info *ri = &p->ri;
struct lost_record_struct *r = (typeof(r))ri->d;
if(!p->lost_record.active) {
fprintf(warn, "FATAL: lost_records_end but pid %d not lost!\n",
p->pid);
error(ERR_FILE, NULL);
return;
}
/* Lost records. If this is the first record on a pcpu after the loss,
* update the information. */
if(ri->tsc > p->lost_record.tsc)
{
if(opt.dump_all)
printf(" %s lost_records end ---\n",
pcpu_string(p->pid));
update_cycles(&p->time.lost, ri->tsc - p->lost_record.tsc);
if(p->lost_record.domain_valid) {
int did = p->lost_record.did,
vid = p->lost_record.vid;
if(opt.dump_all)
printf(" %s lost_records end d%dv%d---\n",
pcpu_string(p->pid),
did, vid);
if(p->current)
{
fprintf(warn, "FATAL: lost_record valid (d%dv%d), but current d%dv%d!\n",
did, vid,
p->current->d->did, p->current->vid);
error(ERR_FILE, NULL);
return;
}
if(opt.dump_all)
fprintf(warn, "Changing p%d current to d%dv%d\n",
p->pid, did, vid);
vcpu_next_update(p,
vcpu_find(did, vid),
ri->tsc);
p->lost_record.domain_valid=0;
p->lost_record.seen_valid_schedule=0; /* Let next vcpu_next_update know that
this one was inferred */
} else {
if(opt.dump_all)
printf(" %s lost_records end (domain invalid)---\n",
pcpu_string(p->pid));
}
p->lost_record.active = 0;
pcpu_string_draw(p);
P.lost_cpus--;
if(P.lost_cpus < 0) {
fprintf(warn, "ERROR: lost_cpus fell below 0 for pcpu %d!\n",
p->pid);
error(ERR_ASSERT, NULL);
}
}
}
void base_process(struct pcpu_info *p) {
struct record_info *ri = &p->ri;
switch(ri->event)
{
case TRC_TRACE_WRAP_BUFFER:
break;
case TRC_LOST_RECORDS:
process_lost_records(p);
break;
case TRC_LOST_RECORDS_END:
process_lost_records_end(p);
break;
default:
process_generic(ri);
}
}
/* Non-compat only */
void record_order_insert(struct pcpu_info *new);
void record_order_remove(struct pcpu_info *rem);
void record_order_bubble(struct pcpu_info *last);
struct cpu_change_data {
int cpu;
unsigned window_size;
};
void activate_early_eof(void) {
struct pcpu_info *p;
int i;
fprintf(warn, "Short cpu_change window, activating early_eof\n");
P.early_eof = 1;
for(i=0; i<=P.max_active_pcpu; i++) {
p = P.pcpu + i;
if(p->active && p->file_offset > P.last_epoch_offset) {
fprintf(warn, " deactivating pid %d\n",
p->pid);
p->active = 0;
}
}
}
off_t scan_for_new_pcpu(off_t offset) {
ssize_t r;
struct trace_record rec;
struct cpu_change_data *cd;
r=__read_record(&rec, offset);
if(r==0)
return 0;
if(rec.event != TRC_TRACE_CPU_CHANGE
|| rec.cycle_flag)
{
fprintf(stderr, "%s: Unexpected record event %x!\n",
__func__, rec.event);
error(ERR_ASSERT, NULL); /* Actually file, but can't recover */
}
cd = (typeof(cd))rec.u.notsc.data;
if ( cd->cpu > MAX_CPUS )
{
fprintf(stderr, "%s: cpu %d exceeds MAX_CPU %d!\n",
__func__, cd->cpu, MAX_CPUS);
/* FIXME: Figure out if we could handle this more gracefully */
error(ERR_ASSERT, NULL);
}
if(cd->cpu > P.max_active_pcpu || !P.pcpu[cd->cpu].active) {
struct pcpu_info *p = P.pcpu + cd->cpu;
fprintf(warn, "%s: Activating pcpu %d at offset %lld\n",
__func__, cd->cpu, (unsigned long long)offset);
p->active = 1;
/* Process this cpu_change record first */
p->ri.rec = rec;
p->ri.size = r;
__fill_in_record_info(p);
p->file_offset = offset;
p->next_cpu_change_offset = offset;
record_order_insert(p);
offset += r + cd->window_size;
sched_default_vcpu_activate(p);
if ( cd->cpu > P.max_active_pcpu )
P.max_active_pcpu = cd->cpu;
return offset;
} else {
return 0;
}
}
/*
* Conceptually, when we reach a cpu_change record that's not for our pcpu,
* we want to scan forward through the file until we reach one that's for us.
* However, looping through involves reading the file, which we'd rather
* do in one place. Because cpu_change records don't include a tsc,
* the same pcpu will be processed repeatedly until the cpu_change
* equals p->pid.
*
* There are two additional things we need to do in this algorithm:
* + Detect new pcpus as they come online
* + De-activate pcpus which don't have any more records
*
* Detecting new pcpus which are less than P.max_active_pcpu is straight-
* forward: when max_active_pcpu is searching for its next cpu window,
* it will pass by the new cpu's window, and can activate it then.
*
* Detecting new pcpus greater than P.max_active_pcpu is a little harder;
* When max_active_pcpu is scanning for its next cpu window, after it's found
* it, we need to scan one more window forward to see if it's an already-active
* pcpu; if not, activate it.
*
* We also need to deal with truncated files, where records from one pcpu may
* be present but not from another pcpu due to lack of disk space. The best
* thing to do is to find the last "epoch" and essentially truncate the file
* to that.
*/
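/* Illustration of the layout being scanned (hypothetical offsets and
* sizes, not real data):
*
* 0x0000: cpu_change { cpu=0, window_size=0x1000 } + 0x1000 bytes of records
* 0x100c: cpu_change { cpu=1, window_size=0x0800 } + 0x0800 bytes of records
* 0x1818: cpu_change { cpu=0, ... } <- cpu id wraps: new epoch
*
* Each pcpu's reader hops from cpu_change to cpu_change, skipping
* windows that aren't its own; seeing the cpu id drop back down
* (1 -> 0) marks the start of a new epoch (see process_cpu_change()). */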
void deactivate_pcpu(struct pcpu_info *p)
{
if ( p->current )
{
pcpu_runstate_update(p, p->last_tsc);
fprintf(warn, "%s: setting d%dv%d to state LOST\n",
__func__, p->current->d->did,
p->current->vid);
lose_vcpu(p->current, p->last_tsc);
}
p->active = 0;
record_order_remove(p);
if ( p->pid == P.max_active_pcpu )
{
int i, max_active_pcpu = -1;
for(i=0; i<=P.max_active_pcpu; i++)
{
if(!P.pcpu[i].active)
continue;
max_active_pcpu = i;
}
P.max_active_pcpu = max_active_pcpu;
fprintf(warn, "%s: Setting max_active_pcpu to %d\n",
__func__, max_active_pcpu);
}
}
/* Helper function to process tsc-related record info */
void process_record_tsc(tsc_t order_tsc, struct record_info *ri)
{
/* Find the first tsc set */
if(ri->tsc && ri->tsc >= P.f.first_tsc) {
/* We use the order_tsc to account for the second processing of
* a lost record. */
tsc_t tsc = order_tsc;
if(P.f.first_tsc == 0) {
P.f.first_tsc = tsc;
if ( opt.interval_mode ) {
P.interval.start_tsc = tsc;
}
} else {
if ( opt.interval_mode ) {
if(P.interval.start_tsc > tsc) {
fprintf(warn, "FATAL: order_tsc %lld < interval.start_tsc %lld!\n",
tsc, P.interval.start_tsc);
error(ERR_FILE, NULL);
} else {
while ( tsc - P.interval.start_tsc > opt.interval.cycles ) {
interval_callback();
P.interval.start_tsc += opt.interval.cycles;
}
}
}
}
P.f.last_tsc=tsc;
P.f.total_cycles = P.f.last_tsc - P.f.first_tsc;
P.now = tsc;
}
}
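/* Worked example of the interval loop above (made-up numbers): with
* opt.interval.cycles == 2400000 (1ms at 2.4GHz) and
* P.interval.start_tsc == 1000000, a record ordered at tsc 6000000
* fires interval_callback() twice and leaves start_tsc at 5800000,
* i.e. within one interval of the record. */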
/* Standardized part of dump output */
void create_dump_header(struct record_info *ri, struct pcpu_info *p)
{
char * c;
int len, r;
len = DUMP_HEADER_MAX;
c = ri->dump_header;
abs_cycles_to_time(ri->tsc, &ri->t);
if ( ri->t.time )
{
r=snprintf(c, len, "%3u.%09u", ri->t.s, ri->t.ns);
c+=r;
len-=r;
}
else
{
r=snprintf(c,
len,
" ");
c+=r;
len-=r;
}
r = snprintf(c, len, " %s", pcpu_string(ri->cpu));
c+=r;
len-=r;
if ( p->current )
{
r = snprintf(c, len, " d%dv%d", p->current->d->did, p->current->vid);
c+=r;
len-=r;
}
else
{
r = snprintf(c, len, " d?v?");
c+=r;
len-=r;
}
}
int find_toplevel_event(struct record_info *ri)
{
int toplevel=0, i, count;
for(i=0, count=0; i<TOPLEVEL_MAX; i++)
if(ri->evt.main & (1UL<<i))
{
toplevel=i;
count++;
}
if ( count != 1 )
{
fprintf(warn, "FATAL: Unexpected number of bits set in evt.main! event %x main %x sub %x minor %x\n",
ri->event,
ri->evt.main, ri->evt.sub, ri->evt.minor);
error(ERR_RECORD, NULL);
return -1;
}
return toplevel;
}
void process_cpu_change(struct pcpu_info *p) {
struct record_info *ri = &p->ri;
struct cpu_change_data *r = (typeof(r))ri->d;
if(opt.dump_all && verbosity >= 6) {
printf("]%s cpu_change this-cpu %u record-cpu %u window_size %u(0x%08x)\n",
ri->dump_header, p->pid, r->cpu, r->window_size,
r->window_size);
}
/* File sanity check */
if(p->file_offset != p->next_cpu_change_offset) {
fprintf(warn, "Strange, pcpu %d expected offset %llx, actual %llx!\n",
p->pid, (unsigned long long)p->next_cpu_change_offset,
(unsigned long long)p->file_offset);
}
if(r->cpu > MAX_CPUS)
{
fprintf(stderr, "FATAL: cpu %d > MAX_CPUS %d.\n",
r->cpu, MAX_CPUS);
/* Actually file, but takes some work to skip */
error(ERR_ASSERT, NULL);
}
/* Detect beginning of new "epoch" while scanning thru file */
if((p->last_cpu_change_pid > r->cpu)
&& (p->file_offset > P.last_epoch_offset)) {
P.last_epoch_offset = p->file_offset;
}
/* If that pcpu has never been activated, activate it. */
if(!P.pcpu[r->cpu].active && P.pcpu[r->cpu].file_offset == 0)
{
struct pcpu_info * p2 = P.pcpu + r->cpu;
p2->active = 1;
if(r->cpu > P.max_active_pcpu)
P.max_active_pcpu = r->cpu;
/* Taking this record as the first record should make everything
* run swimmingly. */
p2->ri = *ri;
p2->ri.cpu = r->cpu;
p2->ri.d = p2->ri.rec.u.notsc.data;
p2->file_offset = p->file_offset;
p2->next_cpu_change_offset = p->file_offset;
fprintf(warn, "%s: Activating pcpu %d at offset %lld\n",
__func__, r->cpu, (unsigned long long)p->file_offset);
record_order_insert(p2);
sched_default_vcpu_activate(p2);
}
p->last_cpu_change_pid = r->cpu;
/* If this isn't the cpu we're looking for, skip the whole bunch */
if(p->pid != r->cpu)
{
p->file_offset += ri->size + r->window_size;
p->next_cpu_change_offset = p->file_offset;
if(p->file_offset > G.file_size) {
activate_early_eof();
} else if(P.early_eof && p->file_offset > P.last_epoch_offset) {
fprintf(warn, "%s: early_eof activated, pcpu %d past last_epoch_offset %llx, deactivating.\n",
__func__, p->pid, (unsigned long long)P.last_epoch_offset);
deactivate_pcpu(p);
}
}
else
{
/* Track information about dom0 scheduling and records */
if(opt.dump_trace_volume_on_lost_record) {
tsc_t cycles;
struct time_struct t;
/* Update dom0 runstates */
cycles = (p->volume.buffer_first_tsc > p->volume.buffer_dom0_runstate_tsc) ?
p->volume.buffer_first_tsc :
p->volume.buffer_dom0_runstate_tsc;
p->volume.buffer_dom0_runstate_cycles[p->volume.buffer_dom0_runstate]
+= ri->tsc - cycles;
printf(" - updated p%d dom0_runstate %s to %lld cycles (+%lld)\n",
p->pid, runstate_name[p->volume.buffer_dom0_runstate],
p->volume.buffer_dom0_runstate_cycles[p->volume.buffer_dom0_runstate],
ri->tsc - cycles);
/* print info */
cycles = ri->tsc - p->volume.buffer_first_tsc;
cycles_to_time(cycles, &t);
printf("Buffer time: %u.%09u (%lld cycles)\n",
t.s, t.ns, cycles);
if(p->volume.buffer_size)
printf("Rate: %lld cycles / byte\n",
cycles / p->volume.buffer_size);
if(P.buffer_trace_virq_tsc)
{
cycles = ri->tsc - P.buffer_trace_virq_tsc;
cycles_to_time(cycles, &t);
printf("trace_virq latency: %u.%09u (%lld cycles)\n",
t.s, t.ns, cycles);
P.buffer_trace_virq_tsc = 0;
}
else
{
printf("No trace_virq record found.\n");
}
printf("Dom0 runstates this buffer:\n");
runstate_summary(p->volume.buffer_dom0_runstate_cycles);
volume_summary(&p->volume.last_buffer);
/* reset info */
p->volume.buffer_first_tsc = 0;
p->volume.buffer_size = r->window_size;
runstate_clear(p->volume.buffer_dom0_runstate_cycles);
volume_clear(&p->volume.last_buffer);
}
p->file_offset += ri->size;
p->next_cpu_change_offset = p->file_offset + r->window_size;
if(p->next_cpu_change_offset > G.file_size)
activate_early_eof();
else if(p->pid == P.max_active_pcpu)
scan_for_new_pcpu(p->next_cpu_change_offset);
}
}
struct tl_assert_mask {
unsigned p_current:1,
not_idle_domain:1;
int vcpu_data_mode;
};
static struct tl_assert_mask tl_assert_checks[TOPLEVEL_MAX] = {
[TRC_HVM_MAIN]={ .p_current=1, .not_idle_domain=1, .vcpu_data_mode=VCPU_DATA_HVM },
[TRC_SHADOW_MAIN]={ .p_current=1, .not_idle_domain=1, .vcpu_data_mode=VCPU_DATA_HVM },
[TRC_PV_MAIN]={ .p_current=1, .not_idle_domain=1, .vcpu_data_mode=VCPU_DATA_PV },
};
/* There are a lot of common assumptions for the various processing
* routines. Check them all in one place, doing something else if
* they don't pass. */
int toplevel_assert_check(int toplevel, struct pcpu_info *p)
{
struct tl_assert_mask mask;
mask = tl_assert_checks[toplevel];
if (mask.p_current && p->current == NULL)
{
fprintf(warn, "WARNING: p->current null! Not processing\n");
goto fail;
}
if( mask.not_idle_domain )
{
/* Can't do this check w/o first doing above check */
assert(mask.p_current);
if ( p->current->d->did == IDLE_DOMAIN) {
fprintf(warn, "WARNING: Unexpected record for idle domain! Not processing\n");
goto fail;
}
}
if ( mask.vcpu_data_mode )
{
struct vcpu_data *v;
assert(mask.p_current);
v = p->current;
if ( ! (v->data_type == VCPU_DATA_NONE
|| v->data_type == mask.vcpu_data_mode) )
{
/* This may happen for track_dirty_vram, which causes a SHADOW_WRMAP_BF trace from dom0 */
fprintf(warn, "WARNING: Unexpected vcpu data type for d%dv%d on proc %d! Expected %d got %d. Not processing\n",
v->d->did, v->vid, p->pid,
mask.vcpu_data_mode,
v->data_type);
goto fail;
}
}
return 1;
fail:
dump_generic(warn, &p->ri);
return 0;
}
void process_record(struct pcpu_info *p) {
struct record_info *ri = &p->ri;
int toplevel;
/* Process only TRC_TRACE_CPU_CHANGE */
if(ri->event == TRC_TRACE_CPU_CHANGE) {
process_cpu_change(p);
return;
}
if ( opt.dump_no_processing )
goto out;
p->summary = 1;
if( opt.dump_raw_process )
dump_raw("* ", ri);
process_record_tsc(p->order_tsc, ri);
if(opt.dump_all)
create_dump_header(ri, p);
toplevel = find_toplevel_event(ri);
if ( toplevel < 0 )
return;
/* Unify toplevel assertions */
if ( toplevel_assert_check(toplevel, p) )
{
switch(toplevel) {
case TRC_GEN_MAIN:
base_process(p);
break;
case TRC_SCHED_MAIN:
sched_process(p);
break;
case TRC_HVM_MAIN:
hvm_process(p);
break;
case TRC_SHADOW_MAIN:
shadow_process(p);
break;
case TRC_PV_MAIN:
pv_process(p);
break;
case TRC_MEM_MAIN:
mem_process(p);
break;
case TRC_HW_MAIN:
hw_process(p);
break;
case TRC_DOM0OP_MAIN:
dom0_process(p);
break;
default:
process_generic(ri);
}
}
UPDATE_VOLUME(p, toplevel[toplevel], ri->size);
if(!p->volume.buffer_first_tsc)
p->volume.buffer_first_tsc = ri->tsc;
out:
/* Lost records gets processed twice */
if(ri->event != TRC_LOST_RECORDS)
p->file_offset += ri->size;
}
static inline ssize_t get_rec_size(struct trace_record *rec) {
ssize_t s;
s = sizeof(uint32_t);
if(rec->cycle_flag)
s += sizeof(tsc_t);
s += rec->extra_words * sizeof(uint32_t);
return s;
}
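/* Example: a record with cycle_flag set and 3 extra words occupies
* 4 (event/header word) + 8 (tsc) + 3*4 (payload) = 24 bytes on disk. */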
#define STDIN 0
void progress_child_exec(void) {
fclose(stdin);
dup2(G.progress.pipe[0], STDIN);
execlp("zenity", "zenity", "--progress", "--auto-close", "--title",
"Analyzing", "--text", G.trace_file, "--auto-kill", NULL);
}
void progress_init(void) {
int pid;
if (pipe(G.progress.pipe) < 0)
perror("pipe");
if(!(pid = fork())) {
progress_child_exec();
fprintf(stderr, "%s: exec failed (%s), disabling progress bar\n",
__func__, strerror(errno));
opt.progress = 0;
exit(1);
} else if( pid < 0 ) {
fprintf(stderr, "%s: could not fork: %s, disabling progress bar\n",
__func__, strerror(errno));
opt.progress = 0;
}
if( (G.progress.out = fdopen(G.progress.pipe[1], "w")) == NULL ) {
fprintf(stderr, "%s: could not fdopen pipe: %s, disabling progress bar\n",
__func__, strerror(errno));
opt.progress = 0;
}
}
void progress_update(off_t offset) {
long long p;
p = ( offset * 100 ) / G.file_size;
fprintf(G.progress.out, "%lld\n", p);
fflush(G.progress.out);
p += 1;
G.progress.update_offset = ( G.file_size * p ) / 100;
#if 0
fprintf(stderr, "Progress: %lld %% Next update_offset: %lld\n",
p-1,
G.progress.update_offset);
#endif
}
void progress_finish(void) {
int pid;
fprintf(G.progress.out, "100\n");
fflush(G.progress.out);
fclose(G.progress.out);
wait(NULL);
if(!(pid = fork())) {
/* Child */
char text[128];
snprintf(text, 128, "Finished analyzing %s",
G.trace_file);
execlp("zenity", "zenity", "--info", "--text", text, NULL);
}
}
ssize_t __read_record(struct trace_record *rec, off_t offset)
{
ssize_t r, rsize;
r=mread64(G.mh, rec, sizeof(*rec), offset);
if(r < 0) {
/* Read error */
perror("read");
fprintf(stderr, "offset %llx\n", (unsigned long long)offset);
return 0;
} else if(r==0) {
/* End-of-file */
return 0;
} else if(r < sizeof(uint32_t)) {
/* Full header not read */
fprintf(stderr, "%s: short read (%zd bytes)\n",
__func__, r);
error(ERR_SYSTEM, NULL);
}
rsize=get_rec_size(rec);
if(r < rsize) {
/* Full record not read */
fprintf(stderr, "%s: short read (%zd, expected %zd)\n",
__func__, r, rsize);
return 0;
}
return rsize;
}
void __fill_in_record_info(struct pcpu_info *p)
{
struct record_info *ri;
tsc_t tsc=0;
ri = &p->ri;
ri->event = ri->rec.event;
ri->extra_words = ri->rec.extra_words;
if(ri->rec.cycle_flag) {
tsc = (((tsc_t)ri->rec.u.tsc.tsc_hi) << 32)
| ri->rec.u.tsc.tsc_lo;
tsc += p->tsc_skew.offset;
ri->tsc = tsc;
ri->d = ri->rec.u.tsc.data;
if(p->first_tsc == 0)
p->first_tsc = tsc;
/* We process lost record twice: once at the first_tsc,
once at the time it was placed in the log */
if(ri->event == TRC_LOST_RECORDS && ri->extra_words == 4) {
struct lost_record_struct *r = (typeof(r))ri->d;
p->order_tsc = r->first_tsc + p->tsc_skew.offset;
} else
p->order_tsc = tsc;
p->last_tsc = tsc;
} else {
ri->tsc = p->last_tsc;
ri->d = ri->rec.u.notsc.data;
}
if ( opt.dump_raw_reads ) {
char s[256];
snprintf(s, 256, "R p%2d o%016llx ",
p->pid, (unsigned long long)p->file_offset);
dump_raw(s, ri);
}
/* Updated tracing uses CPU_CHANGE. If we hit one of these,
* it will be processed immediately (since the tsc isn't updated), and
* we'll skip forward appropriately. */
ri->cpu = p->pid;
}
ssize_t read_record(struct pcpu_info * p) {
off_t * offset;
struct record_info *ri;
offset = &p->file_offset;
ri = &p->ri;
ri->size = __read_record(&ri->rec, *offset);
if(ri->size)
{
__fill_in_record_info(p);
}
else
{
fprintf(warn, "%s: read returned zero, deactivating pcpu %d\n",
__func__, p->pid);
deactivate_pcpu(p);
}
return ri->size;
}
/*
* This function gets called for every record when doing a dump. Try to
* make it efficient by changing the minimum amount from the last
* call. Do this by:
* - Keeping track of the last pcpu called, so we can just redraw that one
* - Keeping track of how many pcpus we've "drawn", and only "drawing" new ones
* - Updating the current one
*
* FIXME: Need to deal with pcpu states changing...
*
* WARNING not thread-safe
*/
char __pcpu_string[MAX_CPUS+1] = { 0 };
void pcpu_string_draw(struct pcpu_info *p)
{
char *s = __pcpu_string;
int i=p->pid;
if(p->lost_record.active)
s[i]='l';
else if (!p->current)
s[i]=' ';
else if (p->current->d->did == DEFAULT_DOMAIN)
s[i]='.';
else if (p->current->d->did == IDLE_DOMAIN)
{
if ( opt.dump_show_power_states )
s[i]=p->power_state+'0';
else
s[i]='-';
}
else
s[i]='|';
}
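/* Legend for the per-pcpu column drawn above:
* 'l' in a lost-record region ' ' no current vcpu
* '.' running the default domain '-' idle domain (or the c-state
* '|' running a guest vcpu digit with --dump-show-power-states)
* 'x' the pcpu whose record is being dumped (set in pcpu_string()) */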
char * pcpu_string(int pcpu)
{
char *s = __pcpu_string;
static int max_active_pcpu=-1, last_pcpu=-1;
assert(P.max_active_pcpu < MAX_CPUS);
assert(pcpu <= P.max_active_pcpu);
if(last_pcpu >= 0)
pcpu_string_draw(P.pcpu+last_pcpu);
if(P.max_active_pcpu > max_active_pcpu)
{
int i;
for(i=max_active_pcpu + 1; i<= P.max_active_pcpu; i++)
pcpu_string_draw(P.pcpu+i);
max_active_pcpu=P.max_active_pcpu;
}
s[pcpu]='x';
last_pcpu = pcpu;
return s;
}
/* Null terminated */
struct pcpu_info *record_order[MAX_CPUS+1] = { 0 };
/* In the case of identical tsc values, the old algorithm would favor the
* pcpu with the lowest number. By default the new algorithm favors the
* pcpu which has been processed most recently.
*
* I think the second way is better; but it's good to be able to use the
* old ordering, at the very least to verify that there are no (other) ordering
* differences. Enabling the below flag will cause the insertion / bubble
* routines to order by pcpu id as well as tsc, preserving the old order. */
//#define PRESERVE_PCPU_ORDERING
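/* Example (hypothetical tscs): with record_order = [ p2@100, p0@105, p1@105 ],
* p2 is processed, re-reads a record at tsc 110, and bubbles down past
* both others, giving [ p0@105, p1@105, p2@110 ]. With
* PRESERVE_PCPU_ORDERING defined, the tie at tsc 105 would additionally
* be kept in pcpu-id order, reproducing the old algorithm exactly. */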
/* Steady state:
* + Entire list is in order, except (potentially) for the first entry
* + last is pointing to the first entry.
*/
void record_order_bubble(struct pcpu_info *last)
{
int i;
/* Find the pcpu to "bubble". This is usually the
* first one, but if other pcpus have been activated, it may
* not be. */
for(i=0; record_order[i] && record_order[i]!=last; i++);
assert(record_order[i]);
/* Now bubble it down */
for( ;
record_order[i+1]
&& ( record_order[i+1]->order_tsc < last->order_tsc
#ifdef PRESERVE_PCPU_ORDERING
|| ( record_order[i+1]->order_tsc == last->order_tsc
&& record_order[i+1]->pid < last->pid )
#endif
) ;
i++)
record_order[i]=record_order[i+1];
record_order[i]=last;
}
void record_order_insert(struct pcpu_info *new)
{
int i;
struct pcpu_info *p=NULL, *t=NULL;
/* Sanity check: Make sure it's not already in there */
for(i=0; record_order[i]; i++)
assert(record_order[i]!=new);
/* Find where to insert it */
for(i=0;
record_order[i]
&& ( record_order[i]->order_tsc < new->order_tsc
#ifdef PRESERVE_PCPU_ORDERING
|| ( record_order[i]->order_tsc == new->order_tsc
&& record_order[i]->pid < new->pid )
#endif
) ;
i++)
;
/* And insert it */
for( p=new; p ; i++)
{
t=record_order[i];
record_order[i]=p;
p=t;
}
}
void record_order_remove(struct pcpu_info *rem)
{
int i;
/* Find where the record is */
for(i=0; record_order[i] && record_order[i]!=rem; i++)
;
/* Sanity check: Make sure it's actually there! */
assert(record_order[i]);
/* And move everyone forward */
for(; (record_order[i]=record_order[i+1]); i++)
;
}
struct pcpu_info * choose_next_record(void)
{
struct pcpu_info *min_p=NULL;
min_p=record_order[0];
if(opt.progress && min_p && min_p->file_offset >= G.progress.update_offset)
progress_update(min_p->file_offset);
/* If there are active pcpus, make sure we chose one */
assert(min_p || (P.max_active_pcpu==-1));
return min_p;
}
void process_records(void) {
while(1) {
struct pcpu_info *p = NULL;
if(!(p=choose_next_record()))
return;
process_record(p);
/* Lost records gets processed twice. */
if(p->ri.event == TRC_LOST_RECORDS) {
p->ri.event = TRC_LOST_RECORDS_END;
if(p->ri.tsc > p->order_tsc)
p->order_tsc = p->ri.tsc;
else {
fprintf(warn, "Strange, lost_record ri->tsc %lld !> p->order_tsc %lld!\n",
p->ri.tsc, p->order_tsc);
error(ERR_FILE, NULL);
}
}
else
read_record(p);
/* Update this pcpu in the processing order */
if ( p->active )
record_order_bubble(p);
}
}
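/* Sequence of the double processing above: a TRC_LOST_RECORDS record is
* first handled at r->first_tsc (the order_tsc chosen in
* __fill_in_record_info()), then re-queued as TRC_LOST_RECORDS_END at
* the tsc it was actually written, which closes the lost window. */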
void vcpu_summary(struct vcpu_data *v)
{
printf("-- v%d --\n", v->vid);
sched_summary_vcpu(v);
switch(v->data_type) {
case VCPU_DATA_HVM:
hvm_summary(&v->hvm);
break;
case VCPU_DATA_PV:
pv_summary(&v->pv);
break;
default:
break;
}
}
void domain_summary(void)
{
struct domain_data * d;
int i;
if(opt.show_default_domain_summary) {
d = &default_domain;
printf("|-- Default domain --|\n");
for( i = 0; i < MAX_CPUS ; i++ )
{
if(d->vcpu[i])
vcpu_summary(d->vcpu[i]);
}
}
for ( d = domain_list ; d ; d=d->next )
{
int i;
printf("|-- Domain %d --|\n", d->did);
sched_summary_domain(d);
mem_summary_domain(d);
for( i = 0; i < MAX_CPUS ; i++ )
{
if(d->vcpu[i])
vcpu_summary(d->vcpu[i]);
}
printf("Emulate eip list\n");
dump_eip(d->emulate_eip_list);
if ( opt.with_interrupt_eip_enumeration )
{
printf("Interrupt eip list (vector %d)\n",
opt.interrupt_eip_enumeration_vector);
dump_eip(d->interrupt_eip_list);
}
cr3_dump_list(d->cr3_value_head);
}
}
char * stringify_cpu_hz(long long cpu_hz);
void summary(void) {
int i;
printf("Total time: %.2lf seconds (using cpu speed %s)\n",
((double)(P.f.total_cycles))/opt.cpu_hz,
stringify_cpu_hz(opt.cpu_hz));
printf("--- Log volume summary ---\n");
for(i=0; i<=P.max_active_pcpu; i++)
{
struct pcpu_info *p = P.pcpu + i;
if(!p->summary)
continue;
printf(" - cpu %d -\n", i);
volume_summary(&p->volume.total);
}
domain_summary();
}
void report_pcpu(void) {
int i, active=0;
for(i=0; i<=P.max_active_pcpu; i++)
{
struct pcpu_info *p = P.pcpu + i;
if(!p->summary)
continue;
printf("pcpu %d\n", i);
print_cycle_summary(&p->time.running, " running");
print_cycle_summary(&p->time.idle, " idle");
print_cycle_summary(&p->time.lost, " lost");
if ( p->time.running.count )
active++;
}
printf("Total active cpus: %d\n", active);
}
void init_pcpus(void) {
int i=0;
off_t offset = 0;
for(i=0; i<MAX_CPUS; i++)
{
P.pcpu[i].pid=i;
P.pcpu[i].power_state=CSTATE_INVALID;
P.pcpu[i].lost_record.seen_valid_schedule=1;
}
P.max_active_pcpu = -1;
/* Scan through the leading cpu_change records, activating each new
* pcpu found, until the scan stops turning up new ones. */
do {
offset = scan_for_new_pcpu(offset);
} while(offset);
}
char * stringify_cpu_hz(long long cpu_hz) {
static char cpu_string[20];
char suffix;
float hz;
if(cpu_hz > GHZ) {
hz = (float)cpu_hz / GHZ;
suffix = 'G';
} else if(cpu_hz > MHZ) {
hz = (float)cpu_hz / MHZ;
suffix = 'M';
} else if(cpu_hz > KHZ) {
hz = (float)cpu_hz / KHZ;
suffix = 'k';
} else {
hz = cpu_hz;
suffix = ' ';
}
snprintf(cpu_string, 20, "%1.2lf %cHz", hz, suffix);
return cpu_string;
}
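/* Example: 2400000000 -> "2.40 GHz"; 800000000 -> "800.00 MHz";
* 2400 -> "2.40 kHz". */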
int parse_array(char *arg, struct array_struct *a) {
char *p, *q;
int n=1, i;
/* Count the number of commas (and thus the number of elements) */
for(p=arg; *p; p++)
if(*p == ',')
n++;
fprintf(warn, "%s: Found %d elements\n", __func__, n);
fflush(warn);
a->count = n;
a->values = malloc(n * sizeof(unsigned long long));
if(!a->values) {
fprintf(stderr, "Malloc failed!\n");
error(ERR_SYSTEM, NULL);
}
/* Now parse the elements */
p = q = arg;
for(i=0; i<n; i++) {
a->values[i] = strtoull(p, &q, 0);
if(p == q) {
fprintf(stderr, "Bad format: %s\n", q);
return -1;
}
fprintf(warn, "%s: Found element 0x%llx (%lld)\n",
__func__, a->values[i],
a->values[i]);
fflush(warn);
if(*q == ',')
q++;
else if(*q != '\0') {
fprintf(stderr, "Bad format: %s\n", q);
return -1;
}
p=q;
}
return n;
}
error_t cmd_parser(int key, char *arg, struct argp_state *state)
{
switch (key)
{
/* Dump group */
case OPT_DUMP_ALL:
opt.dump_all = 1;
G.output_defined = 1;
break;
case OPT_DUMP_RAW_READS:
opt.dump_raw_reads = 1;
G.output_defined = 1;
break;
case OPT_DUMP_NO_PROCESSING:
opt.dump_no_processing = 1;
opt.dump_raw_reads = 1;
G.output_defined = 1;
break;
case OPT_DUMP_RAW_PROCESS:
opt.dump_raw_process = 1;
G.output_defined = 1;
break;
case OPT_DUMP_IPI_LATENCY:
opt.dump_ipi_latency = 1;
break;
case OPT_DUMP_TRACE_VOLUME_ON_LOST_RECORD:
opt.dump_trace_volume_on_lost_record = 1;
break;
case OPT_DUMP_SHOW_POWER_STATES:
opt.dump_show_power_states = 1;
break;
/* Extra group */
case OPT_WITH_CR3_ENUMERATION:
opt.with_cr3_enumeration=1;
break;
case OPT_WITH_PIO_ENUMERATION:
opt.with_pio_enumeration=1;
break;
case OPT_WITH_MMIO_ENUMERATION:
opt.with_mmio_enumeration=1;
break;
case OPT_SHOW_DEFAULT_DOMAIN_SUMMARY:
opt.show_default_domain_summary=1;
break;
case OPT_SAMPLE_SIZE:
{
char * inval;
opt.sample_size = (int)strtol(arg, &inval, 0);
if( inval == arg )
argp_usage(state);
break;
}
case OPT_SAMPLE_MAX:
{
char * inval;
opt.sample_max = (int)strtol(arg, &inval, 0);
if( inval == arg )
argp_usage(state);
break;
}
case OPT_MMIO_ENUMERATION_SKIP_VGA:
{
char * inval;
opt.mmio_enumeration_skip_vga = (int)strtol(arg, &inval, 0);
if( inval == arg )
argp_usage(state);
break;
}
case OPT_SCATTERPLOT_INTERRUPT_EIP:
{
char * inval;
G.output_defined = 1;
opt.scatterplot_interrupt_eip=1;
opt.scatterplot_interrupt_vector = (int)strtol(arg, &inval, 0);
if( inval == arg )
argp_usage(state);
}
break;
case OPT_WITH_INTERRUPT_EIP_ENUMERATION:
{
char * inval;
opt.with_interrupt_eip_enumeration=1;
opt.interrupt_eip_enumeration_vector = (int)strtol(arg, &inval, 0);
if( inval == arg )
argp_usage(state);
}
break;
case OPT_SCATTERPLOT_UNPIN_PROMOTE:
G.output_defined = 1;
opt.scatterplot_unpin_promote=1;
break;
case OPT_SCATTERPLOT_CR3_SWITCH:
G.output_defined = 1;
opt.scatterplot_cr3_switch=1;
break;
case OPT_SCATTERPLOT_WAKE_TO_HALT:
G.output_defined = 1;
opt.scatterplot_wake_to_halt=1;
break;
case OPT_SCATTERPLOT_VMEXIT_EIP:
G.output_defined = 1;
opt.scatterplot_vmexit_eip=1;
break;
case OPT_SCATTERPLOT_EXTINT_CYCLES:
{
char * inval;
G.output_defined = 1;
opt.scatterplot_extint_cycles=1;
opt.scatterplot_extint_cycles_vector = (int)strtol(arg, &inval, 0);
if( inval == arg )
argp_usage(state);
}
break;
case OPT_SCATTERPLOT_RDTSC:
G.output_defined = 1;
opt.scatterplot_rdtsc=1;
break;
case OPT_SCATTERPLOT_IRQ:
G.output_defined = 1;
opt.scatterplot_irq=1;
break;
case OPT_SCATTERPLOT_IO:
{
char * inval;
G.output_defined = 1;
opt.scatterplot_io=1;
opt.scatterplot_io_port = (int)strtol(arg, &inval, 0);
if( inval == arg )
argp_usage(state);
}
break;
case OPT_SCATTERPLOT_RUNSTATE:
G.output_defined = 1;
opt.scatterplot_runstate=1;
break;
case OPT_SCATTERPLOT_RUNSTATE_TIME:
G.output_defined = 1;
opt.scatterplot_runstate_time=1;
break;
case OPT_SCATTERPLOT_PCPU:
G.output_defined = 1;
opt.scatterplot_pcpu=1;
break;
case OPT_HISTOGRAM_INTERRUPT_EIP:
{
char * inval, *p;
opt.histogram_interrupt_eip=1;
opt.histogram_interrupt_vector = (int)strtol(arg, &inval, 0);
if( inval == arg )
argp_usage(state);
p = inval;
if(*p == ',')
opt.histogram_interrupt_increment = (unsigned long long)strtoull(p+1, &inval, 0);
else
opt.histogram_interrupt_increment = 0x1000000;
printf("Making histogram of eips at interrupt %d, increment %llx\n",
opt.histogram_interrupt_vector,
opt.histogram_interrupt_increment);
}
break;
case OPT_INTERVAL_LENGTH:
{
char * inval;
opt.interval.msec = (unsigned) (strtof(arg, &inval) * 1000);
if ( inval == arg )
argp_usage(state);
break;
}
case OPT_INTERVAL_CR3_SCHEDULE_TIME:
{
if(parse_array(arg, &opt.interval.array) < 0)
goto usage;
interval_table_alloc(opt.interval.array.count);
opt.interval.output = INTERVAL_CR3_SCHEDULE_TIME;
opt.interval.check = INTERVAL_CHECK_CR3;
opt.interval.mode = INTERVAL_MODE_ARRAY;
opt.interval_mode = 1;
opt.summary_info = 1;
opt.with_cr3_enumeration = 1;
G.output_defined = 1;
break;
usage:
fprintf(stderr, "Invalid input for cr3_schedule_time\n");
argp_usage(state);
break;
}
case OPT_INTERVAL_CR3_SCHEDULE_TIME_ALL:
opt.interval.output = INTERVAL_CR3_SCHEDULE_TIME;
opt.interval.check = INTERVAL_CHECK_CR3;
opt.interval.mode = INTERVAL_MODE_LIST;
opt.interval_mode = 1;
opt.summary_info = 1;
opt.with_cr3_enumeration = 1;
G.output_defined = 1;
break;
case OPT_INTERVAL_CR3_SCHEDULE_ORDERED:
opt.interval.output = INTERVAL_CR3_SCHEDULE_ORDERED;
opt.interval.check = INTERVAL_CHECK_CR3;
opt.interval_mode = 1;
opt.summary_info = 1;
opt.with_cr3_enumeration = 1;
G.output_defined = 1;
break;
case OPT_INTERVAL_CR3_SHORT_SUMMARY:
{
if(parse_array(arg, &opt.interval.array) < 0
|| opt.interval.array.count != 1)
goto usage;
opt.interval.output = INTERVAL_CR3_SHORT_SUMMARY;
opt.interval.check = INTERVAL_CHECK_CR3;
opt.interval_mode = 1;
opt.summary_info = 1;
opt.with_cr3_enumeration = 1;
G.output_defined = 1;
break;
}
case OPT_INTERVAL_DOMAIN_TOTAL_TIME:
{
if(parse_array(arg, &opt.interval.array) < 0)
goto idtt_usage;
interval_table_alloc(opt.interval.array.count);
opt.interval.output = INTERVAL_DOMAIN_TOTAL_TIME;
opt.interval.check = INTERVAL_CHECK_DOMAIN;
opt.interval.mode = INTERVAL_MODE_ARRAY;
opt.interval_mode = 1;
opt.summary_info = 1;
G.output_defined = 1;
break;
idtt_usage:
fprintf(stderr, "Invalid input for domain_total_time\n");
argp_usage(state);
break;
}
case OPT_INTERVAL_DOMAIN_TOTAL_TIME_ALL:
opt.interval.output = INTERVAL_DOMAIN_TOTAL_TIME;
opt.interval.check = INTERVAL_CHECK_DOMAIN;
opt.interval.mode = INTERVAL_MODE_LIST;
opt.interval_mode = 1;
opt.summary_info = 1;
G.output_defined = 1;
break;
case OPT_INTERVAL_DOMAIN_SHORT_SUMMARY:
{
if((parse_array(arg, &opt.interval.array) < 0)
|| opt.interval.array.count != 1)
argp_usage(state);
opt.interval.output = INTERVAL_DOMAIN_SHORT_SUMMARY;
opt.interval.check = INTERVAL_CHECK_DOMAIN;
opt.interval_mode = 1;
opt.summary_info = 1;
G.output_defined = 1;
break;
}
case OPT_INTERVAL_DOMAIN_GUEST_INTERRUPT:
{
if((parse_array(arg, &opt.interval.array) < 0)
|| opt.interval.array.count != 1)
argp_usage(state);
opt.interval.output = INTERVAL_DOMAIN_GUEST_INTERRUPT;
opt.interval.check = INTERVAL_CHECK_DOMAIN;
opt.interval_mode = 1;
opt.summary_info = 1;
G.output_defined = 1;
break;
}
case OPT_INTERVAL_DOMAIN_GRANT_MAPS:
{
if((parse_array(arg, &opt.interval.array) < 0)
|| opt.interval.array.count != 1)
argp_usage(state);
opt.interval.output = INTERVAL_DOMAIN_GRANT_MAPS;
opt.interval.check = INTERVAL_CHECK_DOMAIN;
opt.interval_mode = 1;
opt.summary_info = 1;
G.output_defined = 1;
break;
}
/* Summary group */
case OPT_SUMMARY:
opt.summary = 1;
opt.summary_info = 1;
G.output_defined = 1;
break;
case OPT_REPORT_PCPU:
opt.report_pcpu = 1;
//opt.summary_info = 1;
G.output_defined = 1;
break;
/* Guest info group */
case OPT_DEFAULT_GUEST_PAGING_LEVELS:
{
char *inval;
opt.default_guest_paging_levels = (int)strtol(arg, &inval, 0);
if ( inval == arg )
argp_usage(state);
}
break;
case OPT_SYMBOL_FILE:
/* FIXME - strcpy */
G.symbol_file = arg;
break;
/* Hardware info group */
case OPT_SVM_MODE:
opt.svm_mode = 1;
break;
case OPT_CPU_HZ:
parse_cpu_hz(arg);
break;
case OPT_TOLERANCE:
{
char * inval;
opt.tolerance = (int)strtol(arg, &inval, 0);
if( inval == arg )
argp_usage(state);
if ( opt.tolerance > ERR_MAX_TOLERABLE )
{
fprintf(stderr, "ERROR: Max tolerable error %d\n",
ERR_MAX_TOLERABLE);
exit(1);
}
printf("Tolerating errors at or below %d\n",
opt.tolerance);
}
break;
case OPT_PROGRESS:
opt.progress = 1;
break;
case OPT_TSC_LOOP_FATAL:
opt.tsc_loop_fatal = 1;
break;
case ARGP_KEY_ARG:
{
/* FIXME - strcpy */
if (state->arg_num == 0)
G.trace_file = arg;
else
argp_usage(state);
}
break;
case ARGP_KEY_END:
{
if(opt.interval_mode) {
opt.interval.cycles = ( opt.interval.msec * opt.cpu_hz ) / 1000 ;
interval_header();
}
if(!G.output_defined)
{
fprintf(stderr, "No output defined, using summary.\n");
opt.summary = 1;
opt.summary_info = 1;
}
fprintf(stderr, "Using %s hardware-assisted virtualization.\n",
opt.svm_mode?"SVM":"VMX");
}
break;
default:
return ARGP_ERR_UNKNOWN;
}
return 0;
}
const struct argp_option cmd_opts[] = {
/* Dump group */
{ .name = "dump-all",
.key = OPT_DUMP_ALL,
.group = OPT_GROUP_DUMP,
.doc = "Dump all records as they come in.", },
{ .name = "dump-raw-reads",
.key = OPT_DUMP_RAW_READS,
.group = OPT_GROUP_DUMP,
.doc = "Dump raw data as it's read from disk. Useful mainly for debugging the analysis tool.", },
{ .name = "dump-no-processing",
.key = OPT_DUMP_NO_PROCESSING,
.group = OPT_GROUP_DUMP,
.doc = "Don't do any processing on records other than cpu changes. Implies dump-raw-reads (or you wouldn't get anything).", },
{ .name = "dump-raw-process",
.key = OPT_DUMP_RAW_PROCESS,
.group = OPT_GROUP_DUMP,
.doc = "Dump raw data as it's processed. Useful mainly for debugging the analysis tool.", },
{ .name = "dump-ipi-latency",
.key = OPT_DUMP_IPI_LATENCY,
.group = OPT_GROUP_DUMP,
.doc = "Dump IPI latency info as IPIs are delivered (vector 0xd1 only).", },
{ .name = "dump-trace-volume-on-lost-record",
.key = OPT_DUMP_TRACE_VOLUME_ON_LOST_RECORD,
.group = OPT_GROUP_DUMP,
.doc = "Dump the volume of trace types in the previous cpu buffer when a lost record is created.", },
{ .name = "dump-show-power-states",
.key = OPT_DUMP_SHOW_POWER_STATES,
.group = OPT_GROUP_DUMP,
.doc = "Show the power-state of the physical cpu when dumping output.", },
/* Extra processing group */
{ .name = "with-cr3-enumeration",
.key = OPT_WITH_CR3_ENUMERATION,
.group = OPT_GROUP_EXTRA,
.doc = "Keep track of per-cr3 values", },
{ .name = "with-pio-enumeration",
.key = OPT_WITH_PIO_ENUMERATION,
.group = OPT_GROUP_EXTRA,
.doc = "Report summary info on indiviaul IO addresses", },
{ .name = "with-mmio-enumeration",
.key = OPT_WITH_MMIO_ENUMERATION,
.group = OPT_GROUP_EXTRA,
.doc = "Report summary info on indiviaul MMIO addresses.", },
{ .name = "with-interrupt-eip-enumeration",
.key = OPT_WITH_INTERRUPT_EIP_ENUMERATION,
.arg = "vector",
.group = OPT_GROUP_EXTRA,
.doc = "Report a summary on eips interrupted by specified vector.", },
{ .name = "scatterplot-interrupt-eip",
.key = OPT_SCATTERPLOT_INTERRUPT_EIP,
.arg = "vector",
.group = OPT_GROUP_EXTRA,
.doc = "Output scatterplot of eips as a function of time.", },
{ .name = "scatterplot-extint-cycles",
.key = OPT_SCATTERPLOT_EXTINT_CYCLES,
.arg = "vector",
.group = OPT_GROUP_EXTRA,
.doc = "Output a scatterplot of vmexit cycles for external interrupts of the given vector as a function of time.", },
{ .name = "scatterplot-unpin-promote",
.key = OPT_SCATTERPLOT_UNPIN_PROMOTE,
.group = OPT_GROUP_EXTRA,
.doc = "Output scatterplot of unpins and promotions. If " \
"--with-cr3-enumeration is included, promotions include current cr3.", },
{ .name = "scatterplot-cr3-switch",
.key = OPT_SCATTERPLOT_CR3_SWITCH,
.group = OPT_GROUP_EXTRA,
.doc = "Output scatterplot of cr3 switches.", },
{ .name = "scatterplot-wake-to-halt",
.key = OPT_SCATTERPLOT_WAKE_TO_HALT,
.group = OPT_GROUP_EXTRA,
.doc = "Output scatterplot of wake-to-halt.", },
{ .name = "scatterplot-vmexit-eip",
.key = OPT_SCATTERPLOT_VMEXIT_EIP,
.group = OPT_GROUP_EXTRA,
.doc = "Output scatterplot of vmexit eips.", },
{ .name = "scatterplot-io",
.key = OPT_SCATTERPLOT_IO,
.arg = "port",
.group = OPT_GROUP_EXTRA,
.doc = "Output scatterplot of io latencies for givein address as a function of time.", },
{ .name = "scatterplot-runstate",
.key = OPT_SCATTERPLOT_RUNSTATE,
.group = OPT_GROUP_EXTRA,
.doc = "Output scatterplot of runstate.", },
{ .name = "scatterplot-runstate-time",
.key = OPT_SCATTERPLOT_RUNSTATE_TIME,
.group = OPT_GROUP_EXTRA,
.doc = "Output scatterplot of time in a runstate.", },
{ .name = "scatterplot-pcpu",
.key = OPT_SCATTERPLOT_PCPU,
.group = OPT_GROUP_EXTRA,
.doc = "Output scatterplot of which pcpu vcpus are run on.", },
{ .name = "scatterplot-rdtsc",
.key = OPT_SCATTERPLOT_RDTSC,
.group = OPT_GROUP_EXTRA,
.doc = "Output scatterplot of rdtsc values.", },
{ .name = "scatterplot-irq",
.key = OPT_SCATTERPLOT_IRQ,
.group = OPT_GROUP_EXTRA,
.doc = "Output scatterplot of irqs on pcpus.", },
{ .name = "histogram-interrupt-eip",
.key = OPT_HISTOGRAM_INTERRUPT_EIP,
.arg = "vector[,increment]",
.group = OPT_GROUP_EXTRA,
.doc = "Output histograms of eips.", },
{ .name = "interval",
.key = OPT_INTERVAL_LENGTH,
.arg = "sec",
.group = OPT_GROUP_INTERVAL,
.doc = "Interval length to do time-based graphs, in seconds", },
{ .name = "interval-cr3-schedule-time",
.key = OPT_INTERVAL_CR3_SCHEDULE_TIME,
.arg = "gmfn[,gmfn...]",
.group = OPT_GROUP_INTERVAL,
.doc = "Print a csv with the listed cr3 value(s) every interval.", },
{ .name = "interval-cr3-schedule-time-all",
.key = OPT_INTERVAL_CR3_SCHEDULE_TIME_ALL,
.group = OPT_GROUP_INTERVAL,
.doc = "Print a csv with all cr3 values every interval.", },
{ .name = "interval-cr3-schedule-ordered",
.key = OPT_INTERVAL_CR3_SCHEDULE_ORDERED,
.group = OPT_GROUP_INTERVAL,
.doc = "Print summary with the top 10 cr3 values every interval.", },
{ .name = "interval-cr3-short-summary",
.key = OPT_INTERVAL_CR3_SHORT_SUMMARY,
.arg = "gmfn",
.group = OPT_GROUP_INTERVAL,
.doc = "Print a csv with the hvm short summary of cr3 value every interval.", },
{ .name = "interval-domain-total-time",
.key = OPT_INTERVAL_DOMAIN_TOTAL_TIME,
.arg = "domain[,domain...]",
.group = OPT_GROUP_INTERVAL,
.doc = "Print a csv with the listed domain(s) total runtime every interval.", },
{ .name = "interval-domain-total-time-all",
.key = OPT_INTERVAL_DOMAIN_TOTAL_TIME_ALL,
.group = OPT_GROUP_INTERVAL,
.doc = "Print a csv with all domains every interval.", },
{ .name = "interval-domain-short-summary",
.key = OPT_INTERVAL_DOMAIN_SHORT_SUMMARY,
.arg = "domain-id",
.group = OPT_GROUP_INTERVAL,
.doc = "Print a csv with the hvm short summary of given domain every interval.", },
{ .name = "interval-domain-guest-interrupt",
.key = OPT_INTERVAL_DOMAIN_GUEST_INTERRUPT,
.arg = "domain-id",
.group = OPT_GROUP_INTERVAL,
.doc = "Print a csv with the guest interrupt count of given domain every interval.", },
{ .name = "interval-domain-grant-maps",
.key = OPT_INTERVAL_DOMAIN_GRANT_MAPS,
.arg = "domain-id",
.group = OPT_GROUP_INTERVAL,
.doc = "Print a csv with the grant maps done on behalf of a given domain every interval.", },
/* Summary group */
{ .name = "show-default-domain-summary",
.key = OPT_SHOW_DEFAULT_DOMAIN_SUMMARY,
.group = OPT_GROUP_SUMMARY,
.doc = "Show default domain information on summary", },
{ .name = "mmio-enumeration-skip-vga",
.key = OPT_MMIO_ENUMERATION_SKIP_VGA,
.arg = "[0|1]",
.group = OPT_GROUP_SUMMARY,
.doc = "Control whether we enumerate MMIO accesses to the VGA area, which can be extremely high during boot. Default: 0", },
{ .name = "sample-size",
.key = OPT_SAMPLE_SIZE,
.arg = "size",
.group = OPT_GROUP_SUMMARY,
.doc = "Start with [size] samples for percentile purposes. Enter 0 to" \
"disable. Default 1024.", },
{ .name = "sample-max",
.key = OPT_SAMPLE_MAX,
.arg = "size",
.group = OPT_GROUP_SUMMARY,
.doc = "Do not allow sample to grow beyond [size] samples for percentile"\
" purposes. Enter 0 for no limit.", },
{ .name = "summary",
.key = OPT_SUMMARY,
.group = OPT_GROUP_SUMMARY,
.doc = "Output a summary", },
{ .name = "report-pcpu",
.key = OPT_REPORT_PCPU,
.group = OPT_GROUP_SUMMARY,
.doc = "Report utilization for pcpus", },
/* Guest info */
{ .name = "default-guest-paging-levels",
.key = OPT_DEFAULT_GUEST_PAGING_LEVELS,
.group = OPT_GROUP_GUEST,
.arg = "L",
.doc = "Default guest paging levels. Mainly necessary for Rio, as Miami traces include guest paging levels where appropriate.", },
{ .name = "symbol-file",
.key = OPT_SYMBOL_FILE,
.group = OPT_GROUP_GUEST,
.arg = "filename",
.doc = "A symbol file for interpreting guest eips.", },
/* Hardware info */
{ .name = "cpu-hz",
.key = OPT_CPU_HZ,
.group = OPT_GROUP_HARDWARE,
.arg = "HZ",
.doc = "Cpu speed of the tracing host, used to convert tsc into seconds.", },
{ .name = "svm-mode",
.key = OPT_SVM_MODE,
.group = OPT_GROUP_HARDWARE,
.doc = "Assume AMD SVM-style vmexit error codes. (Default is Intel VMX.)", },
{ .name = "progress",
.key = OPT_PROGRESS,
.doc = "Progress dialog. Requires the zenity (GTK+) executable.", },
{ .name = "tsc-loop-fatal",
.key = OPT_TSC_LOOP_FATAL,
.doc = "Stop processing and exit if tsc skew tracking detects a dependency loop.", },
{ .name = "tolerance",
.key = OPT_TOLERANCE,
.arg = "errlevel",
.doc = "Sets tolerance for errors found in the file. Default is 3; max is 6.", },
{ 0 },
};
const struct argp parser_def = {
.options = cmd_opts,
.parser = cmd_parser,
.args_doc = "[trace file]",
.doc = "",
};
const char *argp_program_bug_address = "George Dunlap ";
int main(int argc, char *argv[]) {
/* Start with warn at stderr. */
warn = stderr;
argp_parse(&parser_def, argc, argv, 0, NULL, NULL);
if (G.trace_file == NULL)
exit(1);
if ( (G.fd = open(G.trace_file, O_RDONLY)) < 0) {
perror("open");
error(ERR_SYSTEM, NULL);
} else {
struct stat s;
fstat(G.fd, &s);
G.file_size = s.st_size;
}
if ( (G.mh = mread_init(G.fd)) == NULL )
perror("mread");
if (G.symbol_file != NULL)
parse_symbol_file(G.symbol_file);
if(opt.dump_all)
warn = stdout;
init_pcpus();
if(opt.progress)
progress_init();
process_records();
if(opt.interval_mode)
interval_tail();
if(opt.summary)
summary();
if(opt.report_pcpu)
report_pcpu();
if(opt.progress)
progress_finish();
return 0;
}
/*
* Local variables:
* mode: C
* c-set-style: "BSD"
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/