diff --git a/.appveyor.yml b/.appveyor.yml index 6d9d1651e03e44972fe0646c4b3c42b2675e617e..6abf1962a8734c8925ffb638fffb607cc7d66569 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -53,16 +53,16 @@ branches: platform: x64 -image: Visual Studio 2013 +image: Visual Studio 2015 build: verbosity: detailed # i#2406: Appveyor's global serialization makes it painful to use more than # one configuration. We no longer build packages with VS2010 and are -# dropping official support for it, meaning we only need to test VS2013 here. +# dropping official support for it, meaning we only need to test VS2015 here. configuration: - - 2013 + - 2015 install: ################################################## @@ -92,7 +92,7 @@ install: # XXX i#2145: point at Qt5 for testing drgui build. before_build: - - if "%configuration%"=="2013" call "c:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\vcvarsall.bat" x86 + - if "%configuration%"=="2015" call "c:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x86 - cd c:\projects\dynamorio build_script: - echo %PATH% # The perl in c:\perl can't open a pipe so we use cygwin perl. # XXX i#1967: can we pass "package" only when deploying to save time? - - c:\cygwin\bin\perl ../suite/runsuite_wrapper.pl -VV travis use_ninja package %EXTRA_ARGS% # Automated deployment of builds to GitHub Releases. # We rely on a Travis cron job to push a tag to the repo which then diff --git a/api/docs/release.dox b/api/docs/release.dox index 0ff106a7a6157fc4ca53a1d1cb3979eb80d3efd6..e51b901e001985d701a6f8b09ccebcb832459c09 100644 --- a/api/docs/release.dox +++ b/api/docs/release.dox @@ -133,7 +133,17 @@ Dr. Memory Framework (DRMF) in the same package as DynamoRIO. DRMF provides the umbra, drsyscall, and drsymcache Extensions for use by clients. 
-The changes between version \DR_VERSION and 7.1.0 include: +The changes between version \DR_VERSION and 7.1.0 include the following minor +compatibility changes: + + - Replaced NUM_SIMD_SLOTS with proc_num_simd_saved() and adds the define + #MCTX_NUM_SIMD_SLOTS. Clients may set(DynamoRIO_NUM_SIMD_SLOTS_COMPATIBILITY ON) + in order to provide the define NUM_SIMD_SLOTS using proc_num_simd_saved(). + The macro is not a constant expression and code relying on this needs to be + rewritten. DynamoRIO_NUM_SIMD_SLOTS_COMPATIBILITY is set automatically if + clients target version 7.0 or earlier. + +Further non-compatibility-affecting changes include: - Added drfront_set_verbose() to obtain diagnostics from drfrontendlib. diff --git a/core/arch/aarch64/clean_call_opt.c b/core/arch/aarch64/clean_call_opt.c index aad25ccdc222dfcc1e94cb66066c6c2df4d77810..fd815fefcd8455929a013d5149631a097d48d255 100644 --- a/core/arch/aarch64/clean_call_opt.c +++ b/core/arch/aarch64/clean_call_opt.c @@ -183,7 +183,7 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci) /* XXX implement bitset for optimisation */ memset(ci->reg_used, 0, sizeof(bool) * NUM_GP_REGS); ci->num_simd_used = 0; - memset(ci->simd_used, 0, sizeof(bool) * NUM_SIMD_REGS); + memset(ci->simd_used, 0, sizeof(bool) * MCTX_NUM_SIMD_SLOTS); ci->write_flags = false; num_regparm = MIN(ci->num_args, NUM_REGPARM); @@ -213,7 +213,7 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci) } /* SIMD register usage */ - for (i = 0; i < NUM_SIMD_REGS; i++) { + for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) { if (!ci->simd_used[i] && instr_uses_reg(instr, (DR_REG_Q0 + (reg_id_t)i))) { LOG(THREAD, LOG_CLEANCALL, 2, "CLEANCALL: callee " PFX " uses VREG%d at " PFX "\n", ci->start, i, diff --git a/core/arch/aarch64/proc.c b/core/arch/aarch64/proc.c index de2278dc376e4b0202c6f37494f2e956e7a73beb..04f3af61b5a23ca160413566ea396962ea136117 100644 --- a/core/arch/aarch64/proc.c +++ b/core/arch/aarch64/proc.c @@ -34,9 
+34,13 @@ #include "proc.h" #include "instr.h" +int num_simd_saved; + void proc_init_arch(void) { + num_simd_saved = MCTX_NUM_SIMD_SLOTS; + /* FIXME i#1569: NYI */ } @@ -61,6 +65,13 @@ proc_fpstate_save_size(void) return 0; } +DR_API +int +proc_num_simd_saved(void) +{ + return num_simd_saved; +} + DR_API size_t proc_save_fpstate(byte *buf) diff --git a/core/arch/aarchxx/mangle.c b/core/arch/aarchxx/mangle.c index 6cea69449d96ff76a199eac00b5ead4fc28eacb9..5255ba617ae6e8b1185492d6e65ff19bf98f8ede 100644 --- a/core/arch/aarchxx/mangle.c +++ b/core/arch/aarchxx/mangle.c @@ -409,7 +409,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, # endif if (cci == NULL) cci = &default_clean_call_info; - if (cci->preserve_mcontext || cci->num_simd_skip != NUM_SIMD_REGS) { + if (cci->preserve_mcontext || cci->num_simd_skip != MCTX_NUM_SIMD_SLOTS) { /* FIXME i#1551: once we add skipping of regs, need to keep shape here */ } /* FIXME i#1551: once we have cci->num_simd_skip, skip this if possible */ @@ -523,7 +523,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, insert_save_registers(dcontext, ilist, instr, cci->simd_skip, DR_REG_X0, DR_REG_Q0, false /* is_gpr */); - dstack_offs += (NUM_SIMD_SLOTS * sizeof(dr_simd_t)); + dstack_offs += (MCTX_NUM_SIMD_SLOTS * sizeof(dr_simd_t)); /* Restore the registers we used. 
*/ /* ldp x0, x1, [sp] */ @@ -544,7 +544,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, PRE(ilist, instr, INSTR_CREATE_vstmdb(dcontext, OPND_CREATE_MEMLIST(DR_REG_SP), SIMD_REG_LIST_LEN, SIMD_REG_LIST_0_15)); - dstack_offs += NUM_SIMD_SLOTS * sizeof(dr_simd_t); + dstack_offs += proc_num_simd_saved() * sizeof(dr_simd_t); /* pc and aflags */ if (cci->skip_save_flags) { /* even if we skip flag saves we want to keep mcontext shape */ @@ -635,7 +635,7 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, instrlist opnd_create_reg(DR_REG_SP))); current_offs = - get_clean_call_switch_stack_size() - NUM_SIMD_SLOTS * sizeof(dr_simd_t); + get_clean_call_switch_stack_size() - proc_num_simd_saved() * sizeof(dr_simd_t); /* add x0, x0, current_offs */ PRE(ilist, instr, diff --git a/core/arch/arch.c b/core/arch/arch.c index a25b59d0fd28b723ab2038df80d456892c6d2d5b..9b8389b02bb2b5dd5283f712bb190d060888be5d 100644 --- a/core/arch/arch.c +++ b/core/arch/arch.c @@ -3477,7 +3477,7 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml) #ifdef X86 if (preserve_xmm_caller_saved()) { int i, j; - for (i = 0; i < NUM_SIMD_SAVED; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { if (YMM_ENABLED()) { print_file(f, dump_xml ? "\t\tymm%d= \"0x" : "\tymm%d= 0x", i); for (j = 0; j < 8; j++) { @@ -3505,7 +3505,7 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml) #elif defined(ARM) { int i, j; - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { print_file(f, dump_xml ? 
"\t\tqd= \"0x" : "\tq%-3d= 0x", i); for (j = 0; j < 4; j++) { print_file(f, "%08x ", context->simd[i].u32[j]); diff --git a/core/arch/arch.h b/core/arch/arch.h index 51750a07a5ad96ce1b323bf4eba029ee6d378c81..520a5d864018ea4f91ef5e94661aebd807940c17 100644 --- a/core/arch/arch.h +++ b/core/arch/arch.h @@ -323,7 +323,6 @@ typedef enum { # define SHARED_GENCODE_MATCH_THREAD(dc) get_shared_gencode(dc) #endif -#define NUM_SIMD_REGS NUM_SIMD_SAVED #define NUM_GP_REGS DR_NUM_GPR_REGS /* Information about each individual clean call invocation site. @@ -339,7 +338,7 @@ typedef struct _clean_call_info_t { bool skip_save_flags; bool skip_clear_flags; uint num_simd_skip; - bool simd_skip[NUM_SIMD_REGS]; + bool simd_skip[MCTX_NUM_SIMD_SLOTS]; uint num_regs_skip; bool reg_skip[NUM_GP_REGS]; bool preserve_mcontext; /* even if skip reg save, preserve mcontext shape */ @@ -1371,18 +1370,18 @@ typedef struct _slot_t { /* data structure of clean call callee information. */ typedef struct _callee_info_t { - bool bailout; /* if we bail out on function analysis */ - uint num_args; /* number of args that will passed in */ - int num_instrs; /* total number of instructions of a function */ - app_pc start; /* entry point of a function */ - app_pc bwd_tgt; /* earliest backward branch target */ - app_pc fwd_tgt; /* last forward branch target */ - int num_simd_used; /* number of SIMD registers (xmms) used by callee */ - bool simd_used[NUM_SIMD_REGS]; /* SIMD (xmm/ymm) registers usage */ - bool reg_used[NUM_GP_REGS]; /* general purpose registers usage */ - int num_callee_save_regs; /* number of regs callee saved */ - bool callee_save_regs[NUM_GP_REGS]; /* callee-save registers */ - bool has_locals; /* if reference local via stack */ + bool bailout; /* if we bail out on function analysis */ + uint num_args; /* number of args that will passed in */ + int num_instrs; /* total number of instructions of a function */ + app_pc start; /* entry point of a function */ + app_pc bwd_tgt; /* earliest 
backward branch target */ + app_pc fwd_tgt; /* last forward branch target */ + int num_simd_used; /* number of SIMD registers (xmms) used by callee */ + bool simd_used[MCTX_NUM_SIMD_SLOTS]; /* SIMD (xmm/ymm) registers usage */ + bool reg_used[NUM_GP_REGS]; /* general purpose registers usage */ + int num_callee_save_regs; /* number of regs callee saved */ + bool callee_save_regs[NUM_GP_REGS]; /* callee-save registers */ + bool has_locals; /* if reference local via stack */ bool standard_fp; /* if standard reg (xbp/x29) is used as frame pointer */ bool opt_inline; /* can be inlined or not */ bool write_flags; /* if the function changes flags */ diff --git a/core/arch/arch_exports.h b/core/arch/arch_exports.h index a725628b671b5ed55e558bd6b77906a6be18c2cd..6284bd1c3bacc6f3d84c615fb56accc0e67b94cd 100644 --- a/core/arch/arch_exports.h +++ b/core/arch/arch_exports.h @@ -58,13 +58,12 @@ */ # define XMM_REG_SIZE 16 # define YMM_REG_SIZE 32 -# define XMM_SAVED_REG_SIZE YMM_REG_SIZE /* space in priv_mcontext_t for xmm/ymm */ -# define XMM_SLOTS_SIZE (NUM_SIMD_SLOTS * XMM_SAVED_REG_SIZE) -# define XMM_SAVED_SIZE (NUM_SIMD_SAVED * XMM_SAVED_REG_SIZE) +# define MCTX_SIMD_SLOT_SIZE YMM_REG_SIZE +# define MCTX_TOTAL_SIMD_SLOTS_SIZE (MCTX_NUM_SIMD_SLOTS * YMM_REG_SIZE) /* Indicates OS support, not just processor support (xref i#1278) */ # define YMM_ENABLED() (proc_avx_enabled()) # define YMMH_REG_SIZE (YMM_REG_SIZE / 2) /* upper half */ -# define YMMH_SAVED_SIZE (NUM_SIMD_SLOTS * YMMH_REG_SIZE) +# define MCTX_YMMH_SLOTS_SIZE (MCTX_NUM_SIMD_SLOTS * YMMH_REG_SIZE) #endif /* X86 */ /* Number of slots for spills from inlined clean calls. 
*/ diff --git a/core/arch/arm/arm.asm b/core/arch/arm/arm.asm index be3c58962301edb0f5cfd7d4eeb51280c95327dd..026cdcd2de20e8f6ba2e6ae30f02d7c3e13f54c0 100644 --- a/core/arch/arm/arm.asm +++ b/core/arch/arm/arm.asm @@ -59,18 +59,18 @@ DECL_EXTERN(initstack_mutex) #define is_exiting_OFFSET (dstack_OFFSET+1*ARG_SZ) #ifdef X64 -# define NUM_SIMD_SLOTS 32 -# define SIMD_REG_SIZE 16 -# define NUM_GPR_SLOTS 33 /* incl flags */ -# define GPR_REG_SIZE 8 +# define MCTX_NUM_SIMD_SLOTS 32 +# define SIMD_REG_SIZE 16 +# define NUM_GPR_SLOTS 33 /* incl flags */ +# define GPR_REG_SIZE 8 #else -# define NUM_SIMD_SLOTS 16 -# define SIMD_REG_SIZE 16 -# define NUM_GPR_SLOTS 17 /* incl flags */ -# define GPR_REG_SIZE 4 +# define MCTX_NUM_SIMD_SLOTS 16 +# define SIMD_REG_SIZE 16 +# define NUM_GPR_SLOTS 17 /* incl flags */ +# define GPR_REG_SIZE 4 #endif -#define PRE_SIMD_PADDING 0 -#define PRIV_MCXT_SIMD_SIZE (PRE_SIMD_PADDING + NUM_SIMD_SLOTS*SIMD_REG_SIZE) +#define PRE_SIMD_PADDING 0 +#define PRIV_MCXT_SIMD_SIZE (PRE_SIMD_PADDING + MCTX_NUM_SIMD_SLOTS*SIMD_REG_SIZE) #define PRIV_MCXT_SIZE (NUM_GPR_SLOTS*GPR_REG_SIZE + PRIV_MCXT_SIMD_SIZE) #define PRIV_MCXT_SP_FROM_SIMD (-(4*GPR_REG_SIZE)) /* flags, pc, lr, then sp */ #define PRIV_MCXT_PC_FROM_SIMD (-(2*GPR_REG_SIZE)) /* flags, then pc */ diff --git a/core/arch/arm/proc.c b/core/arch/arm/proc.c index be6854f2e62877f60f0fbc59672ec7c03a5e5a57..60d733be3d82823dfe93f28cb5b4d16b29e3cf6a 100644 --- a/core/arch/arm/proc.c +++ b/core/arch/arm/proc.c @@ -43,10 +43,14 @@ # error NYI #endif +int num_simd_saved; + /* arch specific proc info */ void proc_init_arch(void) { + num_simd_saved = MCTX_NUM_SIMD_SLOTS; + /* FIXME i#1551: NYI on ARM */ /* all of the CPUID registers are only accessible in privileged modes * so we either need read /proc/cpuinfo or auxiliary vector provided by @@ -86,6 +90,13 @@ proc_fpstate_save_size(void) return DR_FPSTATE_BUF_SIZE; } +DR_API +int +proc_num_simd_saved(void) +{ + return num_simd_saved; +} + DR_API size_t 
proc_save_fpstate(byte *buf) diff --git a/core/arch/clean_call_opt_shared.c b/core/arch/clean_call_opt_shared.c index 1669f0c7803d5d9161f0dc86b636fc4034667911..b8631d001b54b89f1b1678d75a276244bfce930d 100644 --- a/core/arch/clean_call_opt_shared.c +++ b/core/arch/clean_call_opt_shared.c @@ -73,8 +73,8 @@ callee_info_init(callee_info_t *ci) * but then later in analyze_callee_regs_usage, we have to use the loop. */ /* assuming all xmm registers are used */ - ci->num_simd_used = NUM_SIMD_REGS; - for (i = 0; i < NUM_SIMD_REGS; i++) + ci->num_simd_used = MCTX_NUM_SIMD_SLOTS; + for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) ci->simd_used[i] = true; for (i = 0; i < NUM_GP_REGS; i++) ci->reg_used[i] = true; @@ -493,7 +493,7 @@ analyze_clean_call_regs(dcontext_t *dcontext, clean_call_info_t *cci) callee_info_t *info = cci->callee_info; /* 1. xmm registers */ - for (i = 0; i < NUM_SIMD_REGS; i++) { + for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) { if (info->simd_used[i]) { cci->simd_skip[i] = false; } else { @@ -504,7 +504,7 @@ analyze_clean_call_regs(dcontext_t *dcontext, clean_call_info_t *cci) cci->num_simd_skip++; } } - if (INTERNAL_OPTION(opt_cleancall) > 2 && cci->num_simd_skip != NUM_SIMD_REGS) + if (INTERNAL_OPTION(opt_cleancall) > 2 && cci->num_simd_skip != MCTX_NUM_SIMD_SLOTS) cci->should_align = false; /* 2. general purpose registers */ /* set regs not to be saved for clean call */ @@ -646,7 +646,7 @@ analyze_clean_call_inline(dcontext_t *dcontext, clean_call_info_t *cci) } } } - if (cci->num_simd_skip == NUM_SIMD_REGS) { + if (cci->num_simd_skip == MCTX_NUM_SIMD_SLOTS) { STATS_INC(cleancall_simd_skipped); } if (cci->skip_save_flags) { @@ -735,7 +735,7 @@ analyze_clean_call(dcontext_t *dcontext, clean_call_info_t *cci, instr_t *where, * to be saved or if more than GPR_SAVE_TRESHOLD GP registers have to be saved. * XXX: This should probably be in arch-specific clean_call_opt.c. 
*/ - if ((NUM_SIMD_REGS - cci->num_simd_skip) > SIMD_SAVE_TRESHOLD || + if ((MCTX_NUM_SIMD_SLOTS - cci->num_simd_skip) > SIMD_SAVE_TRESHOLD || (NUM_GP_REGS - cci->num_regs_skip) > GPR_SAVE_TRESHOLD || always_out_of_line) cci->out_of_line_swap = true; # endif diff --git a/core/arch/proc.h b/core/arch/proc.h index 0127fee35b8ee6c034b416742ea0d79aebe24bd3..73e03bd30b3ddb6108ba9afc8acf529dc75d3b21 100644 --- a/core/arch/proc.h +++ b/core/arch/proc.h @@ -457,6 +457,34 @@ DR_API size_t proc_fpstate_save_size(void); +DR_API +/** + * Returns number of SIMD registers to be saved. + * + * The number of saved SIMD registers may be variable. For example, we may decide + * to optimize the number of saved registers in a context switch to avoid frequency + * scaling (https://github.com/DynamoRIO/dynamorio/issues/3169). + */ +/* PR 306394: for 32-bit xmm0-7 are caller-saved, and are touched by + * libc routines invoked by DR in some Linux systems (xref i#139), + * so they should be saved in 32-bit Linux. + * + * Xref i#139: + * XMM register preservation will cause extra runtime overhead. + * We test it over 32-bit SPEC2006 on a 64-bit Debian Linux, which shows + * that DR with xmm preservation adds negligible overhead over DR without + * xmm preservation. + * It means xmm preservation would have little performance impact over + * DR base system. This is mainly because DR's own operations' overhead + * is much higher than the context switch overhead. + * However, if a program is running with a DR client which performs many + * clean calls (one or more per basic block), xmm preservation may + * have noticable impacts, i.e. pushing bbs over the max size limit, + * and could have a noticeable performance hit. + */ +int +proc_num_simd_saved(void); + DR_API /** * Saves the floating point state into the buffer \p buf. 
diff --git a/core/arch/x86/clean_call_opt.c b/core/arch/x86/clean_call_opt.c index a3583b191191a8d444e8033cddc4e1be925774e0..8a1a60b90d0a34ca7d518576063517b70fa0a8ae 100644 --- a/core/arch/x86/clean_call_opt.c +++ b/core/arch/x86/clean_call_opt.c @@ -63,7 +63,7 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci) uint i, num_regparm; ci->num_simd_used = 0; - memset(ci->simd_used, 0, sizeof(bool) * NUM_SIMD_REGS); + memset(ci->simd_used, 0, sizeof(bool) * MCTX_NUM_SIMD_SLOTS); memset(ci->reg_used, 0, sizeof(bool) * NUM_GP_REGS); ci->write_flags = false; for (instr = instrlist_first(ilist); instr != NULL; instr = instr_get_next(instr)) { @@ -74,7 +74,7 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci) * impact unless there are a lot of different clean call callees. */ /* XMM registers usage */ - for (i = 0; i < NUM_SIMD_REGS; i++) { + for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) { if (!ci->simd_used[i] && instr_uses_reg(instr, (DR_REG_XMM0 + (reg_id_t)i))) { LOG(THREAD, LOG_CLEANCALL, 2, "CLEANCALL: callee " PFX " uses XMM%d at " PFX "\n", ci->start, i, @@ -605,7 +605,7 @@ insert_inline_reg_save(dcontext_t *dcontext, clean_call_info_t *cci, instrlist_t insert_get_mcontext_base(dcontext, ilist, where, ci->spill_reg); /* Save used registers. 
*/ - ASSERT(cci->num_simd_skip == NUM_SIMD_REGS); + ASSERT(cci->num_simd_skip == MCTX_NUM_SIMD_SLOTS); for (i = 0; i < NUM_GP_REGS; i++) { if (!cci->reg_skip[i]) { reg_id_t reg_id = DR_REG_XAX + (reg_id_t)i; diff --git a/core/arch/x86/emit_utils.c b/core/arch/x86/emit_utils.c index 636418d31b4b8c977528e9a29fbe41d43ef371ed..751c1531c006fe42e63a5bbced130bf6bbe2ddfe 100644 --- a/core/arch/x86/emit_utils.c +++ b/core/arch/x86/emit_utils.c @@ -1337,12 +1337,12 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) int i; uint opcode = move_mm_reg_opcode(true /*align32*/, true /*align16*/); ASSERT(proc_has_feature(FEATURE_SSE)); - for (i = 0; i < NUM_SIMD_SAVED; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { APP(ilist, instr_create_1dst_1src( dcontext, opcode, opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i), OPND_DC_FIELD(absolute, dcontext, OPSZ_SAVED_XMM, - XMM_OFFSET + i * XMM_SAVED_REG_SIZE))); + XMM_OFFSET + i * MCTX_SIMD_SLOT_SIZE))); } } } @@ -1560,12 +1560,13 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) int i; uint opcode = move_mm_reg_opcode(true /*align32*/, true /*align16*/); ASSERT(proc_has_feature(FEATURE_SSE)); - for (i = 0; i < NUM_SIMD_SAVED; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { APP(ilist, - instr_create_1dst_1src(dcontext, opcode, - OPND_DC_FIELD(absolute, dcontext, OPSZ_SAVED_XMM, - XMM_OFFSET + i * XMM_SAVED_REG_SIZE), - opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i))); + instr_create_1dst_1src( + dcontext, opcode, + OPND_DC_FIELD(absolute, dcontext, OPSZ_SAVED_XMM, + XMM_OFFSET + i * MCTX_SIMD_SLOT_SIZE), + opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i))); } } } diff --git a/core/arch/x86/mangle.c b/core/arch/x86/mangle.c index a37c080f46fde825c6ad9d9319280ee626224cc4..cf28aad59df10543f5801df73f3608c7923c8e40 100644 --- a/core/arch/x86/mangle.c +++ b/core/arch/x86/mangle.c @@ -341,8 +341,8 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, int offs_beyond_xmm = 0; if (cci == NULL) 
cci = &default_clean_call_info; - if (cci->preserve_mcontext || cci->num_simd_skip != NUM_SIMD_REGS) { - int offs = XMM_SLOTS_SIZE + PRE_XMM_PADDING; + if (cci->preserve_mcontext || cci->num_simd_skip != MCTX_NUM_SIMD_SLOTS) { + int offs = MCTX_TOTAL_SIMD_SLOTS_SIZE + PRE_XMM_PADDING; if (cci->preserve_mcontext && cci->skip_save_flags) { offs_beyond_xmm = 2 * XSP_SZ; /* pc and flags */ offs += offs_beyond_xmm; @@ -367,20 +367,19 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, */ uint opcode = move_mm_reg_opcode(ALIGNED(alignment, 16), ALIGNED(alignment, 32)); ASSERT(proc_has_feature(FEATURE_SSE)); - for (i = 0; i < NUM_SIMD_SAVED; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { if (!cci->simd_skip[i]) { PRE(ilist, instr, instr_create_1dst_1src( dcontext, opcode, opnd_create_base_disp(REG_XSP, REG_NULL, 0, - PRE_XMM_PADDING + i * XMM_SAVED_REG_SIZE + + PRE_XMM_PADDING + i * MCTX_SIMD_SLOT_SIZE + offs_beyond_xmm, OPSZ_SAVED_XMM), opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i))); } } - ASSERT(i * XMM_SAVED_REG_SIZE == XMM_SAVED_SIZE); - ASSERT(XMM_SAVED_SIZE <= XMM_SLOTS_SIZE); + ASSERT(i * MCTX_SIMD_SLOT_SIZE == MCTX_TOTAL_SIMD_SLOTS_SIZE); } /* pc and aflags */ if (!cci->skip_save_flags) { @@ -509,26 +508,26 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, instrlist * is better. 
*/ uint opcode = move_mm_reg_opcode(ALIGNED(alignment, 32), ALIGNED(alignment, 16)); ASSERT(proc_has_feature(FEATURE_SSE)); - for (i = 0; i < NUM_SIMD_SAVED; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { if (!cci->simd_skip[i]) { PRE(ilist, instr, instr_create_1dst_1src( dcontext, opcode, opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i), opnd_create_base_disp(REG_XSP, REG_NULL, 0, - PRE_XMM_PADDING + i * XMM_SAVED_REG_SIZE + + PRE_XMM_PADDING + i * MCTX_SIMD_SLOT_SIZE + offs_beyond_xmm, OPSZ_SAVED_XMM))); } } - ASSERT(i * XMM_SAVED_REG_SIZE == XMM_SAVED_SIZE); - ASSERT(XMM_SAVED_SIZE <= XMM_SLOTS_SIZE); + ASSERT(i * MCTX_SIMD_SLOT_SIZE == MCTX_TOTAL_SIMD_SLOTS_SIZE); } PRE(ilist, instr, - INSTR_CREATE_lea( - dcontext, opnd_create_reg(REG_XSP), - OPND_CREATE_MEM_lea(REG_XSP, REG_NULL, 0, - PRE_XMM_PADDING + XMM_SLOTS_SIZE + offs_beyond_xmm))); + INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP), + OPND_CREATE_MEM_lea(REG_XSP, REG_NULL, 0, + PRE_XMM_PADDING + + MCTX_TOTAL_SIMD_SLOTS_SIZE + + offs_beyond_xmm))); } reg_id_t diff --git a/core/arch/x86/proc.c b/core/arch/x86/proc.c index a47dcc0a00fcf984cfce010d05eafa08704dcbe0..61846ef2b1ec11e1df7d24cc2501ab825ab66f35 100644 --- a/core/arch/x86/proc.c +++ b/core/arch/x86/proc.c @@ -68,6 +68,9 @@ #define AMD_ECX /* cAMD */ 0x444d4163 static bool avx_enabled; + +int num_simd_saved; + /* global writable variable for debug registers value */ DECLARE_NEVERPROT_VAR(app_pc debugRegister[DEBUG_REGISTERS_NB], { 0 }); @@ -353,6 +356,8 @@ proc_init_arch(void) (!proc_has_feature(FEATURE_FXSR) && !proc_has_feature(FEATURE_SSE)), "Unsupported processor type: SSE and FXSR must match"); + num_simd_saved = MCTX_NUM_SIMD_SLOTS; + if (proc_has_feature(FEATURE_AVX) && proc_has_feature(FEATURE_OSXSAVE)) { /* Even if the processor supports AVX, it will #UD on any AVX instruction * if the OS hasn't enabled YMM and XMM state saving. @@ -419,6 +424,13 @@ proc_fpstate_save_size(void) return (proc_has_feature(FEATURE_FXSR) ? 
512 : 108); } +DR_API +int +proc_num_simd_saved(void) +{ + return num_simd_saved; +} + DR_API size_t proc_save_fpstate(byte *buf) diff --git a/core/arch/x86/x86.asm b/core/arch/x86/x86.asm index adef76677e4c89cecbd0130a90892c83f86c0f93..6d4ae6671b329e24fca02e96dc3009b49e42b4f7 100644 --- a/core/arch/x86/x86.asm +++ b/core/arch/x86/x86.asm @@ -2205,25 +2205,25 @@ GLOBAL_LABEL(get_own_context_helper:) DECLARE_FUNC(get_xmm_caller_saved) GLOBAL_LABEL(get_xmm_caller_saved:) mov REG_XAX, ARG1 - movups [REG_XAX + 0*XMM_SAVED_REG_SIZE], xmm0 - movups [REG_XAX + 1*XMM_SAVED_REG_SIZE], xmm1 - movups [REG_XAX + 2*XMM_SAVED_REG_SIZE], xmm2 - movups [REG_XAX + 3*XMM_SAVED_REG_SIZE], xmm3 - movups [REG_XAX + 4*XMM_SAVED_REG_SIZE], xmm4 - movups [REG_XAX + 5*XMM_SAVED_REG_SIZE], xmm5 + movups [REG_XAX + 0*MCTX_SIMD_SLOT_SIZE], xmm0 + movups [REG_XAX + 1*MCTX_SIMD_SLOT_SIZE], xmm1 + movups [REG_XAX + 2*MCTX_SIMD_SLOT_SIZE], xmm2 + movups [REG_XAX + 3*MCTX_SIMD_SLOT_SIZE], xmm3 + movups [REG_XAX + 4*MCTX_SIMD_SLOT_SIZE], xmm4 + movups [REG_XAX + 5*MCTX_SIMD_SLOT_SIZE], xmm5 #ifdef UNIX - movups [REG_XAX + 6*XMM_SAVED_REG_SIZE], xmm6 - movups [REG_XAX + 7*XMM_SAVED_REG_SIZE], xmm7 + movups [REG_XAX + 6*MCTX_SIMD_SLOT_SIZE], xmm6 + movups [REG_XAX + 7*MCTX_SIMD_SLOT_SIZE], xmm7 #endif #if defined(UNIX) && defined(X64) - movups [REG_XAX + 8*XMM_SAVED_REG_SIZE], xmm8 - movups [REG_XAX + 9*XMM_SAVED_REG_SIZE], xmm9 - movups [REG_XAX + 10*XMM_SAVED_REG_SIZE], xmm10 - movups [REG_XAX + 11*XMM_SAVED_REG_SIZE], xmm11 - movups [REG_XAX + 12*XMM_SAVED_REG_SIZE], xmm12 - movups [REG_XAX + 13*XMM_SAVED_REG_SIZE], xmm13 - movups [REG_XAX + 14*XMM_SAVED_REG_SIZE], xmm14 - movups [REG_XAX + 15*XMM_SAVED_REG_SIZE], xmm15 + movups [REG_XAX + 8*MCTX_SIMD_SLOT_SIZE], xmm8 + movups [REG_XAX + 9*MCTX_SIMD_SLOT_SIZE], xmm9 + movups [REG_XAX + 10*MCTX_SIMD_SLOT_SIZE], xmm10 + movups [REG_XAX + 11*MCTX_SIMD_SLOT_SIZE], xmm11 + movups [REG_XAX + 12*MCTX_SIMD_SLOT_SIZE], xmm12 + movups [REG_XAX + 
13*MCTX_SIMD_SLOT_SIZE], xmm13 + movups [REG_XAX + 14*MCTX_SIMD_SLOT_SIZE], xmm14 + movups [REG_XAX + 15*MCTX_SIMD_SLOT_SIZE], xmm15 #endif ret END_FUNC(get_xmm_caller_saved) diff --git a/core/arch/x86/x86_asm_defines.asm b/core/arch/x86/x86_asm_defines.asm index f7b28773bd306b1f199bc732959a45a7fb4ea104..4bf88b595b2370e10df40702d61ea72c2f5417a6 100644 --- a/core/arch/x86/x86_asm_defines.asm +++ b/core/arch/x86/x86_asm_defines.asm @@ -47,18 +47,19 @@ */ #ifdef X64 # ifdef WINDOWS -# define NUM_SIMD_SLOTS 6 /* xmm0-5 */ +# define MCTX_NUM_SIMD_SLOTS 6 /* xmm0-5 */ # else -# define NUM_SIMD_SLOTS 16 /* xmm0-15 */ +# define MCTX_NUM_SIMD_SLOTS 16 /* xmm0-15 */ # endif # define PRE_XMM_PADDING 16 #else -# define NUM_SIMD_SLOTS 8 /* xmm0-7 */ +# define MCTX_NUM_SIMD_SLOTS 8 /* xmm0-7 */ # define PRE_XMM_PADDING 24 #endif -#define XMM_SAVED_REG_SIZE 32 /* for ymm */ +#define YMM_REG_SIZE 32 +#define MCTX_SIMD_SLOT_SIZE YMM_REG_SIZE /* xmm0-5/7/15 for PR 264138/i#139/PR 302107 */ -#define XMM_SAVED_SIZE ((NUM_SIMD_SLOTS)*(XMM_SAVED_REG_SIZE)) +#define MCTX_TOTAL_SIMD_SLOTS_SIZE ((MCTX_NUM_SIMD_SLOTS)*(YMM_REG_SIZE)) #ifdef X64 /* push GPR registers in priv_mcontext_t order. 
does NOT make xsp have a @@ -103,7 +104,7 @@ pop r13 @N@\ pop r14 @N@\ pop r15 @N@ -# define PRIV_MCXT_SIZE (18*ARG_SZ + PRE_XMM_PADDING + XMM_SAVED_SIZE) +# define PRIV_MCXT_SIZE (18*ARG_SZ + PRE_XMM_PADDING + MCTX_TOTAL_SIMD_SLOTS_SIZE) # define dstack_OFFSET (PRIV_MCXT_SIZE+UPCXT_EXTRA+3*ARG_SZ) # define MCONTEXT_PC_OFFS (17*ARG_SZ) #else @@ -111,7 +112,7 @@ pusha # define POPGPR \ popa -# define PRIV_MCXT_SIZE (10*ARG_SZ + PRE_XMM_PADDING + XMM_SAVED_SIZE) +# define PRIV_MCXT_SIZE (10*ARG_SZ + PRE_XMM_PADDING + MCTX_TOTAL_SIMD_SLOTS_SIZE) # define dstack_OFFSET (PRIV_MCXT_SIZE+UPCXT_EXTRA+3*ARG_SZ) # define MCONTEXT_PC_OFFS (9*ARG_SZ) #endif @@ -119,7 +120,7 @@ #define is_exiting_OFFSET (dstack_OFFSET+1*ARG_SZ) #define PUSHGPR_XSP_OFFS (3*ARG_SZ) #define MCONTEXT_XSP_OFFS (PUSHGPR_XSP_OFFS) -#define PUSH_PRIV_MCXT_PRE_PC_SHIFT (- XMM_SAVED_SIZE - PRE_XMM_PADDING) +#define PUSH_PRIV_MCXT_PRE_PC_SHIFT (- MCTX_TOTAL_SIMD_SLOTS_SIZE - PRE_XMM_PADDING) #if defined(WINDOWS) && !defined(X64) /* FIXME: check these selector values on all platforms: these are for XPSP2. 
diff --git a/core/dynamo.c b/core/dynamo.c index 6dbf9e430d54f98c143da1761616f5e84c3aaf5d..724a11b9f794743516df914ffc8a3123e25febbc 100644 --- a/core/dynamo.c +++ b/core/dynamo.c @@ -1600,7 +1600,8 @@ create_new_dynamo_context(bool initial, byte *dstack_in, priv_mcontext_t *mc) ASSERT(ALIGNED(get_mcontext(dcontext)->ymm, YMM_REG_SIZE)); /* also ensure we don't have extra padding beyond x86.asm defines */ ASSERT(sizeof(priv_mcontext_t) == - IF_X64_ELSE(18, 10) * sizeof(reg_t) + PRE_XMM_PADDING + XMM_SLOTS_SIZE); + IF_X64_ELSE(18, 10) * sizeof(reg_t) + PRE_XMM_PADDING + + MCTX_TOTAL_SIMD_SLOTS_SIZE); #elif defined(ARM) /* FIXME i#1551: add arm alignment check if any */ #endif /* X86/ARM */ diff --git a/core/lib/globals_shared.h b/core/lib/globals_shared.h index 4ec16e51257b972fb57040825fc0672e9c606614..35f1f24a657ed0764c2dd9c5e49245f42419bc87 100644 --- a/core/lib/globals_shared.h +++ b/core/lib/globals_shared.h @@ -1841,11 +1841,11 @@ typedef union _dr_simd_t { } dr_simd_t; # endif # ifdef X64 -# define NUM_SIMD_SLOTS \ +# define MCTX_NUM_SIMD_SLOTS \ 32 /**< Number of 128-bit SIMD Vn slots in dr_mcontext_t \ */ # else -# define NUM_SIMD_SLOTS \ +# define MCTX_NUM_SIMD_SLOTS \ 16 /**< Number of 128-bit SIMD Vn slots in dr_mcontext_t \ */ # endif @@ -1858,7 +1858,7 @@ typedef union _dr_simd_t { # ifdef AVOID_API_EXPORT /* If this is increased, you'll probably need to increase the size of * inject_into_thread's buf and INTERCEPTION_CODE_SIZE (for Windows). - * Also, update NUM_SIMD_SLOTS in x86.asm and get_xmm_caller_saved. + * Also, update MCTX_NUM_SIMD_SLOTS in x86.asm and get_xmm_caller_saved. * i#437: YMM is an extension of XMM from 128-bit to 256-bit without * adding new ones, so code operating on XMM often also operates on YMM, * and thus some *XMM* macros also apply to *YMM*. 
@@ -1867,10 +1867,11 @@ typedef union _dr_simd_t { # ifdef X64 # ifdef WINDOWS /*xmm0-5*/ -# define NUM_SIMD_SLOTS 6 /**< Number of [xy]mm reg slots in dr_mcontext_t */ +# define MCTX_NUM_SIMD_SLOTS \ + 6 /**< Number of [xy]mm reg slots in dr_mcontext_t */ # else /*xmm0-15*/ -# define NUM_SIMD_SLOTS \ +# define MCTX_NUM_SIMD_SLOTS \ 16 /**< Number of [xy]mm reg slots in dr_mcontext_t \ */ # endif @@ -1878,17 +1879,29 @@ typedef union _dr_simd_t { 16 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots */ # else /*xmm0-7*/ -# define NUM_SIMD_SLOTS 8 /**< Number of [xy]mm reg slots in dr_mcontext_t */ +# define MCTX_NUM_SIMD_SLOTS \ + 8 /**< Number of [xy]mm reg slots in dr_mcontext_t \ + */ # define PRE_XMM_PADDING \ 24 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots */ # endif -# define NUM_XMM_SLOTS NUM_SIMD_SLOTS /* for backward compatibility */ - #else # error NYI #endif /* AARCHXX/X86 */ +#ifdef DR_NUM_SIMD_SLOTS_COMPATIBILITY + +# undef NUM_SIMD_SLOTS +/** + * Number of saved SIMD slots in dr_mcontext_t. + */ +# define NUM_SIMD_SLOTS proc_num_simd_saved() + +# define NUM_XMM_SLOTS NUM_SIMD_SLOTS /* for backward compatibility */ + +#endif /* DR_NUM_SIMD_SLOTS_COMPATIBILITY */ + /** Values for the flags field of dr_mcontext_t */ typedef enum { /** @@ -1948,27 +1961,4 @@ typedef struct _priv_mcontext_t { #include "mcxtx.h" } priv_mcontext_t; -/* PR 306394: for 32-bit xmm0-7 are caller-saved, and are touched by - * libc routines invoked by DR in some Linux systems (xref i#139), - * so they should be saved in 32-bit Linux. - */ -/* Xref i#139: - * XMM register preservation will cause extra runtime overhead. - * We test it over 32-bit SPEC2006 on a 64-bit Debian Linux, which shows - * that DR with xmm preservation adds negligible overhead over DR without - * xmm preservation. - * It means xmm preservation would have little performance impact over - * DR base system. 
This is mainly because DR's own operations' overhead - * is much higher than the context switch overhead. - * However, if a program is running with a DR client which performs many - * clean calls (one or more per basic block), xmm preservation may - * have noticable impacts, i.e. pushing bbs over the max size limit, - * and could have a noticeable performance hit. - */ -/* We now save everything but we keep separate NUM_SIMD_SLOTS vs NUM_SIMD_SAVED - * in case we go back to not saving some slots in the future: e.g., w/o - * CLIENT_INTERFACE we could control our own libs enough to avoid some saves. - */ -#define NUM_SIMD_SAVED NUM_SIMD_SLOTS - #endif /* _GLOBALS_SHARED_H_ */ diff --git a/core/lib/instrument.c b/core/lib/instrument.c index 567872b30c50e90bf37f64a6f9ec44044e818d4d..6196c97cec21e5bea4d4f18a078e5cb5bc7cf4bc 100644 --- a/core/lib/instrument.c +++ b/core/lib/instrument.c @@ -5363,7 +5363,7 @@ dr_insert_clean_call_ex_varg(void *drcontext, instrlist_t *ilist, instr_t *where cci.num_simd_skip = 6; #else /* all 8 (or 16) are scratch */ - cci.num_simd_skip = NUM_SIMD_REGS; + cci.num_simd_skip = MCTX_NUM_SIMD_SLOTS; #endif for (i = 0; i < cci.num_simd_skip; i++) cci.simd_skip[i] = true; diff --git a/core/lib/mcxtx.h b/core/lib/mcxtx.h index c5f250a262f39a696bc7bc3c172b2b30dd9b021d..54a705fa0e1e40959851eb903c4d5f1432189e9e 100644 --- a/core/lib/mcxtx.h +++ b/core/lib/mcxtx.h @@ -134,7 +134,7 @@ * all. We do not need anything more than word alignment for OP_vldm/OP_vstm, * and dr_simd_t has no fields larger than 32 bits, so we have no padding. */ - dr_simd_t simd[NUM_SIMD_SLOTS]; + dr_simd_t simd[MCTX_NUM_SIMD_SLOTS]; #else /* X86 */ # ifdef AVOID_API_EXPORT /* FIXME: have special comment syntax instead of bogus ifdef to @@ -245,5 +245,5 @@ * DrMi#665: we now preserve all of the xmm registers. 
*/ # endif - dr_ymm_t ymm[NUM_SIMD_SLOTS]; + dr_ymm_t ymm[MCTX_NUM_SIMD_SLOTS]; #endif /* ARM/X86 */ diff --git a/core/unix/signal_linux_x86.c b/core/unix/signal_linux_x86.c index 0ce6f7493e6deb5f7732e98d8a436a8b30ffa96b..6186c152e67bd569010c7d8444d61c459ee216db 100644 --- a/core/unix/signal_linux_x86.c +++ b/core/unix/signal_linux_x86.c @@ -227,7 +227,7 @@ save_xmm(dcontext_t *dcontext, sigframe_rt_t *frame) dr_xgetbv(&bv_high, &bv_low); xstate->xstate_hdr.xstate_bv = (((uint64)bv_high) << 32) | bv_low; } - for (i = 0; i < NUM_SIMD_SAVED; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { /* we assume no padding */ #ifdef X64 /* __u32 xmm_space[64] */ @@ -379,7 +379,7 @@ dump_fpstate(dcontext_t *dcontext, kernel_fpstate_t *fp) ASSERT(TEST(XCR0_AVX, fp->sw_reserved.xstate_bv)); LOG(THREAD, LOG_ASYNCH, 1, "\txstate_bv = 0x" HEX64_FORMAT_STRING "\n", xstate->xstate_hdr.xstate_bv); - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { LOG(THREAD, LOG_ASYNCH, 1, "\tymmh%d = ", i); for (j = 0; j < 4; j++) { LOG(THREAD, LOG_ASYNCH, 1, "%04x ", @@ -445,7 +445,7 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full) sigcontext_t *sc = sc_full->sc; if (sc->fpstate != NULL) { int i; - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { memcpy(&mc->ymm[i], &sc->fpstate->IF_X64_ELSE(xmm_space[i * 4], _xmm[i]), XMM_REG_SIZE); } @@ -457,7 +457,7 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full) */ ASSERT(sc->fpstate->sw_reserved.extended_size >= sizeof(*xstate)); ASSERT(TEST(XCR0_AVX, sc->fpstate->sw_reserved.xstate_bv)); - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { memcpy(&mc->ymm[i].u32[4], &xstate->ymmh.ymmh_space[i * 4], YMMH_REG_SIZE); } @@ -472,7 +472,7 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc) sigcontext_t *sc = sc_full->sc; if (sc->fpstate != NULL) { int i; - for (i = 0; i 
< NUM_SIMD_SLOTS; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { memcpy(&sc->fpstate->IF_X64_ELSE(xmm_space[i * 4], _xmm[i]), &mc->ymm[i], XMM_REG_SIZE); } @@ -484,7 +484,7 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc) */ ASSERT(sc->fpstate->sw_reserved.extended_size >= sizeof(*xstate)); ASSERT(TEST(XCR0_AVX, sc->fpstate->sw_reserved.xstate_bv)); - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { memcpy(&xstate->ymmh.ymmh_space[i * 4], &mc->ymm[i].u32[4], YMMH_REG_SIZE); } diff --git a/core/unix/signal_macos.c b/core/unix/signal_macos.c index cf38fb8459c0e468f531d7f3c46a61159c250213..c4c20bb6cf18815c0389efb313e019a0685147ba 100644 --- a/core/unix/signal_macos.c +++ b/core/unix/signal_macos.c @@ -153,11 +153,11 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full) */ sigcontext_t *sc = sc_full->sc; int i; - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { memcpy(&mc->ymm[i], &sc->__fs.__fpu_xmm0 + i, XMM_REG_SIZE); } if (YMM_ENABLED()) { - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { memcpy(&mc->ymm[i].u32[4], &sc->__fs.__fpu_ymmh0 + i, YMMH_REG_SIZE); } } @@ -168,11 +168,11 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc) { sigcontext_t *sc = sc_full->sc; int i; - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { memcpy(&sc->__fs.__fpu_xmm0 + i, &mc->ymm[i], XMM_REG_SIZE); } if (YMM_ENABLED()) { - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { memcpy(&sc->__fs.__fpu_ymmh0 + i, &mc->ymm[i].u32[4], YMMH_REG_SIZE); } } @@ -200,7 +200,7 @@ dump_fpstate(dcontext_t *dcontext, sigcontext_t *sc) } LOG(THREAD, LOG_ASYNCH, 1, "\n"); } - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { LOG(THREAD, LOG_ASYNCH, 1, "\txmm%d = ", i); for (j = 0; j < 4; j++) { 
LOG(THREAD, LOG_ASYNCH, 1, "%08x ", @@ -209,7 +209,7 @@ dump_fpstate(dcontext_t *dcontext, sigcontext_t *sc) LOG(THREAD, LOG_ASYNCH, 1, "\n"); } if (YMM_ENABLED()) { - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < proc_num_simd_saved(); i++) { LOG(THREAD, LOG_ASYNCH, 1, "\tymmh%d = ", i); for (j = 0; j < 4; j++) { LOG(THREAD, LOG_ASYNCH, 1, "%08x ", diff --git a/core/win32/callback.c b/core/win32/callback.c index 1308415f03bdbf83eb481d923928f6e0ea494311..ea88abd0edea7203f30b1a2020fe60ff8616aa75 100644 --- a/core/win32/callback.c +++ b/core/win32/callback.c @@ -4715,7 +4715,7 @@ dump_context_info(CONTEXT *context, file_t file, bool all) TESTALL(CONTEXT_XMM_FLAG, context->ContextFlags)) { int i, j; byte *ymmh_area; - for (i = 0; i < NUM_SIMD_SAVED; i++) { + for (i = 0; i < 6; i++) { LOG(file, LOG_ASYNCH, 2, "xmm%d=0x", i); /* This would be simpler if we had uint64 fields in dr_xmm_t but * that complicates our struct layouts */ diff --git a/core/win32/inject.c b/core/win32/inject.c index a8f72103b4af1fc64e90a3ab1b2c1aed84c539f2..fd6e5a641e224637122b954cde29bc209d46592d 100644 --- a/core/win32/inject.c +++ b/core/win32/inject.c @@ -243,7 +243,7 @@ inject_into_thread(HANDLE phandle, CONTEXT *cxt, HANDLE thandle, char *dynamo_pa int i, j; /* For x86, ensure we have ExtendedRegisters space (i#1223) */ IF_NOT_X64(ASSERT(TEST(CONTEXT_XMM_FLAG, cxt->ContextFlags))); - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < MCTX_TOTAL_SIMD_SLOTS_SIZE; i++) { for (j = 0; j < IF_X64_ELSE(2, 4); j++) { *bufptr++ = CXT_XMM(cxt, i)->reg[j]; } @@ -254,7 +254,7 @@ inject_into_thread(HANDLE phandle, CONTEXT *cxt, HANDLE thandle, char *dynamo_pa } } else { /* skip xmm slots */ - bufptr += XMM_SLOTS_SIZE / sizeof(*bufptr); + bufptr += MCTX_TOTAL_SIMD_SLOTS_SIZE / sizeof(*bufptr); } ASSERT((char *)bufptr - (char *)buf == sizeof(priv_mcontext_t)); *bufptr++ = (ptr_uint_t)load_dynamo_code; diff --git a/core/win32/ntdll.c b/core/win32/ntdll.c index 
589b45b075335992b4ac158dc45739206ff73116..7d6f8c554cf92f47a5ecf3f42a34195f86179165 100644 --- a/core/win32/ntdll.c +++ b/core/win32/ntdll.c @@ -1124,7 +1124,7 @@ context_to_mcontext_internal(priv_mcontext_t *mcontext, CONTEXT *cxt) /* no harm done if no sse support */ /* CONTEXT_FLOATING_POINT or CONTEXT_EXTENDED_REGISTERS */ int i; - for (i = 0; i < NUM_SIMD_SLOTS; i++) + for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) memcpy(&mcontext->ymm[i], CXT_XMM(cxt, i), XMM_REG_SIZE); } /* if XSTATE is NOT set, the app has NOT used any ymm state and @@ -1134,7 +1134,7 @@ context_to_mcontext_internal(priv_mcontext_t *mcontext, CONTEXT *cxt) byte *ymmh_area = context_ymmh_saved_area(cxt); if (ymmh_area != NULL) { int i; - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) { memcpy(&mcontext->ymm[i].u32[4], &YMMH_AREA(ymmh_area, i).u32[0], YMMH_REG_SIZE); } @@ -1225,7 +1225,7 @@ mcontext_to_context(CONTEXT *cxt, priv_mcontext_t *mcontext, bool set_cur_seg) memcpy(&cxt->ExtendedRegisters, fpstate, written); # endif /* Now update w/ the xmm values from mcontext */ - for (i = 0; i < NUM_SIMD_SLOTS; i++) + for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) memcpy(CXT_XMM(cxt, i), &mcontext->ymm[i], XMM_REG_SIZE); } if (CONTEXT_PRESERVE_YMM && TESTALL(CONTEXT_XSTATE, cxt->ContextFlags)) { @@ -1255,7 +1255,7 @@ mcontext_to_context(CONTEXT *cxt, priv_mcontext_t *mcontext, bool set_cur_seg) memcpy(&YMMH_AREA(ymmh_area, 6).u32[0], &ymms[0].u32[4], YMMH_REG_SIZE); memcpy(&YMMH_AREA(ymmh_area, 7).u32[0], &ymms[1].u32[4], YMMH_REG_SIZE); # endif - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) { memcpy(&YMMH_AREA(ymmh_area, i).u32[0], &mcontext->ymm[i].u32[4], YMMH_REG_SIZE); } diff --git a/core/win32/syscall.c b/core/win32/syscall.c index bd3f7c738117e2e13a7f4c5ff5fd48fbd734db16..38525dffc004d3f5be0f5bc1eb71b3277d199b9d 100644 --- a/core/win32/syscall.c +++ b/core/win32/syscall.c @@ -3297,13 +3297,15 @@ postsys_GetContextThread(dcontext_t *dcontext, reg_t *param_base, bool success) if
(TESTALL(CONTEXT_XMM_FLAG, cxt->ContextFlags) && preserve_xmm_caller_saved()) { /* PR 264138 */ - memcpy(CXT_XMM(cxt, 0), CXT_XMM(xlate_cxt, 0), XMM_SAVED_SIZE); + memcpy(CXT_XMM(cxt, 0), CXT_XMM(xlate_cxt, 0), + MCTX_TOTAL_SIMD_SLOTS_SIZE); } if (TESTALL(CONTEXT_YMM_FLAG, cxt->ContextFlags) && preserve_xmm_caller_saved()) { byte *ymmh_area = context_ymmh_saved_area(cxt); ASSERT(ymmh_area != NULL); - memcpy(ymmh_area, context_ymmh_saved_area(xlate_cxt), YMMH_SAVED_SIZE); + memcpy(ymmh_area, context_ymmh_saved_area(xlate_cxt), + MCTX_YMMH_SLOTS_SIZE); } } SELF_PROTECT_LOCAL(trec->dcontext, READONLY); diff --git a/make/DynamoRIOConfig.cmake.in b/make/DynamoRIOConfig.cmake.in index 292f84d2658e85e2c64613293545ff59b18adda4..371034f692f7852e81f7918e690aeee5f3210016 100755 --- a/make/DynamoRIOConfig.cmake.in +++ b/make/DynamoRIOConfig.cmake.in @@ -186,6 +186,10 @@ # # set(DynamoRIO_PAGE_SIZE_COMPATIBILITY ON) # +# To request that NUM_SIMD_SLOTS and NUM_XMM_SLOTS be defined set this variable: +# +# set(DynamoRIO_NUM_SIMD_SLOTS_COMPATIBILITY ON) +# # To request a preferred base even when not targeting 64-bit: # (the actual base will use PREFERRED_BASE if set) # @@ -574,6 +578,10 @@ function (DynamoRIO_extra_cflags flags_out extra_cflags tgt_cxx) set(extra_cflags "${extra_cflags} -DDR_PAGE_SIZE_COMPATIBILITY") endif (DynamoRIO_PAGE_SIZE_COMPATIBILITY) + if (DynamoRIO_NUM_SIMD_SLOTS_COMPATIBILITY) + set(extra_cflags "${extra_cflags} -DDR_NUM_SIMD_SLOTS_COMPATIBILITY") + endif (DynamoRIO_NUM_SIMD_SLOTS_COMPATIBILITY) + if (DynamoRIO_LOG_COMPATIBILITY) set(extra_cflags "${extra_cflags} -DDR_LOG_DEFINE_COMPATIBILITY") endif () diff --git a/make/DynamoRIOConfigVersion.cmake.in b/make/DynamoRIOConfigVersion.cmake.in index 837f5b719b94a1564b604601513307a1a1f15fd1..b96bf40461cff7a5cfb18fcd8cf049cbcc6d8bf2 100644 --- a/make/DynamoRIOConfigVersion.cmake.in +++ b/make/DynamoRIOConfigVersion.cmake.in @@ -96,4 +96,8 @@ if (NOT "${PACKAGE_FIND_VERSION_MAJOR}" EQUAL 0) elseif 
("${PACKAGE_FIND_VERSION}" VERSION_LESS "7.0") set(DynamoRIO_LOG_COMPATIBILITY ON PARENT_SCOPE) endif () + # Automatically define NUM_SIMD_SLOTS if client targets older version + if ("${PACKAGE_FIND_VERSION}" VERSION_LESS "7.1") + set(DynamoRIO_NUM_SIMD_SLOTS_COMPATIBILITY ON PARENT_SCOPE) + endif () endif () diff --git a/suite/runsuite_wrapper.pl b/suite/runsuite_wrapper.pl index a8684ff4cf6d9dcdea8fa1105e2712245c5f2488..c11e71230cc33f32ef4be3b82b26882657f3326a 100755 --- a/suite/runsuite_wrapper.pl +++ b/suite/runsuite_wrapper.pl @@ -77,7 +77,7 @@ if ($child) { # that has to be manually downloaded. We thus stick with -V for # Travis. For Appveyor where many devs have no local Visual # Studio we do use -VV so build warning details are visible. - my $verbose = "-V"; + my $verbose = "-VV"; if ($^O eq 'cygwin') { $verbose = "-VV"; # CMake is native Windows so pass it a Windows path. diff --git a/suite/tests/api/opnd-a64.c b/suite/tests/api/opnd-a64.c index 26cc302e16244002ea6d0216f59bee6eb69912cf..5f62c872f8f848221c713203783b8c3b130e3e72 100644 --- a/suite/tests/api/opnd-a64.c +++ b/suite/tests/api/opnd-a64.c @@ -58,7 +58,7 @@ test_get_size() } // Check sizes of FP/SIMD regs. 
- for (uint i = 0; i < NUM_SIMD_SLOTS; i++) { + for (int i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) { ASSERT(reg_get_size((reg_id_t)DR_REG_H0 + i) == OPSZ_2); ASSERT(reg_get_size((reg_id_t)DR_REG_S0 + i) == OPSZ_4); ASSERT(reg_get_size((reg_id_t)DR_REG_D0 + i) == OPSZ_8); diff --git a/suite/tests/client-interface/cleancall-opt-1.dll.c b/suite/tests/client-interface/cleancall-opt-1.dll.c index c0eaea3037d853f361c22153e5899317dfbbc93f..32e293169752602e420316002acd840ca023c27c 100644 --- a/suite/tests/client-interface/cleancall-opt-1.dll.c +++ b/suite/tests/client-interface/cleancall-opt-1.dll.c @@ -125,7 +125,7 @@ event_basic_block(void *dc, void *tag, instrlist_t *bb, bool for_trace, bool tra static instrlist_t * codegen_out_of_line(void *dc) { - uint i; + int i; instrlist_t *ilist = instrlist_create(dc); codegen_prologue(dc, ilist); @@ -138,7 +138,7 @@ codegen_out_of_line(void *dc) } /* FIXME i#1569: FMOV support is NYI on AArch64 */ #ifdef X86 - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) { reg_id_t reg = DR_REG_XMM0 + (reg_id_t)i; APP(ilist, INSTR_CREATE_movd(dc, opnd_create_reg(reg), diff --git a/suite/tests/client-interface/cleancall-opt-shared.h b/suite/tests/client-interface/cleancall-opt-shared.h index 8ceb2fb9a08b76e58c0f87808908426a1e801af6..1e0f2f3d1837f90479b9a9071a83dcd95effe293 100644 --- a/suite/tests/client-interface/cleancall-opt-shared.h +++ b/suite/tests/client-interface/cleancall-opt-shared.h @@ -269,12 +269,12 @@ mcontexts_equal(dr_mcontext_t *mc_a, dr_mcontext_t *mc_b, int func_index) #ifdef X86 /* Only look at the initialized bits of the SSE regs. */ ymm_bytes_used = (proc_has_feature(FEATURE_AVX) ?
32 : 16); - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) { if (memcmp(&mc_a->ymm[i], &mc_b->ymm[i], ymm_bytes_used) != 0) return false; } #elif defined(AARCH64) - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) { if (memcmp(&mc_a->simd[i], &mc_b->simd[i], sizeof(dr_simd_t)) != 0) return false; } @@ -301,7 +301,7 @@ dump_diff_mcontexts(void) } dr_fprintf(STDERR, "Printing XMM regs:\n"); - for (i = 0; i < NUM_SIMD_SLOTS; i++) { + for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) { #ifdef X86 dr_ymm_t before_reg = before_mcontext.ymm[i]; dr_ymm_t after_reg = after_mcontext.ymm[i];