diff --git a/.appveyor.yml b/.appveyor.yml
index 6d9d1651e03e44972fe0646c4b3c42b2675e617e..6abf1962a8734c8925ffb638fffb607cc7d66569 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -53,16 +53,16 @@ branches:
 
 platform: x64
 
-image: Visual Studio 2013
+image: Visual Studio 2015
 
 build:
   verbosity: detailed
 
 # i#2406: Appveyor's global serialization makes it painful to use more than
 # one configuration.  We no longer build packages with VS2010 and are
-# dropping official support for it, meaning we only need to test VS2013 here.
+# dropping official support for it, meaning we only need to test VS2015 here.
 configuration:
-  - 2013
+  - 2015
 
 install:
   ##################################################
@@ -92,7 +92,7 @@ install:
   # XXX i#2145: point at Qt5 for testing drgui build.
 
 before_build:
-  - if "%configuration%"=="2013" call "c:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\vcvarsall.bat" x86
+  - if "%configuration%"=="2015" call "c:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\vcvarsall.bat" x86
   - cd c:\projects\dynamorio
 
 build_script:
@@ -101,7 +101,7 @@ build_script:
   - echo %PATH%
   # The perl in c:\perl can't open a pipe so we use cygwin perl.
   # XXX i#1967: can we pass "package" only when deploying to save time?
-  - c:\cygwin\bin\perl ../suite/runsuite_wrapper.pl travis use_ninja package %EXTRA_ARGS%
+  - c:\cygwin\bin\perl ../suite/runsuite_wrapper.pl -VV travis use_ninja package %EXTRA_ARGS%
 
 # Automated deployment of builds to GitHub Releases.
 # We rely on a Travis cron job to push a tag to the repo which then
diff --git a/api/docs/release.dox b/api/docs/release.dox
index 0ff106a7a6157fc4ca53a1d1cb3979eb80d3efd6..e51b901e001985d701a6f8b09ccebcb832459c09 100644
--- a/api/docs/release.dox
+++ b/api/docs/release.dox
@@ -133,7 +133,17 @@ Dr. Memory Framework (DRMF) in the same package as DynamoRIO.  DRMF
 provides the umbra, drsyscall, and drsymcache Extensions for use by
 clients.
 
-The changes between version \DR_VERSION and 7.1.0 include:
+The changes between version \DR_VERSION and 7.1.0 include the following minor
+compatibility changes:
+
+ - Replaced NUM_SIMD_SLOTS with proc_num_simd_saved() and added the define
+   #MCTX_NUM_SIMD_SLOTS. Clients may set(DynamoRIO_NUM_SIMD_SLOTS_COMPATIBILITY ON)
+   to obtain a NUM_SIMD_SLOTS define that expands to proc_num_simd_saved(). That
+   macro is no longer a constant expression, so code relying on a compile-time
+   constant must be rewritten, as sketched below. The compatibility option is set
+   automatically if clients target version 7.0 or earlier.
+
+Further non-compatibility-affecting changes include:
 
  - Added drfront_set_verbose() to obtain diagnostics from drfrontendlib.
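
To make the NUM_SIMD_SLOTS bullet above concrete, here is a minimal client-side sketch of the migration it describes; the helper name and its use of the x86 ymm field of dr_mcontext_t are assumptions for illustration, not code from this patch. Static storage is sized with the compile-time MCTX_NUM_SIMD_SLOTS, while iteration uses the runtime proc_num_simd_saved() value:

#include "dr_api.h"

/* Hypothetical client helper (x86): copy the SIMD state DR saved for this
 * machine context into caller storage.  Pre-7.1 code could size the array
 * with the old NUM_SIMD_SLOTS constant; the bound now comes from
 * MCTX_NUM_SIMD_SLOTS and the loop count from proc_num_simd_saved(). */
static void
copy_saved_simd(dr_mcontext_t *mc, dr_ymm_t out[MCTX_NUM_SIMD_SLOTS])
{
    int i;
    for (i = 0; i < proc_num_simd_saved(); i++)
        out[i] = mc->ymm[i]; /* dr_ymm_t is a plain union; struct copy is fine */
}

On ARM/AArch64 builds the same pattern applies to the simd field of dr_mcontext_t instead of ymm.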
 
diff --git a/core/arch/aarch64/clean_call_opt.c b/core/arch/aarch64/clean_call_opt.c
index aad25ccdc222dfcc1e94cb66066c6c2df4d77810..fd815fefcd8455929a013d5149631a097d48d255 100644
--- a/core/arch/aarch64/clean_call_opt.c
+++ b/core/arch/aarch64/clean_call_opt.c
@@ -183,7 +183,7 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci)
     /* XXX implement bitset for optimisation */
     memset(ci->reg_used, 0, sizeof(bool) * NUM_GP_REGS);
     ci->num_simd_used = 0;
-    memset(ci->simd_used, 0, sizeof(bool) * NUM_SIMD_REGS);
+    memset(ci->simd_used, 0, sizeof(bool) * MCTX_NUM_SIMD_SLOTS);
     ci->write_flags = false;
 
     num_regparm = MIN(ci->num_args, NUM_REGPARM);
@@ -213,7 +213,7 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci)
         }
 
         /* SIMD register usage */
-        for (i = 0; i < NUM_SIMD_REGS; i++) {
+        for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) {
             if (!ci->simd_used[i] && instr_uses_reg(instr, (DR_REG_Q0 + (reg_id_t)i))) {
                 LOG(THREAD, LOG_CLEANCALL, 2,
                     "CLEANCALL: callee " PFX " uses VREG%d at " PFX "\n", ci->start, i,
diff --git a/core/arch/aarch64/proc.c b/core/arch/aarch64/proc.c
index de2278dc376e4b0202c6f37494f2e956e7a73beb..04f3af61b5a23ca160413566ea396962ea136117 100644
--- a/core/arch/aarch64/proc.c
+++ b/core/arch/aarch64/proc.c
@@ -34,9 +34,13 @@
 #include "proc.h"
 #include "instr.h"
 
+int num_simd_saved;
+
 void
 proc_init_arch(void)
 {
+    num_simd_saved = MCTX_NUM_SIMD_SLOTS;
+
     /* FIXME i#1569: NYI */
 }
 
@@ -61,6 +65,13 @@ proc_fpstate_save_size(void)
     return 0;
 }
 
+DR_API
+int
+proc_num_simd_saved(void)
+{
+    return num_simd_saved;
+}
+
 DR_API
 size_t
 proc_save_fpstate(byte *buf)
diff --git a/core/arch/aarchxx/mangle.c b/core/arch/aarchxx/mangle.c
index 6cea69449d96ff76a199eac00b5ead4fc28eacb9..5255ba617ae6e8b1185492d6e65ff19bf98f8ede 100644
--- a/core/arch/aarchxx/mangle.c
+++ b/core/arch/aarchxx/mangle.c
@@ -409,7 +409,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
 #    endif
     if (cci == NULL)
         cci = &default_clean_call_info;
-    if (cci->preserve_mcontext || cci->num_simd_skip != NUM_SIMD_REGS) {
+    if (cci->preserve_mcontext || cci->num_simd_skip != MCTX_NUM_SIMD_SLOTS) {
         /* FIXME i#1551: once we add skipping of regs, need to keep shape here */
     }
     /* FIXME i#1551: once we have cci->num_simd_skip, skip this if possible */
@@ -523,7 +523,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
     insert_save_registers(dcontext, ilist, instr, cci->simd_skip, DR_REG_X0, DR_REG_Q0,
                           false /* is_gpr */);
 
-    dstack_offs += (NUM_SIMD_SLOTS * sizeof(dr_simd_t));
+    dstack_offs += (MCTX_NUM_SIMD_SLOTS * sizeof(dr_simd_t));
 
     /* Restore the registers we used. */
     /* ldp x0, x1, [sp] */
@@ -544,7 +544,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
     PRE(ilist, instr,
         INSTR_CREATE_vstmdb(dcontext, OPND_CREATE_MEMLIST(DR_REG_SP), SIMD_REG_LIST_LEN,
                             SIMD_REG_LIST_0_15));
-    dstack_offs += NUM_SIMD_SLOTS * sizeof(dr_simd_t);
+    dstack_offs += proc_num_simd_saved() * sizeof(dr_simd_t);
     /* pc and aflags */
     if (cci->skip_save_flags) {
         /* even if we skip flag saves we want to keep mcontext shape */
@@ -635,7 +635,7 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, instrlist
                           opnd_create_reg(DR_REG_SP)));
 
     current_offs =
-        get_clean_call_switch_stack_size() - NUM_SIMD_SLOTS * sizeof(dr_simd_t);
+        get_clean_call_switch_stack_size() - proc_num_simd_saved() * sizeof(dr_simd_t);
 
     /* add x0, x0, current_offs */
     PRE(ilist, instr,
diff --git a/core/arch/arch.c b/core/arch/arch.c
index a25b59d0fd28b723ab2038df80d456892c6d2d5b..9b8389b02bb2b5dd5283f712bb190d060888be5d 100644
--- a/core/arch/arch.c
+++ b/core/arch/arch.c
@@ -3477,7 +3477,7 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml)
 #ifdef X86
     if (preserve_xmm_caller_saved()) {
         int i, j;
-        for (i = 0; i < NUM_SIMD_SAVED; i++) {
+        for (i = 0; i < proc_num_simd_saved(); i++) {
             if (YMM_ENABLED()) {
                 print_file(f, dump_xml ? "\t\tymm%d= \"0x" : "\tymm%d= 0x", i);
                 for (j = 0; j < 8; j++) {
@@ -3505,7 +3505,7 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml)
 #elif defined(ARM)
         {
             int i, j;
-            for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+            for (i = 0; i < proc_num_simd_saved(); i++) {
                 print_file(f, dump_xml ? "\t\tqd= \"0x" : "\tq%-3d= 0x", i);
                 for (j = 0; j < 4; j++) {
                     print_file(f, "%08x ", context->simd[i].u32[j]);
diff --git a/core/arch/arch.h b/core/arch/arch.h
index 51750a07a5ad96ce1b323bf4eba029ee6d378c81..520a5d864018ea4f91ef5e94661aebd807940c17 100644
--- a/core/arch/arch.h
+++ b/core/arch/arch.h
@@ -323,7 +323,6 @@ typedef enum {
 #    define SHARED_GENCODE_MATCH_THREAD(dc) get_shared_gencode(dc)
 #endif
 
-#define NUM_SIMD_REGS NUM_SIMD_SAVED
 #define NUM_GP_REGS DR_NUM_GPR_REGS
 
 /* Information about each individual clean call invocation site.
@@ -339,7 +338,7 @@ typedef struct _clean_call_info_t {
     bool skip_save_flags;
     bool skip_clear_flags;
     uint num_simd_skip;
-    bool simd_skip[NUM_SIMD_REGS];
+    bool simd_skip[MCTX_NUM_SIMD_SLOTS];
     uint num_regs_skip;
     bool reg_skip[NUM_GP_REGS];
     bool preserve_mcontext; /* even if skip reg save, preserve mcontext shape */
@@ -1371,18 +1370,18 @@ typedef struct _slot_t {
 
 /* data structure of clean call callee information. */
 typedef struct _callee_info_t {
-    bool bailout;                  /* if we bail out on function analysis */
-    uint num_args;                 /* number of args that will passed in */
-    int num_instrs;                /* total number of instructions of a function */
-    app_pc start;                  /* entry point of a function  */
-    app_pc bwd_tgt;                /* earliest backward branch target */
-    app_pc fwd_tgt;                /* last forward branch target */
-    int num_simd_used;             /* number of SIMD registers (xmms) used by callee */
-    bool simd_used[NUM_SIMD_REGS]; /* SIMD (xmm/ymm) registers usage */
-    bool reg_used[NUM_GP_REGS];    /* general purpose registers usage */
-    int num_callee_save_regs;      /* number of regs callee saved */
-    bool callee_save_regs[NUM_GP_REGS]; /* callee-save registers */
-    bool has_locals;                    /* if reference local via stack */
+    bool bailout;      /* if we bail out on function analysis */
+    uint num_args;     /* number of args that will be passed in */
+    int num_instrs;    /* total number of instructions of a function */
+    app_pc start;      /* entry point of a function  */
+    app_pc bwd_tgt;    /* earliest backward branch target */
+    app_pc fwd_tgt;    /* last forward branch target */
+    int num_simd_used; /* number of SIMD registers (xmms) used by callee */
+    bool simd_used[MCTX_NUM_SIMD_SLOTS]; /* SIMD (xmm/ymm) registers usage */
+    bool reg_used[NUM_GP_REGS];          /* general purpose registers usage */
+    int num_callee_save_regs;            /* number of regs callee saved */
+    bool callee_save_regs[NUM_GP_REGS];  /* callee-save registers */
+    bool has_locals;                     /* if reference local via stack */
     bool standard_fp;   /* if standard reg (xbp/x29) is used as frame pointer */
     bool opt_inline;    /* can be inlined or not */
     bool write_flags;   /* if the function changes flags */
diff --git a/core/arch/arch_exports.h b/core/arch/arch_exports.h
index a725628b671b5ed55e558bd6b77906a6be18c2cd..6284bd1c3bacc6f3d84c615fb56accc0e67b94cd 100644
--- a/core/arch/arch_exports.h
+++ b/core/arch/arch_exports.h
@@ -58,13 +58,12 @@
  */
 #    define XMM_REG_SIZE 16
 #    define YMM_REG_SIZE 32
-#    define XMM_SAVED_REG_SIZE YMM_REG_SIZE /* space in priv_mcontext_t for xmm/ymm */
-#    define XMM_SLOTS_SIZE (NUM_SIMD_SLOTS * XMM_SAVED_REG_SIZE)
-#    define XMM_SAVED_SIZE (NUM_SIMD_SAVED * XMM_SAVED_REG_SIZE)
+#    define MCTX_SIMD_SLOT_SIZE YMM_REG_SIZE
+#    define MCTX_TOTAL_SIMD_SLOTS_SIZE (MCTX_NUM_SIMD_SLOTS * YMM_REG_SIZE)
 /* Indicates OS support, not just processor support (xref i#1278) */
 #    define YMM_ENABLED() (proc_avx_enabled())
 #    define YMMH_REG_SIZE (YMM_REG_SIZE / 2) /* upper half */
-#    define YMMH_SAVED_SIZE (NUM_SIMD_SLOTS * YMMH_REG_SIZE)
+#    define MCTX_YMMH_SLOTS_SIZE (MCTX_NUM_SIMD_SLOTS * YMMH_REG_SIZE)
 #endif /* X86 */
 
 /* Number of slots for spills from inlined clean calls. */
diff --git a/core/arch/arm/arm.asm b/core/arch/arm/arm.asm
index be3c58962301edb0f5cfd7d4eeb51280c95327dd..026cdcd2de20e8f6ba2e6ae30f02d7c3e13f54c0 100644
--- a/core/arch/arm/arm.asm
+++ b/core/arch/arm/arm.asm
@@ -59,18 +59,18 @@ DECL_EXTERN(initstack_mutex)
 #define is_exiting_OFFSET (dstack_OFFSET+1*ARG_SZ)
 
 #ifdef X64
-# define NUM_SIMD_SLOTS 32
-# define SIMD_REG_SIZE  16
-# define NUM_GPR_SLOTS  33 /* incl flags */
-# define GPR_REG_SIZE    8
+# define MCTX_NUM_SIMD_SLOTS 32
+# define SIMD_REG_SIZE       16
+# define NUM_GPR_SLOTS       33 /* incl flags */
+# define GPR_REG_SIZE         8
 #else
-# define NUM_SIMD_SLOTS 16
-# define SIMD_REG_SIZE  16
-# define NUM_GPR_SLOTS  17 /* incl flags */
-# define GPR_REG_SIZE    4
+# define MCTX_NUM_SIMD_SLOTS 16
+# define SIMD_REG_SIZE       16
+# define NUM_GPR_SLOTS       17 /* incl flags */
+# define GPR_REG_SIZE         4
 #endif
-#define PRE_SIMD_PADDING 0
-#define PRIV_MCXT_SIMD_SIZE (PRE_SIMD_PADDING + NUM_SIMD_SLOTS*SIMD_REG_SIZE)
+#define PRE_SIMD_PADDING     0
+#define PRIV_MCXT_SIMD_SIZE (PRE_SIMD_PADDING + MCTX_NUM_SIMD_SLOTS*SIMD_REG_SIZE)
 #define PRIV_MCXT_SIZE (NUM_GPR_SLOTS*GPR_REG_SIZE + PRIV_MCXT_SIMD_SIZE)
 #define PRIV_MCXT_SP_FROM_SIMD (-(4*GPR_REG_SIZE)) /* flags, pc, lr, then sp */
 #define PRIV_MCXT_PC_FROM_SIMD (-(2*GPR_REG_SIZE)) /* flags, then pc */
diff --git a/core/arch/arm/proc.c b/core/arch/arm/proc.c
index be6854f2e62877f60f0fbc59672ec7c03a5e5a57..60d733be3d82823dfe93f28cb5b4d16b29e3cf6a 100644
--- a/core/arch/arm/proc.c
+++ b/core/arch/arm/proc.c
@@ -43,10 +43,14 @@
 #    error NYI
 #endif
 
+int num_simd_saved;
+
 /* arch specific proc info */
 void
 proc_init_arch(void)
 {
+    num_simd_saved = MCTX_NUM_SIMD_SLOTS;
+
     /* FIXME i#1551: NYI on ARM */
     /* all of the CPUID registers are only accessible in privileged modes
      * so we either need read /proc/cpuinfo or auxiliary vector provided by
@@ -86,6 +90,13 @@ proc_fpstate_save_size(void)
     return DR_FPSTATE_BUF_SIZE;
 }
 
+DR_API
+int
+proc_num_simd_saved(void)
+{
+    return num_simd_saved;
+}
+
 DR_API
 size_t
 proc_save_fpstate(byte *buf)
diff --git a/core/arch/clean_call_opt_shared.c b/core/arch/clean_call_opt_shared.c
index 1669f0c7803d5d9161f0dc86b636fc4034667911..b8631d001b54b89f1b1678d75a276244bfce930d 100644
--- a/core/arch/clean_call_opt_shared.c
+++ b/core/arch/clean_call_opt_shared.c
@@ -73,8 +73,8 @@ callee_info_init(callee_info_t *ci)
      * but then later in analyze_callee_regs_usage, we have to use the loop.
      */
     /* assuming all xmm registers are used */
-    ci->num_simd_used = NUM_SIMD_REGS;
-    for (i = 0; i < NUM_SIMD_REGS; i++)
+    ci->num_simd_used = MCTX_NUM_SIMD_SLOTS;
+    for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++)
         ci->simd_used[i] = true;
     for (i = 0; i < NUM_GP_REGS; i++)
         ci->reg_used[i] = true;
@@ -493,7 +493,7 @@ analyze_clean_call_regs(dcontext_t *dcontext, clean_call_info_t *cci)
     callee_info_t *info = cci->callee_info;
 
     /* 1. xmm registers */
-    for (i = 0; i < NUM_SIMD_REGS; i++) {
+    for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) {
         if (info->simd_used[i]) {
             cci->simd_skip[i] = false;
         } else {
@@ -504,7 +504,7 @@ analyze_clean_call_regs(dcontext_t *dcontext, clean_call_info_t *cci)
             cci->num_simd_skip++;
         }
     }
-    if (INTERNAL_OPTION(opt_cleancall) > 2 && cci->num_simd_skip != NUM_SIMD_REGS)
+    if (INTERNAL_OPTION(opt_cleancall) > 2 && cci->num_simd_skip != MCTX_NUM_SIMD_SLOTS)
         cci->should_align = false;
     /* 2. general purpose registers */
     /* set regs not to be saved for clean call */
@@ -646,7 +646,7 @@ analyze_clean_call_inline(dcontext_t *dcontext, clean_call_info_t *cci)
                 }
             }
         }
-        if (cci->num_simd_skip == NUM_SIMD_REGS) {
+        if (cci->num_simd_skip == MCTX_NUM_SIMD_SLOTS) {
             STATS_INC(cleancall_simd_skipped);
         }
         if (cci->skip_save_flags) {
@@ -735,7 +735,7 @@ analyze_clean_call(dcontext_t *dcontext, clean_call_info_t *cci, instr_t *where,
      * to be saved or if more than GPR_SAVE_TRESHOLD GP registers have to be saved.
      * XXX: This should probably be in arch-specific clean_call_opt.c.
      */
-    if ((NUM_SIMD_REGS - cci->num_simd_skip) > SIMD_SAVE_TRESHOLD ||
+    if ((MCTX_NUM_SIMD_SLOTS - cci->num_simd_skip) > SIMD_SAVE_TRESHOLD ||
         (NUM_GP_REGS - cci->num_regs_skip) > GPR_SAVE_TRESHOLD || always_out_of_line)
         cci->out_of_line_swap = true;
 #    endif
diff --git a/core/arch/proc.h b/core/arch/proc.h
index 0127fee35b8ee6c034b416742ea0d79aebe24bd3..73e03bd30b3ddb6108ba9afc8acf529dc75d3b21 100644
--- a/core/arch/proc.h
+++ b/core/arch/proc.h
@@ -457,6 +457,34 @@ DR_API
 size_t
 proc_fpstate_save_size(void);
 
+DR_API
+/**
+ * Returns the number of SIMD registers to be saved.
+ *
+ * The number of saved SIMD registers may be variable. For example, we may decide
+ * to optimize the number of saved registers in a context switch to avoid frequency
+ * scaling (https://github.com/DynamoRIO/dynamorio/issues/3169).
+ */
+/* PR 306394: for 32-bit xmm0-7 are caller-saved, and are touched by
+ * libc routines invoked by DR in some Linux systems (xref i#139),
+ * so they should be saved in 32-bit Linux.
+ *
+ * Xref i#139:
+ * XMM register preservation will cause extra runtime overhead.
+ * We test it over 32-bit SPEC2006 on a 64-bit Debian Linux, which shows
+ * that DR with xmm preservation adds negligible overhead over DR without
+ * xmm preservation.
+ * It means xmm preservation would have little performance impact over
+ * DR base system. This is mainly because DR's own operations' overhead
+ * is much higher than the context switch overhead.
+ * However, if a program is running with a DR client which performs many
+ * clean calls (one or more per basic block), xmm preservation may
+ * have noticeable impact, e.g. pushing bbs over the max size limit,
+ * and could cause a noticeable performance hit.
+ */
+int
+proc_num_simd_saved(void);
+
 DR_API
 /**
  * Saves the floating point state into the buffer \p buf.
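
The doc comment above is the reason this is an accessor rather than a define: the value is fixed once in proc_init_arch(), so callers never bake in a compile-time count, and a later change could choose to save fewer registers where saving the full SIMD state on every clean-call switch is costly (the i#3169 frequency-scaling concern). A purely hypothetical sketch of such a policy follows; the option name and reduced count are invented for illustration, and this patch itself always initializes the count to MCTX_NUM_SIMD_SLOTS:

/* Hypothetical future policy, NOT part of this patch: choose the saved count
 * once at startup and let proc_num_simd_saved() report it to all callers. */
static int
choose_num_simd_saved(void)
{
    int count = MCTX_NUM_SIMD_SLOTS; /* today: save every mcontext slot */
#ifdef HYPOTHETICAL_REDUCED_SIMD_SAVE /* invented knob, for illustration only */
    /* e.g. save only the caller-saved xmm registers to keep clean-call
     * context switches cheap (xref i#3169). */
    count = 6; /* illustrative value, e.g. xmm0-xmm5 */
#endif
    return count;
}

In this patch, proc_init_arch() simply assigns MCTX_NUM_SIMD_SLOTS to num_simd_saved; the sketch only shows where such a policy would plug in.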
diff --git a/core/arch/x86/clean_call_opt.c b/core/arch/x86/clean_call_opt.c
index a3583b191191a8d444e8033cddc4e1be925774e0..8a1a60b90d0a34ca7d518576063517b70fa0a8ae 100644
--- a/core/arch/x86/clean_call_opt.c
+++ b/core/arch/x86/clean_call_opt.c
@@ -63,7 +63,7 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci)
     uint i, num_regparm;
 
     ci->num_simd_used = 0;
-    memset(ci->simd_used, 0, sizeof(bool) * NUM_SIMD_REGS);
+    memset(ci->simd_used, 0, sizeof(bool) * MCTX_NUM_SIMD_SLOTS);
     memset(ci->reg_used, 0, sizeof(bool) * NUM_GP_REGS);
     ci->write_flags = false;
     for (instr = instrlist_first(ilist); instr != NULL; instr = instr_get_next(instr)) {
@@ -74,7 +74,7 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci)
          * impact unless there are a lot of different clean call callees.
          */
         /* XMM registers usage */
-        for (i = 0; i < NUM_SIMD_REGS; i++) {
+        for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) {
             if (!ci->simd_used[i] && instr_uses_reg(instr, (DR_REG_XMM0 + (reg_id_t)i))) {
                 LOG(THREAD, LOG_CLEANCALL, 2,
                     "CLEANCALL: callee " PFX " uses XMM%d at " PFX "\n", ci->start, i,
@@ -605,7 +605,7 @@ insert_inline_reg_save(dcontext_t *dcontext, clean_call_info_t *cci, instrlist_t
     insert_get_mcontext_base(dcontext, ilist, where, ci->spill_reg);
 
     /* Save used registers. */
-    ASSERT(cci->num_simd_skip == NUM_SIMD_REGS);
+    ASSERT(cci->num_simd_skip == MCTX_NUM_SIMD_SLOTS);
     for (i = 0; i < NUM_GP_REGS; i++) {
         if (!cci->reg_skip[i]) {
             reg_id_t reg_id = DR_REG_XAX + (reg_id_t)i;
diff --git a/core/arch/x86/emit_utils.c b/core/arch/x86/emit_utils.c
index 636418d31b4b8c977528e9a29fbe41d43ef371ed..751c1531c006fe42e63a5bbced130bf6bbe2ddfe 100644
--- a/core/arch/x86/emit_utils.c
+++ b/core/arch/x86/emit_utils.c
@@ -1337,12 +1337,12 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
         int i;
         uint opcode = move_mm_reg_opcode(true /*align32*/, true /*align16*/);
         ASSERT(proc_has_feature(FEATURE_SSE));
-        for (i = 0; i < NUM_SIMD_SAVED; i++) {
+        for (i = 0; i < proc_num_simd_saved(); i++) {
             APP(ilist,
                 instr_create_1dst_1src(
                     dcontext, opcode, opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i),
                     OPND_DC_FIELD(absolute, dcontext, OPSZ_SAVED_XMM,
-                                  XMM_OFFSET + i * XMM_SAVED_REG_SIZE)));
+                                  XMM_OFFSET + i * MCTX_SIMD_SLOT_SIZE)));
         }
     }
 }
@@ -1560,12 +1560,13 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
         int i;
         uint opcode = move_mm_reg_opcode(true /*align32*/, true /*align16*/);
         ASSERT(proc_has_feature(FEATURE_SSE));
-        for (i = 0; i < NUM_SIMD_SAVED; i++) {
+        for (i = 0; i < proc_num_simd_saved(); i++) {
             APP(ilist,
-                instr_create_1dst_1src(dcontext, opcode,
-                                       OPND_DC_FIELD(absolute, dcontext, OPSZ_SAVED_XMM,
-                                                     XMM_OFFSET + i * XMM_SAVED_REG_SIZE),
-                                       opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i)));
+                instr_create_1dst_1src(
+                    dcontext, opcode,
+                    OPND_DC_FIELD(absolute, dcontext, OPSZ_SAVED_XMM,
+                                  XMM_OFFSET + i * MCTX_SIMD_SLOT_SIZE),
+                    opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i)));
         }
     }
 }
diff --git a/core/arch/x86/mangle.c b/core/arch/x86/mangle.c
index a37c080f46fde825c6ad9d9319280ee626224cc4..cf28aad59df10543f5801df73f3608c7923c8e40 100644
--- a/core/arch/x86/mangle.c
+++ b/core/arch/x86/mangle.c
@@ -341,8 +341,8 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
     int offs_beyond_xmm = 0;
     if (cci == NULL)
         cci = &default_clean_call_info;
-    if (cci->preserve_mcontext || cci->num_simd_skip != NUM_SIMD_REGS) {
-        int offs = XMM_SLOTS_SIZE + PRE_XMM_PADDING;
+    if (cci->preserve_mcontext || cci->num_simd_skip != MCTX_NUM_SIMD_SLOTS) {
+        int offs = MCTX_TOTAL_SIMD_SLOTS_SIZE + PRE_XMM_PADDING;
         if (cci->preserve_mcontext && cci->skip_save_flags) {
             offs_beyond_xmm = 2 * XSP_SZ; /* pc and flags */
             offs += offs_beyond_xmm;
@@ -367,20 +367,19 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
          */
         uint opcode = move_mm_reg_opcode(ALIGNED(alignment, 16), ALIGNED(alignment, 32));
         ASSERT(proc_has_feature(FEATURE_SSE));
-        for (i = 0; i < NUM_SIMD_SAVED; i++) {
+        for (i = 0; i < proc_num_simd_saved(); i++) {
             if (!cci->simd_skip[i]) {
                 PRE(ilist, instr,
                     instr_create_1dst_1src(
                         dcontext, opcode,
                         opnd_create_base_disp(REG_XSP, REG_NULL, 0,
-                                              PRE_XMM_PADDING + i * XMM_SAVED_REG_SIZE +
+                                              PRE_XMM_PADDING + i * MCTX_SIMD_SLOT_SIZE +
                                                   offs_beyond_xmm,
                                               OPSZ_SAVED_XMM),
                         opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i)));
             }
         }
-        ASSERT(i * XMM_SAVED_REG_SIZE == XMM_SAVED_SIZE);
-        ASSERT(XMM_SAVED_SIZE <= XMM_SLOTS_SIZE);
+        ASSERT(i * MCTX_SIMD_SLOT_SIZE == MCTX_TOTAL_SIMD_SLOTS_SIZE);
     }
     /* pc and aflags */
     if (!cci->skip_save_flags) {
@@ -509,26 +508,26 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, instrlist
          * is better. */
         uint opcode = move_mm_reg_opcode(ALIGNED(alignment, 32), ALIGNED(alignment, 16));
         ASSERT(proc_has_feature(FEATURE_SSE));
-        for (i = 0; i < NUM_SIMD_SAVED; i++) {
+        for (i = 0; i < proc_num_simd_saved(); i++) {
             if (!cci->simd_skip[i]) {
                 PRE(ilist, instr,
                     instr_create_1dst_1src(
                         dcontext, opcode, opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i),
                         opnd_create_base_disp(REG_XSP, REG_NULL, 0,
-                                              PRE_XMM_PADDING + i * XMM_SAVED_REG_SIZE +
+                                              PRE_XMM_PADDING + i * MCTX_SIMD_SLOT_SIZE +
                                                   offs_beyond_xmm,
                                               OPSZ_SAVED_XMM)));
             }
         }
-        ASSERT(i * XMM_SAVED_REG_SIZE == XMM_SAVED_SIZE);
-        ASSERT(XMM_SAVED_SIZE <= XMM_SLOTS_SIZE);
+        ASSERT(i * MCTX_SIMD_SLOT_SIZE == MCTX_TOTAL_SIMD_SLOTS_SIZE);
     }
 
     PRE(ilist, instr,
-        INSTR_CREATE_lea(
-            dcontext, opnd_create_reg(REG_XSP),
-            OPND_CREATE_MEM_lea(REG_XSP, REG_NULL, 0,
-                                PRE_XMM_PADDING + XMM_SLOTS_SIZE + offs_beyond_xmm)));
+        INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_XSP),
+                         OPND_CREATE_MEM_lea(REG_XSP, REG_NULL, 0,
+                                             PRE_XMM_PADDING +
+                                                 MCTX_TOTAL_SIMD_SLOTS_SIZE +
+                                                 offs_beyond_xmm)));
 }
 
 reg_id_t
diff --git a/core/arch/x86/proc.c b/core/arch/x86/proc.c
index a47dcc0a00fcf984cfce010d05eafa08704dcbe0..61846ef2b1ec11e1df7d24cc2501ab825ab66f35 100644
--- a/core/arch/x86/proc.c
+++ b/core/arch/x86/proc.c
@@ -68,6 +68,9 @@
 #define AMD_ECX /* cAMD */ 0x444d4163
 
 static bool avx_enabled;
+
+int num_simd_saved;
+
 /* global writable variable for debug registers value */
 DECLARE_NEVERPROT_VAR(app_pc debugRegister[DEBUG_REGISTERS_NB], { 0 });
 
@@ -353,6 +356,8 @@ proc_init_arch(void)
                       (!proc_has_feature(FEATURE_FXSR) && !proc_has_feature(FEATURE_SSE)),
                   "Unsupported processor type: SSE and FXSR must match");
 
+    num_simd_saved = MCTX_NUM_SIMD_SLOTS;
+
     if (proc_has_feature(FEATURE_AVX) && proc_has_feature(FEATURE_OSXSAVE)) {
         /* Even if the processor supports AVX, it will #UD on any AVX instruction
          * if the OS hasn't enabled YMM and XMM state saving.
@@ -419,6 +424,13 @@ proc_fpstate_save_size(void)
     return (proc_has_feature(FEATURE_FXSR) ? 512 : 108);
 }
 
+DR_API
+int
+proc_num_simd_saved(void)
+{
+    return num_simd_saved;
+}
+
 DR_API
 size_t
 proc_save_fpstate(byte *buf)
diff --git a/core/arch/x86/x86.asm b/core/arch/x86/x86.asm
index adef76677e4c89cecbd0130a90892c83f86c0f93..6d4ae6671b329e24fca02e96dc3009b49e42b4f7 100644
--- a/core/arch/x86/x86.asm
+++ b/core/arch/x86/x86.asm
@@ -2205,25 +2205,25 @@ GLOBAL_LABEL(get_own_context_helper:)
         DECLARE_FUNC(get_xmm_caller_saved)
 GLOBAL_LABEL(get_xmm_caller_saved:)
         mov      REG_XAX, ARG1
-        movups   [REG_XAX + 0*XMM_SAVED_REG_SIZE], xmm0
-        movups   [REG_XAX + 1*XMM_SAVED_REG_SIZE], xmm1
-        movups   [REG_XAX + 2*XMM_SAVED_REG_SIZE], xmm2
-        movups   [REG_XAX + 3*XMM_SAVED_REG_SIZE], xmm3
-        movups   [REG_XAX + 4*XMM_SAVED_REG_SIZE], xmm4
-        movups   [REG_XAX + 5*XMM_SAVED_REG_SIZE], xmm5
+        movups   [REG_XAX + 0*MCTX_SIMD_SLOT_SIZE], xmm0
+        movups   [REG_XAX + 1*MCTX_SIMD_SLOT_SIZE], xmm1
+        movups   [REG_XAX + 2*MCTX_SIMD_SLOT_SIZE], xmm2
+        movups   [REG_XAX + 3*MCTX_SIMD_SLOT_SIZE], xmm3
+        movups   [REG_XAX + 4*MCTX_SIMD_SLOT_SIZE], xmm4
+        movups   [REG_XAX + 5*MCTX_SIMD_SLOT_SIZE], xmm5
 #ifdef UNIX
-        movups   [REG_XAX + 6*XMM_SAVED_REG_SIZE], xmm6
-        movups   [REG_XAX + 7*XMM_SAVED_REG_SIZE], xmm7
+        movups   [REG_XAX + 6*MCTX_SIMD_SLOT_SIZE], xmm6
+        movups   [REG_XAX + 7*MCTX_SIMD_SLOT_SIZE], xmm7
 #endif
 #if defined(UNIX) && defined(X64)
-        movups   [REG_XAX + 8*XMM_SAVED_REG_SIZE], xmm8
-        movups   [REG_XAX + 9*XMM_SAVED_REG_SIZE], xmm9
-        movups   [REG_XAX + 10*XMM_SAVED_REG_SIZE], xmm10
-        movups   [REG_XAX + 11*XMM_SAVED_REG_SIZE], xmm11
-        movups   [REG_XAX + 12*XMM_SAVED_REG_SIZE], xmm12
-        movups   [REG_XAX + 13*XMM_SAVED_REG_SIZE], xmm13
-        movups   [REG_XAX + 14*XMM_SAVED_REG_SIZE], xmm14
-        movups   [REG_XAX + 15*XMM_SAVED_REG_SIZE], xmm15
+        movups   [REG_XAX + 8*MCTX_SIMD_SLOT_SIZE], xmm8
+        movups   [REG_XAX + 9*MCTX_SIMD_SLOT_SIZE], xmm9
+        movups   [REG_XAX + 10*MCTX_SIMD_SLOT_SIZE], xmm10
+        movups   [REG_XAX + 11*MCTX_SIMD_SLOT_SIZE], xmm11
+        movups   [REG_XAX + 12*MCTX_SIMD_SLOT_SIZE], xmm12
+        movups   [REG_XAX + 13*MCTX_SIMD_SLOT_SIZE], xmm13
+        movups   [REG_XAX + 14*MCTX_SIMD_SLOT_SIZE], xmm14
+        movups   [REG_XAX + 15*MCTX_SIMD_SLOT_SIZE], xmm15
 #endif
         ret
         END_FUNC(get_xmm_caller_saved)
diff --git a/core/arch/x86/x86_asm_defines.asm b/core/arch/x86/x86_asm_defines.asm
index f7b28773bd306b1f199bc732959a45a7fb4ea104..4bf88b595b2370e10df40702d61ea72c2f5417a6 100644
--- a/core/arch/x86/x86_asm_defines.asm
+++ b/core/arch/x86/x86_asm_defines.asm
@@ -47,18 +47,19 @@
  */
 #ifdef X64
 # ifdef WINDOWS
-#  define NUM_SIMD_SLOTS 6 /* xmm0-5 */
+#  define MCTX_NUM_SIMD_SLOTS 6 /* xmm0-5 */
 # else
-#  define NUM_SIMD_SLOTS 16 /* xmm0-15 */
+#  define MCTX_NUM_SIMD_SLOTS 16 /* xmm0-15 */
 # endif
 # define PRE_XMM_PADDING 16
 #else
-# define NUM_SIMD_SLOTS 8 /* xmm0-7 */
+# define MCTX_NUM_SIMD_SLOTS 8 /* xmm0-7 */
 # define PRE_XMM_PADDING 24
 #endif
-#define XMM_SAVED_REG_SIZE 32 /* for ymm */
+#define YMM_REG_SIZE 32
+#define MCTX_SIMD_SLOT_SIZE YMM_REG_SIZE
 /* xmm0-5/7/15 for PR 264138/i#139/PR 302107 */
-#define XMM_SAVED_SIZE ((NUM_SIMD_SLOTS)*(XMM_SAVED_REG_SIZE))
+#define MCTX_TOTAL_SIMD_SLOTS_SIZE ((MCTX_NUM_SIMD_SLOTS)*(YMM_REG_SIZE))
 
 #ifdef X64
 /* push GPR registers in priv_mcontext_t order.  does NOT make xsp have a
@@ -103,7 +104,7 @@
         pop      r13 @N@\
         pop      r14 @N@\
         pop      r15 @N@
-# define PRIV_MCXT_SIZE (18*ARG_SZ + PRE_XMM_PADDING + XMM_SAVED_SIZE)
+# define PRIV_MCXT_SIZE (18*ARG_SZ + PRE_XMM_PADDING + MCTX_TOTAL_SIMD_SLOTS_SIZE)
 # define dstack_OFFSET     (PRIV_MCXT_SIZE+UPCXT_EXTRA+3*ARG_SZ)
 # define MCONTEXT_PC_OFFS  (17*ARG_SZ)
 #else
@@ -111,7 +112,7 @@
         pusha
 # define POPGPR  \
         popa
-# define PRIV_MCXT_SIZE (10*ARG_SZ + PRE_XMM_PADDING + XMM_SAVED_SIZE)
+# define PRIV_MCXT_SIZE (10*ARG_SZ + PRE_XMM_PADDING + MCTX_TOTAL_SIMD_SLOTS_SIZE)
 # define dstack_OFFSET     (PRIV_MCXT_SIZE+UPCXT_EXTRA+3*ARG_SZ)
 # define MCONTEXT_PC_OFFS  (9*ARG_SZ)
 #endif
@@ -119,7 +120,7 @@
 #define is_exiting_OFFSET (dstack_OFFSET+1*ARG_SZ)
 #define PUSHGPR_XSP_OFFS  (3*ARG_SZ)
 #define MCONTEXT_XSP_OFFS (PUSHGPR_XSP_OFFS)
-#define PUSH_PRIV_MCXT_PRE_PC_SHIFT (- XMM_SAVED_SIZE - PRE_XMM_PADDING)
+#define PUSH_PRIV_MCXT_PRE_PC_SHIFT (- MCTX_TOTAL_SIMD_SLOTS_SIZE - PRE_XMM_PADDING)
 
 #if defined(WINDOWS) && !defined(X64)
 /* FIXME: check these selector values on all platforms: these are for XPSP2.
diff --git a/core/dynamo.c b/core/dynamo.c
index 6dbf9e430d54f98c143da1761616f5e84c3aaf5d..724a11b9f794743516df914ffc8a3123e25febbc 100644
--- a/core/dynamo.c
+++ b/core/dynamo.c
@@ -1600,7 +1600,8 @@ create_new_dynamo_context(bool initial, byte *dstack_in, priv_mcontext_t *mc)
     ASSERT(ALIGNED(get_mcontext(dcontext)->ymm, YMM_REG_SIZE));
     /* also ensure we don't have extra padding beyond x86.asm defines */
     ASSERT(sizeof(priv_mcontext_t) ==
-           IF_X64_ELSE(18, 10) * sizeof(reg_t) + PRE_XMM_PADDING + XMM_SLOTS_SIZE);
+           IF_X64_ELSE(18, 10) * sizeof(reg_t) + PRE_XMM_PADDING +
+               MCTX_TOTAL_SIMD_SLOTS_SIZE);
 #elif defined(ARM)
     /* FIXME i#1551: add arm alignment check if any */
 #endif /* X86/ARM */
diff --git a/core/lib/globals_shared.h b/core/lib/globals_shared.h
index 4ec16e51257b972fb57040825fc0672e9c606614..35f1f24a657ed0764c2dd9c5e49245f42419bc87 100644
--- a/core/lib/globals_shared.h
+++ b/core/lib/globals_shared.h
@@ -1841,11 +1841,11 @@ typedef union _dr_simd_t {
 } dr_simd_t;
 #    endif
 #    ifdef X64
-#        define NUM_SIMD_SLOTS                                       \
+#        define MCTX_NUM_SIMD_SLOTS                                  \
             32 /**< Number of 128-bit SIMD Vn slots in dr_mcontext_t \
                 */
 #    else
-#        define NUM_SIMD_SLOTS                                       \
+#        define MCTX_NUM_SIMD_SLOTS                                  \
             16 /**< Number of 128-bit SIMD Vn slots in dr_mcontext_t \
                 */
 #    endif
@@ -1858,7 +1858,7 @@ typedef union _dr_simd_t {
 #    ifdef AVOID_API_EXPORT
 /* If this is increased, you'll probably need to increase the size of
  * inject_into_thread's buf and INTERCEPTION_CODE_SIZE (for Windows).
- * Also, update NUM_SIMD_SLOTS in x86.asm and get_xmm_caller_saved.
+ * Also, update MCTX_NUM_SIMD_SLOTS in x86.asm and get_xmm_caller_saved.
  * i#437: YMM is an extension of XMM from 128-bit to 256-bit without
  * adding new ones, so code operating on XMM often also operates on YMM,
  * and thus some *XMM* macros also apply to *YMM*.
@@ -1867,10 +1867,11 @@ typedef union _dr_simd_t {
 #    ifdef X64
 #        ifdef WINDOWS
 /*xmm0-5*/
-#            define NUM_SIMD_SLOTS 6 /**< Number of [xy]mm reg slots in dr_mcontext_t */
+#            define MCTX_NUM_SIMD_SLOTS \
+                6 /**< Number of [xy]mm reg slots in dr_mcontext_t */
 #        else
 /*xmm0-15*/
-#            define NUM_SIMD_SLOTS                                  \
+#            define MCTX_NUM_SIMD_SLOTS                             \
                 16 /**< Number of [xy]mm reg slots in dr_mcontext_t \
                     */
 #        endif
@@ -1878,17 +1879,29 @@ typedef union _dr_simd_t {
             16 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots */
 #    else
 /*xmm0-7*/
-#        define NUM_SIMD_SLOTS 8 /**< Number of [xy]mm reg slots in dr_mcontext_t */
+#        define MCTX_NUM_SIMD_SLOTS                            \
+            8 /**< Number of [xy]mm reg slots in dr_mcontext_t \
+               */
 #        define PRE_XMM_PADDING \
             24 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots */
 #    endif
 
-#    define NUM_XMM_SLOTS NUM_SIMD_SLOTS /* for backward compatibility */
-
 #else
 #    error NYI
 #endif /* AARCHXX/X86 */
 
+#ifdef DR_NUM_SIMD_SLOTS_COMPATIBILITY
+
+#    undef NUM_SIMD_SLOTS
+/**
+ * The number of SIMD register slots saved in dr_mcontext_t.  Provided for
+ * backward compatibility; note that this is no longer a constant expression.
+ */
+#    define NUM_SIMD_SLOTS proc_num_simd_saved()
+
+#    define NUM_XMM_SLOTS NUM_SIMD_SLOTS /* for backward compatibility */
+
+#endif /* DR_NUM_SIMD_SLOTS_COMPATIBILITY */
+
 /** Values for the flags field of dr_mcontext_t */
 typedef enum {
     /**
@@ -1948,27 +1961,4 @@ typedef struct _priv_mcontext_t {
 #include "mcxtx.h"
 } priv_mcontext_t;
 
-/* PR 306394: for 32-bit xmm0-7 are caller-saved, and are touched by
- * libc routines invoked by DR in some Linux systems (xref i#139),
- * so they should be saved in 32-bit Linux.
- */
-/* Xref i#139:
- * XMM register preservation will cause extra runtime overhead.
- * We test it over 32-bit SPEC2006 on a 64-bit Debian Linux, which shows
- * that DR with xmm preservation adds negligible overhead over DR without
- * xmm preservation.
- * It means xmm preservation would have little performance impact over
- * DR base system. This is mainly because DR's own operations' overhead
- * is much higher than the context switch overhead.
- * However, if a program is running with a DR client which performs many
- * clean calls (one or more per basic block), xmm preservation may
- * have noticable impacts, i.e. pushing bbs over the max size limit,
- * and could have a noticeable performance hit.
- */
-/* We now save everything but we keep separate NUM_SIMD_SLOTS vs NUM_SIMD_SAVED
- * in case we go back to not saving some slots in the future: e.g., w/o
- * CLIENT_INTERFACE we could control our own libs enough to avoid some saves.
- */
-#define NUM_SIMD_SAVED NUM_SIMD_SLOTS
-
 #endif /* _GLOBALS_SHARED_H_ */
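
To show what the DR_NUM_SIMD_SLOTS_COMPATIBILITY branch added above buys a legacy client, here is a hedged sketch; the routine is invented for illustration and is not part of this patch. Run-time uses of NUM_SIMD_SLOTS keep compiling because the define now expands to proc_num_simd_saved(), while uses that need a constant expression must be rewritten against MCTX_NUM_SIMD_SLOTS:

#include <string.h>
#include "dr_api.h"

/* Illustrative legacy-client routine, built with
 * -DDR_NUM_SIMD_SLOTS_COMPATIBILITY (e.g. via the CMake variable
 * DynamoRIO_NUM_SIMD_SLOTS_COMPATIBILITY). */
static void
report_simd_slots(void)
{
    /* dr_ymm_t saved[NUM_SIMD_SLOTS];   <-- no longer builds: not a constant */
    dr_ymm_t saved[MCTX_NUM_SIMD_SLOTS]; /* required rewrite for array sizing */
    memset(saved, 0, sizeof(saved));
    /* Run-time use is unchanged: NUM_SIMD_SLOTS now calls proc_num_simd_saved(). */
    dr_printf("DR saves %d SIMD registers\n", NUM_SIMD_SLOTS);
}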
diff --git a/core/lib/instrument.c b/core/lib/instrument.c
index 567872b30c50e90bf37f64a6f9ec44044e818d4d..6196c97cec21e5bea4d4f18a078e5cb5bc7cf4bc 100644
--- a/core/lib/instrument.c
+++ b/core/lib/instrument.c
@@ -5363,7 +5363,7 @@ dr_insert_clean_call_ex_varg(void *drcontext, instrlist_t *ilist, instr_t *where
         cci.num_simd_skip = 6;
 #else
         /* all 8 (or 16) are scratch */
-        cci.num_simd_skip = NUM_SIMD_REGS;
+        cci.num_simd_skip = MCTX_NUM_SIMD_SLOTS;
 #endif
         for (i = 0; i < cci.num_simd_skip; i++)
             cci.simd_skip[i] = true;
diff --git a/core/lib/mcxtx.h b/core/lib/mcxtx.h
index c5f250a262f39a696bc7bc3c172b2b30dd9b021d..54a705fa0e1e40959851eb903c4d5f1432189e9e 100644
--- a/core/lib/mcxtx.h
+++ b/core/lib/mcxtx.h
@@ -134,7 +134,7 @@
      * all.  We do not need anything more than word alignment for OP_vldm/OP_vstm,
      * and dr_simd_t has no fields larger than 32 bits, so we have no padding.
      */
-    dr_simd_t simd[NUM_SIMD_SLOTS];
+    dr_simd_t simd[MCTX_NUM_SIMD_SLOTS];
 #else /* X86 */
 #    ifdef AVOID_API_EXPORT
     /* FIXME: have special comment syntax instead of bogus ifdef to
@@ -245,5 +245,5 @@
      * DrMi#665: we now preserve all of the xmm registers.
      */
 #    endif
-    dr_ymm_t ymm[NUM_SIMD_SLOTS];
+    dr_ymm_t ymm[MCTX_NUM_SIMD_SLOTS];
 #endif /* ARM/X86 */
diff --git a/core/unix/signal_linux_x86.c b/core/unix/signal_linux_x86.c
index 0ce6f7493e6deb5f7732e98d8a436a8b30ffa96b..6186c152e67bd569010c7d8444d61c459ee216db 100644
--- a/core/unix/signal_linux_x86.c
+++ b/core/unix/signal_linux_x86.c
@@ -227,7 +227,7 @@ save_xmm(dcontext_t *dcontext, sigframe_rt_t *frame)
         dr_xgetbv(&bv_high, &bv_low);
         xstate->xstate_hdr.xstate_bv = (((uint64)bv_high) << 32) | bv_low;
     }
-    for (i = 0; i < NUM_SIMD_SAVED; i++) {
+    for (i = 0; i < proc_num_simd_saved(); i++) {
         /* we assume no padding */
 #ifdef X64
         /* __u32 xmm_space[64] */
@@ -379,7 +379,7 @@ dump_fpstate(dcontext_t *dcontext, kernel_fpstate_t *fp)
             ASSERT(TEST(XCR0_AVX, fp->sw_reserved.xstate_bv));
             LOG(THREAD, LOG_ASYNCH, 1, "\txstate_bv = 0x" HEX64_FORMAT_STRING "\n",
                 xstate->xstate_hdr.xstate_bv);
-            for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+            for (i = 0; i < proc_num_simd_saved(); i++) {
                 LOG(THREAD, LOG_ASYNCH, 1, "\tymmh%d = ", i);
                 for (j = 0; j < 4; j++) {
                     LOG(THREAD, LOG_ASYNCH, 1, "%04x ",
@@ -445,7 +445,7 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full)
     sigcontext_t *sc = sc_full->sc;
     if (sc->fpstate != NULL) {
         int i;
-        for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+        for (i = 0; i < proc_num_simd_saved(); i++) {
             memcpy(&mc->ymm[i], &sc->fpstate->IF_X64_ELSE(xmm_space[i * 4], _xmm[i]),
                    XMM_REG_SIZE);
         }
@@ -457,7 +457,7 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full)
                  */
                 ASSERT(sc->fpstate->sw_reserved.extended_size >= sizeof(*xstate));
                 ASSERT(TEST(XCR0_AVX, sc->fpstate->sw_reserved.xstate_bv));
-                for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+                for (i = 0; i < proc_num_simd_saved(); i++) {
                     memcpy(&mc->ymm[i].u32[4], &xstate->ymmh.ymmh_space[i * 4],
                            YMMH_REG_SIZE);
                 }
@@ -472,7 +472,7 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc)
     sigcontext_t *sc = sc_full->sc;
     if (sc->fpstate != NULL) {
         int i;
-        for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+        for (i = 0; i < proc_num_simd_saved(); i++) {
             memcpy(&sc->fpstate->IF_X64_ELSE(xmm_space[i * 4], _xmm[i]), &mc->ymm[i],
                    XMM_REG_SIZE);
         }
@@ -484,7 +484,7 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc)
                  */
                 ASSERT(sc->fpstate->sw_reserved.extended_size >= sizeof(*xstate));
                 ASSERT(TEST(XCR0_AVX, sc->fpstate->sw_reserved.xstate_bv));
-                for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+                for (i = 0; i < proc_num_simd_saved(); i++) {
                     memcpy(&xstate->ymmh.ymmh_space[i * 4], &mc->ymm[i].u32[4],
                            YMMH_REG_SIZE);
                 }
diff --git a/core/unix/signal_macos.c b/core/unix/signal_macos.c
index cf38fb8459c0e468f531d7f3c46a61159c250213..c4c20bb6cf18815c0389efb313e019a0685147ba 100644
--- a/core/unix/signal_macos.c
+++ b/core/unix/signal_macos.c
@@ -153,11 +153,11 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full)
      */
     sigcontext_t *sc = sc_full->sc;
     int i;
-    for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+    for (i = 0; i < proc_num_simd_saved(); i++) {
         memcpy(&mc->ymm[i], &sc->__fs.__fpu_xmm0 + i, XMM_REG_SIZE);
     }
     if (YMM_ENABLED()) {
-        for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+        for (i = 0; i < proc_num_simd_saved(); i++) {
             memcpy(&mc->ymm[i].u32[4], &sc->__fs.__fpu_ymmh0 + i, YMMH_REG_SIZE);
         }
     }
@@ -168,11 +168,11 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc)
 {
     sigcontext_t *sc = sc_full->sc;
     int i;
-    for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+    for (i = 0; i < proc_num_simd_saved(); i++) {
         memcpy(&sc->__fs.__fpu_xmm0 + i, &mc->ymm[i], XMM_REG_SIZE);
     }
     if (YMM_ENABLED()) {
-        for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+        for (i = 0; i < proc_num_simd_saved(); i++) {
             memcpy(&sc->__fs.__fpu_ymmh0 + i, &mc->ymm[i].u32[4], YMMH_REG_SIZE);
         }
     }
@@ -200,7 +200,7 @@ dump_fpstate(dcontext_t *dcontext, sigcontext_t *sc)
         }
         LOG(THREAD, LOG_ASYNCH, 1, "\n");
     }
-    for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+    for (i = 0; i < proc_num_simd_saved(); i++) {
         LOG(THREAD, LOG_ASYNCH, 1, "\txmm%d = ", i);
         for (j = 0; j < 4; j++) {
             LOG(THREAD, LOG_ASYNCH, 1, "%08x ",
@@ -209,7 +209,7 @@ dump_fpstate(dcontext_t *dcontext, sigcontext_t *sc)
         LOG(THREAD, LOG_ASYNCH, 1, "\n");
     }
     if (YMM_ENABLED()) {
-        for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+        for (i = 0; i < proc_num_simd_saved(); i++) {
             LOG(THREAD, LOG_ASYNCH, 1, "\tymmh%d = ", i);
             for (j = 0; j < 4; j++) {
                 LOG(THREAD, LOG_ASYNCH, 1, "%08x ",
diff --git a/core/win32/callback.c b/core/win32/callback.c
index 1308415f03bdbf83eb481d923928f6e0ea494311..ea88abd0edea7203f30b1a2020fe60ff8616aa75 100644
--- a/core/win32/callback.c
+++ b/core/win32/callback.c
@@ -4715,7 +4715,7 @@ dump_context_info(CONTEXT *context, file_t file, bool all)
         TESTALL(CONTEXT_XMM_FLAG, context->ContextFlags)) {
         int i, j;
         byte *ymmh_area;
-        for (i = 0; i < NUM_SIMD_SAVED; i++) {
+        for (i = 0; i < proc_num_simd_saved(); i++) {
             LOG(file, LOG_ASYNCH, 2, "xmm%d=0x", i);
             /* This would be simpler if we had uint64 fields in dr_xmm_t but
              * that complicates our struct layouts */
diff --git a/core/win32/inject.c b/core/win32/inject.c
index a8f72103b4af1fc64e90a3ab1b2c1aed84c539f2..fd6e5a641e224637122b954cde29bc209d46592d 100644
--- a/core/win32/inject.c
+++ b/core/win32/inject.c
@@ -243,7 +243,7 @@ inject_into_thread(HANDLE phandle, CONTEXT *cxt, HANDLE thandle, char *dynamo_pa
             int i, j;
             /* For x86, ensure we have ExtendedRegisters space (i#1223) */
             IF_NOT_X64(ASSERT(TEST(CONTEXT_XMM_FLAG, cxt->ContextFlags)));
-            for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+            for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) {
                 for (j = 0; j < IF_X64_ELSE(2, 4); j++) {
                     *bufptr++ = CXT_XMM(cxt, i)->reg[j];
                 }
@@ -254,7 +254,7 @@ inject_into_thread(HANDLE phandle, CONTEXT *cxt, HANDLE thandle, char *dynamo_pa
             }
         } else {
             /* skip xmm slots */
-            bufptr += XMM_SLOTS_SIZE / sizeof(*bufptr);
+            bufptr += MCTX_TOTAL_SIMD_SLOTS_SIZE / sizeof(*bufptr);
         }
         ASSERT((char *)bufptr - (char *)buf == sizeof(priv_mcontext_t));
         *bufptr++ = (ptr_uint_t)load_dynamo_code;
diff --git a/core/win32/ntdll.c b/core/win32/ntdll.c
index 589b45b075335992b4ac158dc45739206ff73116..7d6f8c554cf92f47a5ecf3f42a34195f86179165 100644
--- a/core/win32/ntdll.c
+++ b/core/win32/ntdll.c
@@ -1124,7 +1124,7 @@ context_to_mcontext_internal(priv_mcontext_t *mcontext, CONTEXT *cxt)
         /* no harm done if no sse support */
         /* CONTEXT_FLOATING_POINT or CONTEXT_EXTENDED_REGISTERS */
         int i;
-        for (i = 0; i < NUM_SIMD_SLOTS; i++)
+        for (i = 0; i < proc_num_simd_saved(); i++)
             memcpy(&mcontext->ymm[i], CXT_XMM(cxt, i), XMM_REG_SIZE);
     }
     /* if XSTATE is NOT set, the app has NOT used any ymm state and
@@ -1134,7 +1134,7 @@ context_to_mcontext_internal(priv_mcontext_t *mcontext, CONTEXT *cxt)
         byte *ymmh_area = context_ymmh_saved_area(cxt);
         if (ymmh_area != NULL) {
             int i;
-            for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+            for (i = 0; i < proc_num_simd_saved(); i++) {
                 memcpy(&mcontext->ymm[i].u32[4], &YMMH_AREA(ymmh_area, i).u32[0],
                        YMMH_REG_SIZE);
             }
@@ -1225,7 +1225,7 @@ mcontext_to_context(CONTEXT *cxt, priv_mcontext_t *mcontext, bool set_cur_seg)
         memcpy(&cxt->ExtendedRegisters, fpstate, written);
 #        endif
         /* Now update w/ the xmm values from mcontext */
-        for (i = 0; i < NUM_SIMD_SLOTS; i++)
+        for (i = 0; i < proc_num_simd_saved(); i++)
             memcpy(CXT_XMM(cxt, i), &mcontext->ymm[i], XMM_REG_SIZE);
     }
     if (CONTEXT_PRESERVE_YMM && TESTALL(CONTEXT_XSTATE, cxt->ContextFlags)) {
@@ -1255,7 +1255,7 @@ mcontext_to_context(CONTEXT *cxt, priv_mcontext_t *mcontext, bool set_cur_seg)
             memcpy(&YMMH_AREA(ymmh_area, 6).u32[0], &ymms[0].u32[4], YMMH_REG_SIZE);
             memcpy(&YMMH_AREA(ymmh_area, 7).u32[0], &ymms[1].u32[4], YMMH_REG_SIZE);
 #        endif
-            for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+            for (i = 0; i < proc_num_simd_saved(); i++) {
                 memcpy(&YMMH_AREA(ymmh_area, i).u32[0], &mcontext->ymm[i].u32[4],
                        YMMH_REG_SIZE);
             }
diff --git a/core/win32/syscall.c b/core/win32/syscall.c
index bd3f7c738117e2e13a7f4c5ff5fd48fbd734db16..38525dffc004d3f5be0f5bc1eb71b3277d199b9d 100644
--- a/core/win32/syscall.c
+++ b/core/win32/syscall.c
@@ -3297,13 +3297,15 @@ postsys_GetContextThread(dcontext_t *dcontext, reg_t *param_base, bool success)
             if (TESTALL(CONTEXT_XMM_FLAG, cxt->ContextFlags) &&
                 preserve_xmm_caller_saved()) {
                 /* PR 264138 */
-                memcpy(CXT_XMM(cxt, 0), CXT_XMM(xlate_cxt, 0), XMM_SAVED_SIZE);
+                memcpy(CXT_XMM(cxt, 0), CXT_XMM(xlate_cxt, 0),
+                       MCTX_TOTAL_SIMD_SLOTS_SIZE);
             }
             if (TESTALL(CONTEXT_YMM_FLAG, cxt->ContextFlags) &&
                 preserve_xmm_caller_saved()) {
                 byte *ymmh_area = context_ymmh_saved_area(cxt);
                 ASSERT(ymmh_area != NULL);
-                memcpy(ymmh_area, context_ymmh_saved_area(xlate_cxt), YMMH_SAVED_SIZE);
+                memcpy(ymmh_area, context_ymmh_saved_area(xlate_cxt),
+                       MCTX_YMMH_SLOTS_SIZE);
             }
         }
         SELF_PROTECT_LOCAL(trec->dcontext, READONLY);
diff --git a/make/DynamoRIOConfig.cmake.in b/make/DynamoRIOConfig.cmake.in
index 292f84d2658e85e2c64613293545ff59b18adda4..371034f692f7852e81f7918e690aeee5f3210016 100755
--- a/make/DynamoRIOConfig.cmake.in
+++ b/make/DynamoRIOConfig.cmake.in
@@ -186,6 +186,10 @@
 #
 #  set(DynamoRIO_PAGE_SIZE_COMPATIBILITY ON)
 #
+# To request that NUM_SIMD_SLOTS and NUM_XMM_SLOTS be defined, set this variable:
+#
+#  set(DynamoRIO_NUM_SIMD_SLOTS_COMPATIBILITY ON)
+#
 # To request a preferred base even when not targeting 64-bit:
 # (the actual base will use PREFERRED_BASE if set)
 #
@@ -574,6 +578,10 @@ function (DynamoRIO_extra_cflags flags_out extra_cflags tgt_cxx)
     set(extra_cflags "${extra_cflags} -DDR_PAGE_SIZE_COMPATIBILITY")
   endif (DynamoRIO_PAGE_SIZE_COMPATIBILITY)
 
+  if (DynamoRIO_NUM_SIMD_SLOTS_COMPATIBILITY)
+    set(extra_cflags "${extra_cflags} -DDR_NUM_SIMD_SLOTS_COMPATIBILITY")
+  endif (DynamoRIO_NUM_SIMD_SLOTS_COMPATIBILITY)
+
   if (DynamoRIO_LOG_COMPATIBILITY)
     set(extra_cflags "${extra_cflags} -DDR_LOG_DEFINE_COMPATIBILITY")
   endif ()
diff --git a/make/DynamoRIOConfigVersion.cmake.in b/make/DynamoRIOConfigVersion.cmake.in
index 837f5b719b94a1564b604601513307a1a1f15fd1..b96bf40461cff7a5cfb18fcd8cf049cbcc6d8bf2 100644
--- a/make/DynamoRIOConfigVersion.cmake.in
+++ b/make/DynamoRIOConfigVersion.cmake.in
@@ -96,4 +96,8 @@ if (NOT "${PACKAGE_FIND_VERSION_MAJOR}" EQUAL 0)
   elseif ("${PACKAGE_FIND_VERSION}" VERSION_LESS "7.0")
     set(DynamoRIO_LOG_COMPATIBILITY ON PARENT_SCOPE)
   endif ()
+  # Automatically define NUM_SIMD_SLOTS if the client targets a version older than 7.1.
+  if ("${PACKAGE_FIND_VERSION}" VERSION_LESS "7.1")
+    set(DynamoRIO_NUM_SIMD_SLOTS_COMPATIBILITY ON PARENT_SCOPE)
+  endif ()
 endif ()
diff --git a/suite/runsuite_wrapper.pl b/suite/runsuite_wrapper.pl
index a8684ff4cf6d9dcdea8fa1105e2712245c5f2488..c11e71230cc33f32ef4be3b82b26882657f3326a 100755
--- a/suite/runsuite_wrapper.pl
+++ b/suite/runsuite_wrapper.pl
@@ -77,7 +77,7 @@ if ($child) {
     # that has to be manually downloaded.  We thus stick with -V for
     # Travis.  For Appveyor where many devs have no local Visual
     # Studio we do use -VV so build warning details are visible.
-    my $verbose = "-V";
+    my $verbose = "-VV";
     if ($^O eq 'cygwin') {
         $verbose = "-VV";
         # CMake is native Windows so pass it a Windows path.
diff --git a/suite/tests/api/opnd-a64.c b/suite/tests/api/opnd-a64.c
index 26cc302e16244002ea6d0216f59bee6eb69912cf..5f62c872f8f848221c713203783b8c3b130e3e72 100644
--- a/suite/tests/api/opnd-a64.c
+++ b/suite/tests/api/opnd-a64.c
@@ -58,7 +58,7 @@ test_get_size()
     }
 
     // Check sizes of FP/SIMD regs.
-    for (uint i = 0; i < NUM_SIMD_SLOTS; i++) {
+    for (int i = 0; i < proc_num_simd_saved(); i++) {
         ASSERT(reg_get_size((reg_id_t)DR_REG_H0 + i) == OPSZ_2);
         ASSERT(reg_get_size((reg_id_t)DR_REG_S0 + i) == OPSZ_4);
         ASSERT(reg_get_size((reg_id_t)DR_REG_D0 + i) == OPSZ_8);
diff --git a/suite/tests/client-interface/cleancall-opt-1.dll.c b/suite/tests/client-interface/cleancall-opt-1.dll.c
index c0eaea3037d853f361c22153e5899317dfbbc93f..32e293169752602e420316002acd840ca023c27c 100644
--- a/suite/tests/client-interface/cleancall-opt-1.dll.c
+++ b/suite/tests/client-interface/cleancall-opt-1.dll.c
@@ -125,7 +125,7 @@ event_basic_block(void *dc, void *tag, instrlist_t *bb, bool for_trace, bool tra
 static instrlist_t *
 codegen_out_of_line(void *dc)
 {
-    uint i;
+    int i;
     instrlist_t *ilist = instrlist_create(dc);
 
     codegen_prologue(dc, ilist);
@@ -138,7 +138,7 @@ codegen_out_of_line(void *dc)
     }
     /* FIXME i#1569: FMOV support is NYI on AArch64 */
 #ifdef X86
-    for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+    for (i = 0; i < proc_num_simd_saved(); i++) {
         reg_id_t reg = DR_REG_XMM0 + (reg_id_t)i;
         APP(ilist,
             INSTR_CREATE_movd(dc, opnd_create_reg(reg),
diff --git a/suite/tests/client-interface/cleancall-opt-shared.h b/suite/tests/client-interface/cleancall-opt-shared.h
index 8ceb2fb9a08b76e58c0f87808908426a1e801af6..1e0f2f3d1837f90479b9a9071a83dcd95effe293 100644
--- a/suite/tests/client-interface/cleancall-opt-shared.h
+++ b/suite/tests/client-interface/cleancall-opt-shared.h
@@ -269,12 +269,12 @@ mcontexts_equal(dr_mcontext_t *mc_a, dr_mcontext_t *mc_b, int func_index)
 #ifdef X86
     /* Only look at the initialized bits of the SSE regs. */
     ymm_bytes_used = (proc_has_feature(FEATURE_AVX) ? 32 : 16);
-    for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+    for (i = 0; i < proc_num_simd_saved(); i++) {
         if (memcmp(&mc_a->ymm[i], &mc_b->ymm[i], ymm_bytes_used) != 0)
             return false;
     }
 #elif defined(AARCH64)
-    for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+    for (i = 0; i < proc_num_simd_saved(); i++) {
         if (memcmp(&mc_a->simd[i], &mc_b->simd[i], sizeof(dr_simd_t)) != 0)
             return false;
     }
@@ -301,7 +301,7 @@ dump_diff_mcontexts(void)
     }
 
     dr_fprintf(STDERR, "Printing XMM regs:\n");
-    for (i = 0; i < NUM_SIMD_SLOTS; i++) {
+    for (i = 0; i < MCTX_NUM_SIMD_SLOTS; i++) {
 #ifdef X86
         dr_ymm_t before_reg = before_mcontext.ymm[i];
         dr_ymm_t after_reg = after_mcontext.ymm[i];