diff --git a/core/arch/x86/x86.asm b/core/arch/x86/x86.asm
index 55ca3091a39261fb28ccaf3826c2a26756ed2aeb..4daa2756b8dfa4de925d4493c3b71e03e36343f7 100644
--- a/core/arch/x86/x86.asm
+++ b/core/arch/x86/x86.asm
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2011-2017 Google, Inc.  All rights reserved.
+ * Copyright (c) 2011-2018 Google, Inc.  All rights reserved.
  * Copyright (c) 2001-2010 VMware, Inc.  All rights reserved.
  * ********************************************************** */
 
@@ -594,7 +594,7 @@ GLOBAL_LABEL(cleanup_and_terminate:)
         mov      REG_XBX, PTRSZ [1*ARG_SZ + REG_XBP] /* dcontext */
         SAVE_TO_DCONTEXT_VIA_REG(REG_XBX,is_exiting_OFFSET,1)
         CALLC1(GLOBAL_REF(is_currently_on_dstack), REG_XBX) /* xbx is callee-saved */
-        cmp      REG_XAX, 0
+        cmp      al, 0
         jnz      cat_save_dstack
         mov      REG_XBX, 0 /* save 0 for dstack to avoid double-free */
         jmp      cat_done_saving_dstack
@@ -1486,7 +1486,7 @@ GLOBAL_LABEL(master_signal_handler:)
         mov      REG_XAX, REG_XSP
         /* call a C routine rather than writing everything in asm */
         CALLC2(GLOBAL_REF(sig_should_swap_stack), REG_XAX, REG_XDX)
-        cmp      REG_XAX, 0
+        cmp      al, 0
         pop      REG_XAX /* clone_and_swap_args.stack */
         pop      REG_XCX /* clone_and_swap_args.tos */
         je       no_swap
diff --git a/core/dynamo.c b/core/dynamo.c
index 4443502e0beffdb4b54661b25d37029ecd2e94a2..25d2725253e257480f46475824dd6cff5c369f98 100644
--- a/core/dynamo.c
+++ b/core/dynamo.c
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2010-2017 Google, Inc.  All rights reserved.
+ * Copyright (c) 2010-2018 Google, Inc.  All rights reserved.
  * Copyright (c) 2000-2010 VMware, Inc.  All rights reserved.
  * **********************************************************/
 
@@ -1662,6 +1662,7 @@ delete_dynamo_context(dcontext_t *dcontext, bool free_stack)
     if (free_stack) {
         ASSERT(dcontext->dstack != NULL);
         ASSERT(!is_currently_on_dstack(dcontext));
+        LOG(GLOBAL, LOG_THREADS, 1, "Freeing DR stack "PFX"\n", dcontext->dstack);
         stack_free(dcontext->dstack, DYNAMORIO_STACK_SIZE);
     } /* else will be cleaned up by caller */
 
@@ -2313,8 +2314,8 @@ dynamo_thread_init(byte *dstack_in, priv_mcontext_t *mc
         IF_CLIENT_INTERFACE_ELSE(client_thread ? "CLIENT " : "", ""),
         get_thread_id(), dcontext);
     LOG(THREAD, LOG_TOP|LOG_THREADS, 1,
-        "DR stack is "PFX"-"PFX"\n", dcontext->dstack - DYNAMORIO_STACK_SIZE,
-        dcontext->dstack);
+        "DR stack is "PFX"-"PFX" (passed in "PFX")\n",
+        dcontext->dstack - DYNAMORIO_STACK_SIZE, dcontext->dstack, dstack_in);
 #endif
 
 #ifdef DEADLOCK_AVOIDANCE
diff --git a/core/nudge.c b/core/nudge.c
index 2449f3581a8c5e4ff15fbceb7985195b6034e655..cac29dd57252b0d9f74b5d5e7422166ed0df3a4d 100644
--- a/core/nudge.c
+++ b/core/nudge.c
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2011-2017 Google, Inc.  All rights reserved.
+ * Copyright (c) 2011-2018 Google, Inc.  All rights reserved.
  * Copyright (c) 2008-2010 VMware, Inc.  All rights reserved.
  * **********************************************************/
 
@@ -125,14 +125,17 @@ nudge_thread_cleanup(dcontext_t *dcontext, bool exit_process, uint exit_code)
      *  terminate nudge threads instead of allowing them to return and exit normally.
      *  On XP and 2k3 none of our nudge creation routines inform csrss of the new thread
      *  (which is who typically frees the stacks).
-     *  On Vista we don't use NtCreateThreadEx to create the nudge threads so the kernel
-     *  doesn't free the stack.
+     * On Vista and Win7 we don't use NtCreateThreadEx to create the nudge threads so
+     *  the kernel doesn't free the stack.
      * As such we are left with two options: free the app stack here (nudgee free) or
-     * have the nudge thread creator free the app stack (nudger free).  Going with
-     * nudgee free means we leak exit race nudge stacks whereas if we go with nudger free
-     * for external nudges then we'll leak timed out nudge stacks (for internal nudges
-     * we pretty much have to do nudgee free).  A nudge_arg_t flag is used to specify
-     * which model we use, but currently we always nudgee free.
+     *  have the nudge thread creator free the app stack (nudger free).  Going with
+     *  nudgee free means we leak exit race nudge stacks whereas if we go with nudger free
+     *  for external nudges then we'll leak timed out nudge stacks (for internal nudges
+     *  we pretty much have to do nudgee free).  A nudge_arg_t flag is used to specify
+     *  which model we use, but currently we always nudgee free.
+     * On Win8+ we do use NtCreateThreadEx to create the nudge threads so the kernel
+     *  does free the stack.  We could use this on Vista and Win7 too -- should we?
+     *  It requires someone to free the argument buffer (NUDGE_FREE_ARG).
      *
      * dynamo_thread_exit_common() is where the app stack is actually freed, not here.
      */
@@ -219,8 +222,6 @@ generic_nudge_handler(nudge_arg_t *arg_dont_use)
     /* if needed tell thread exit to free the application stack */
     if (!TEST(NUDGE_NUDGER_FREE_STACK, safe_arg.flags)) {
         dcontext->free_app_stack = true;
-    } else {
-        ASSERT_NOT_TESTED();
     }
 
     /* FIXME - would be nice to inform nudge creator if we need to nop the nudge. */
@@ -262,7 +263,6 @@ generic_nudge_handler(nudge_arg_t *arg_dont_use)
 
     /* Free the arg if requested. */
     if (TEST(NUDGE_FREE_ARG, safe_arg.flags)) {
-        ASSERT_NOT_TESTED();
         nt_free_virtual_memory(arg_dont_use);
     }
 
@@ -488,10 +488,18 @@ nudge_internal(process_id_t pid, uint nudge_action_mask,
 
     nudge_arg.version = NUDGE_ARG_CURRENT_VERSION;
     nudge_arg.nudge_action_mask = nudge_action_mask;
-    /* we do not set NUDGE_NUDGER_FREE_STACK so the stack will be freed
-     * in the target process
+    /* We do not set NUDGE_NUDGER_FREE_STACK so the stack will be freed
+     * in the target process, for <=win7.
      */
     nudge_arg.flags = (internal ? NUDGE_IS_INTERNAL : 0);
+#ifdef WINDOWS
+    if (get_os_version() >= WINDOWS_VERSION_8) {
+        /* The kernel owns and frees the stack. */
+        nudge_arg.flags |= NUDGE_NUDGER_FREE_STACK;
+        /* The arg was placed in a new kernel alloc. */
+        nudge_arg.flags |= NUDGE_FREE_ARG;
+    }
+#endif
     nudge_arg.client_arg = client_arg;
     nudge_arg.client_id = client_id;
 
diff --git a/core/win32/ntdll.c b/core/win32/ntdll.c
index cb6e9443769be297e3add65107f566685115e93e..d945c7ae1b585634ccceec63402b25bf645aaeea 100644
--- a/core/win32/ntdll.c
+++ b/core/win32/ntdll.c
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2010-2017 Google, Inc.  All rights reserved.
+ * Copyright (c) 2010-2018 Google, Inc.  All rights reserved.
  * Copyright (c) 2003-2010 VMware, Inc.  All rights reserved.
  * **********************************************************/
 
@@ -4596,7 +4596,7 @@ create_thread_common(HANDLE hProcess, bool target_64bit, void *start_addr,
     res = NT_SYSCALL(CreateThread, &hthread, THREAD_ALL_ACCESS, &oa,
                      hProcess, &cid, context, stack, (byte)TRUE);
     if (!NT_SUCCESS(res)) {
-        NTPRINT("create_thread: failed to create thread\n");
+        NTPRINT("create_thread: failed to create thread: %x\n", res);
         return INVALID_HANDLE_VALUE;
     }
     /* Xref PR 252008 & PR 252745 - on 32-bit Windows the kernel will set esp
@@ -4617,6 +4617,74 @@ create_thread_common(HANDLE hProcess, bool target_64bit, void *start_addr,
     return hthread;
 }
 
+static HANDLE
+our_create_thread_ex(HANDLE hProcess, bool target_64bit, void *start_addr,
+                     void *arg, const void *arg_buf, size_t arg_buf_size,
+                     uint stack_reserve, uint stack_commit, bool suspended,
+                     thread_id_t *tid)
+{
+    HANDLE hthread;
+    OBJECT_ATTRIBUTES oa;
+    CLIENT_ID cid;
+    TEB *teb;
+    void *thread_arg = arg;
+    create_thread_info_t info = {0};
+    NTSTATUS res;
+    /* NtCreateThreadEx doesn't exist prior to Vista. */
+    ASSERT(syscalls[SYS_CreateThreadEx] != SYSCALL_NOT_PRESENT);
+    GET_RAW_SYSCALL(CreateThreadEx,
+                    OUT PHANDLE ThreadHandle,
+                    IN ACCESS_MASK DesiredAccess,
+                    IN POBJECT_ATTRIBUTES ObjectAttributes,
+                    IN HANDLE ProcessHandle,
+                    IN LPTHREAD_START_ROUTINE Win32StartAddress,
+                    IN LPVOID StartParameter,
+                    IN BOOL CreateSuspended,
+                    IN uint StackZeroBits,
+                    IN SIZE_T StackCommitSize,
+                    IN SIZE_T StackReserveSize,
+                    INOUT create_thread_info_t *thread_info);
+
+    InitializeObjectAttributes(&oa, NULL, OBJ_CASE_INSENSITIVE, NULL, NULL);
+
+    if (arg_buf != NULL) {
+        /* XXX: Currently we leak this memory, except for nudge where the caller
+         * sets NUDGE_FREE_ARG.
+         */
+        if (!NT_SUCCESS
+            (nt_remote_allocate_virtual_memory
+             (hProcess, &thread_arg, arg_buf_size, PAGE_READWRITE, MEM_COMMIT))) {
+            NTPRINT("create_thread: failed to allocate arg buf\n");
+            return INVALID_HANDLE_VALUE;
+        }
+        if (!nt_write_virtual_memory(hProcess, thread_arg, arg_buf,
+                                     arg_buf_size, NULL)) {
+            NTPRINT("create_thread: failed to write arguments\n");
+            return INVALID_HANDLE_VALUE;
+        }
+    }
+
+    info.struct_size = sizeof(info);
+    info.client_id.flags = THREAD_INFO_ELEMENT_CLIENT_ID | THREAD_INFO_ELEMENT_UNKNOWN_2;
+    info.client_id.buffer_size = sizeof(cid);
+    info.client_id.buffer = &cid;
+    /* We get STATUS_INVALID_PARAMETER unless we also ask for teb. */
+    info.teb.flags = THREAD_INFO_ELEMENT_TEB | THREAD_INFO_ELEMENT_UNKNOWN_2;
+    info.teb.buffer_size = sizeof(TEB*);
+    info.teb.buffer = &teb;
+    res = NT_RAW_SYSCALL(CreateThreadEx, &hthread, THREAD_ALL_ACCESS, &oa, hProcess,
+                         (LPTHREAD_START_ROUTINE)convert_data_to_function(start_addr),
+                         thread_arg, !!suspended, 0, stack_commit, stack_reserve,
+                         &info);
+    if (!NT_SUCCESS(res)) {
+        NTPRINT("create_thread_ex: failed to create thread: %x\n", res);
+        return INVALID_HANDLE_VALUE;
+    }
+    if (tid != NULL)
+        *tid = (thread_id_t)cid.UniqueThread;
+    return hthread;
+}
+
 /* Creates a new stack w/ guard page */
 HANDLE
 our_create_thread(HANDLE hProcess, bool target_64bit, void *start_addr,
@@ -4631,6 +4699,14 @@ our_create_thread(HANDLE hProcess, bool target_64bit, void *start_addr,
     ASSERT(stack_commit + PAGE_SIZE <= stack_reserve &&
            ALIGNED(stack_commit, PAGE_SIZE) &&
            ALIGNED(stack_reserve, PAGE_SIZE));
+
+    if (get_os_version() >= WINDOWS_VERSION_8) {
+        /* NtCreateThread not available: use Ex where the kernel makes the stack. */
+        return our_create_thread_ex(hProcess, target_64bit, start_addr, arg, arg_buf,
+                                    arg_buf_size, stack_reserve, stack_commit,
+                                    suspended, tid);
+    }
+
     if (!NT_SUCCESS(nt_remote_allocate_virtual_memory(hProcess,
                                                       &stack.ExpandableStackBottom,
                                                       stack_reserve, PAGE_READWRITE,
@@ -4638,8 +4714,7 @@ our_create_thread(HANDLE hProcess, bool target_64bit, void *start_addr,
         NTPRINT("create_thread: failed to allocate stack\n");
         return INVALID_HANDLE_VALUE;
     }
-    /* FIXME : for failures beyond this point we don't bother deallocating the
-     * stack. */
+    /* For failures beyond this point we don't bother deallocating the stack. */
     stack.ExpandableStackBase =
         ((byte *)stack.ExpandableStackBottom) + stack_reserve;
     stack.ExpandableStackLimit =
@@ -4662,13 +4737,22 @@ our_create_thread(HANDLE hProcess, bool target_64bit, void *start_addr,
                                 arg_buf, arg_buf_size, &stack, suspended, tid);
 }
 
-/* Uses caller-allocated stack */
+/* Uses caller-allocated stack.  hProcess must be NT_CURRENT_PROCESS for win8+. */
 HANDLE
 our_create_thread_have_stack(HANDLE hProcess, bool target_64bit, void *start_addr,
                              void *arg, const void *arg_buf, size_t arg_buf_size,
                              byte *stack_base, size_t stack_size,
                              bool suspended, thread_id_t *tid)
 {
+    if (get_os_version() >= WINDOWS_VERSION_8) {
+        /* FIXME i#1309: we need a wrapper function so we can use NtCreateThreadEx
+         * and then switch stacks.  This is too hard to arrange in another process.
+         * NtCreateThread seems to work in some cases (on Appveyor) so until we have
+         * an NtCreateThreadEx solution we fall through.
+         */
+        ASSERT(hProcess == NT_CURRENT_PROCESS &&
+               "No support for creating a remote thread with a custom stack");
+    }
     USER_STACK stack = {0};
     stack.ExpandableStackBase = stack_base;
     stack.ExpandableStackLimit = stack_base - stack_size;
diff --git a/core/win32/ntdll.h b/core/win32/ntdll.h
index b9cc2d654209e4875658fb99cb5dab7738755cc5..7494a53b585aec62e623a410c346add6f8e2a4c5 100644
--- a/core/win32/ntdll.h
+++ b/core/win32/ntdll.h
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2011-2017 Google, Inc.  All rights reserved.
+ * Copyright (c) 2011-2018 Google, Inc.  All rights reserved.
  * Copyright (c) 2003-2010 VMware, Inc.  All rights reserved.
  * **********************************************************/
 
@@ -939,6 +939,7 @@ typedef enum { /* NOTE - these are speculative */
                                                 * - INOUT */
     THREAD_INFO_ELEMENT_UNKNOWN_1       = 0x9, /* Unknown - ptr_uint_t sized
                                                 * [ observed 1 ] - IN */
+    THREAD_INFO_ELEMENT_UNKNOWN_2       = 0x10000,
 } thread_info_elm_buf_type_t;
 
 typedef struct _thread_info_element_t { /* NOTE - this is speculative */
@@ -2102,13 +2103,18 @@ nt_stop_profile(HANDLE profile_handle);
 HANDLE
 create_process(wchar_t *exe, wchar_t *cmdline);
 
-/* NOTE see important usage information in ntdll.c, threads created with this
- * function can NOT return from their start routine */
+/* See important usage information in ntdll.c: threads created with this
+ * function can NOT return from their start routine.
+ * On Win8+, the kernel owns the created stack; o/w, we own it.
+ * On Win8+, if arg_buf != NULL, it's placed in a new virtual alloc and it's
+ * up to the caller to free it.
+ */
 HANDLE
 our_create_thread(HANDLE hProcess, bool target_64bit, void *start_addr,
                   void *arg, const void *arg_buf, size_t arg_buf_size,
                   uint stack_reserve, uint stack_commit, bool suspended,
                   thread_id_t *tid);
+/* Uses caller-allocated stack.  hProcess must be NT_CURRENT_PROCESS for win8+. */
 HANDLE
 our_create_thread_have_stack(HANDLE hProcess, bool target_64bit, void *start_addr,
                              void *arg, const void *arg_buf, size_t arg_buf_size,