From 8d81ca6ae7fcd6c2f9e673c990d5432f5457716a Mon Sep 17 00:00:00 2001
From: Kevin Malachowski <kevin@chowski.com>
Date: Tue, 12 Dec 2017 17:07:45 -0800
Subject: [PATCH 1/4] i#2157 re-attach: Add timeout to os_thread_suspend

After 5 seconds of waiting for a thread to acknowledge a received
signal, os_thread_suspend now returns false so that the caller can
retry.

Issue #2157
---
 core/synch.c   |  4 ++--
 core/unix/os.c | 13 ++++++++++---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/core/synch.c b/core/synch.c
index 4f6609941..3a10dbde7 100644
--- a/core/synch.c
+++ b/core/synch.c
@@ -1922,10 +1922,10 @@ detach_on_permanent_stack(bool internal, bool do_cleanup)
     DEBUG_DECLARE(bool ok;)
     DEBUG_DECLARE(int exit_res;)
     /* synch-all flags: if we fail to suspend a thread (e.g., privilege
-     * problems) ignore it.  XXX Should we retry instead?
+     * problems) retry it.
      */
     /* i#297: we only synch client threads after process exit event. */
-    uint flags = THREAD_SYNCH_SUSPEND_FAILURE_IGNORE | THREAD_SYNCH_SKIP_CLIENT_THREAD;
+    uint flags = THREAD_SYNCH_SUSPEND_FAILURE_RETRY | THREAD_SYNCH_SKIP_CLIENT_THREAD;
 
     ENTERING_DR();
 
diff --git a/core/unix/os.c b/core/unix/os.c
index a538cec3d..0512ae422 100644
--- a/core/unix/os.c
+++ b/core/unix/os.c
@@ -3474,10 +3474,17 @@ os_thread_suspend(thread_record_t *tr)
      */
     mutex_unlock(&ostd->suspend_lock);
     while (ksynch_get_value(&ostd->suspended) == 0) {
-        /* For Linux, waits only if the suspended flag is not set as 1. Return value
-         * doesn't matter because the flag will be re-checked.
+        /* For Linux, waits only if the suspended flag is not set as 1. Other
+         * than a timeout value, the return value doesn't matter because the
+         * flag will be re-checked.
          */
-        ksynch_wait(&ostd->suspended, 0, 0);
+        if (ksynch_wait(&ostd->suspended, 0, 5000) == -ETIMEDOUT) {
+            mutex_lock(&ostd->suspend_lock);
+            ASSERT(ostd->suspend_count > 0);
+            ostd->suspend_count--;
+            mutex_unlock(&ostd->suspend_lock);
+            return false;
+        }
         if (ksynch_get_value(&ostd->suspended) == 0) {
             /* If it still has to wait, give up the cpu. */
             os_thread_yield();
-- 
GitLab


From f1602798dd46619a974fc41fde5e59a5cb12ab62 Mon Sep 17 00:00:00 2001
From: Kevin Malachowski <kevin@chowski.com>
Date: Thu, 14 Dec 2017 13:58:56 -0800
Subject: [PATCH 2/4] Address review comments.

* Add timeout parameter to os_suspend_thread.
* Move decision about whether a thread suspend request times out to the caller of synch_with_thread, specifically its 'block' parameter.
* Implemented correctly for macos
* Changed code around so that Windows is unaffected.
---
 core/dynamo.c            |  2 +-
 core/os_shared.h         |  6 +++++-
 core/synch.c             | 25 +++++++++++++++++++++----
 core/unix/ksynch.h       |  3 ++-
 core/unix/ksynch_macos.c | 11 ++++++++++-
 core/unix/os.c           |  4 ++--
 core/win32/os.c          |  2 +-
 7 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/core/dynamo.c b/core/dynamo.c
index 2882b5547..c7b935fad 100644
--- a/core/dynamo.c
+++ b/core/dynamo.c
@@ -3111,7 +3111,7 @@ dynamorio_unprotect(void)
                     for (tr = all_threads[i]; tr; tr = tr->next) {
                         if (tr->under_dynamo_control) {
                             DEBUG_DECLARE(bool ok =)
-                                os_thread_suspend(all_threads[i]);
+                                os_thread_suspend(all_threads[i], 0);
                             ASSERT(ok);
                             protect_info->num_threads_suspended++;
                         }
diff --git a/core/os_shared.h b/core/os_shared.h
index 112907503..074b79ed6 100644
--- a/core/os_shared.h
+++ b/core/os_shared.h
@@ -151,7 +151,11 @@ thread_id_t get_thread_id(void);
 process_id_t get_process_id(void);
 void os_thread_yield(void);
 void os_thread_sleep(uint64 milliseconds);
-bool os_thread_suspend(thread_record_t *tr);
+/* os_thread_suspend may return false in the case of a timeout. Note that the
+ * timeout field is best effort and may be ignored by implementations (i.e.
+ * Windows); setting timeout to 0 results in blocking forever.
+ */
+bool os_thread_suspend(thread_record_t *tr, int timeout_ms);
 bool os_thread_resume(thread_record_t *tr);
 bool os_thread_terminate(thread_record_t *tr);
 
diff --git a/core/synch.c b/core/synch.c
index 3a10dbde7..968b38f94 100644
--- a/core/synch.c
+++ b/core/synch.c
@@ -45,6 +45,9 @@
 #include "native_exec.h"
 #include <string.h> /* for memcpy */
 
+/* Give threads 5 seconds to suspend when given a non-blocking synch request */
+#define SUSPEND_THREAD_TIMEOUT 5000
+
 extern vm_area_vector_t *fcache_unit_areas; /* from fcache.c */
 
 static bool started_detach = false; /* set before synchall */
@@ -888,6 +891,7 @@ synch_with_thread(thread_id_t id, bool block, bool hold_initexit_lock,
     IF_UNIX(bool actually_suspended = true;)
     const uint max_loops = TEST(THREAD_SYNCH_SMALL_LOOP_MAX, flags) ?
         (SYNCH_MAXIMUM_LOOPS/10) : SYNCH_MAXIMUM_LOOPS;
+    int thread_suspend_timeout_ms = block ? 0 : SUSPEND_THREAD_TIMEOUT;
 
     ASSERT(id != my_id);
     /* Must set ABORT or IGNORE.  Only caller can RETRY as need a new
@@ -960,7 +964,7 @@ synch_with_thread(thread_id_t id, bool block, bool hold_initexit_lock,
                 adjust_wait_at_safe_spot(trec->dcontext, 1);
                 first_loop = false;
             }
-            if (!os_thread_suspend(trec)) {
+            if (!os_thread_suspend(trec, thread_suspend_timeout_ms)) {
                 /* FIXME : eventually should be a real assert once we figure out
                  * how to handle threads with low privilege handles */
                 /* For dr_api_exit, we may have missed a thread exit. */
@@ -1921,11 +1925,24 @@ detach_on_permanent_stack(bool internal, bool do_cleanup)
 #endif
     DEBUG_DECLARE(bool ok;)
     DEBUG_DECLARE(int exit_res;)
-    /* synch-all flags: if we fail to suspend a thread (e.g., privilege
-     * problems) retry it.
+
+    /* synch-all flags: */
+    uint flags = 0;
+#ifdef WINDOWS
+    /* For Windows we may fail to suspend a thread (e.g., privilege
+     * problems), and in that case we want to just ignore the failure.
      */
+    flags |= THREAD_SYNCH_SUSPEND_FAILURE_IGNORE;
+#elif defined(UNIX)
+    /*  For Unix, we don't have that sort of permissions troubles, just timing
+     *  races which may cause the SUSPEND_SIGNAL to be dropped (xref i#26). In
+     *  that case, we know we want to retry any failures.
+     */
+    flags |= THREAD_SYNCH_SUSPEND_FAILURE_RETRY;
+#endif
+
     /* i#297: we only synch client threads after process exit event. */
-    uint flags = THREAD_SYNCH_SUSPEND_FAILURE_RETRY | THREAD_SYNCH_SKIP_CLIENT_THREAD;
+    flags |= THREAD_SYNCH_SKIP_CLIENT_THREAD;
 
     ENTERING_DR();
 
diff --git a/core/unix/ksynch.h b/core/unix/ksynch.h
index cff310b86..8a642670d 100644
--- a/core/unix/ksynch.h
+++ b/core/unix/ksynch.h
@@ -87,7 +87,8 @@ ksynch_set_value(mac_synch_t *synch, int new_val);
 KSYNCH_TYPE *
 mutex_get_contended_event(mutex_t *lock);
 
-/* These return 0 on success: */
+/* These return 0 on success and a negative value on failure. ksynch_wait
+ * returns -ETIMEDOUT if there was a timeout condition. */
 
 ptr_int_t
 ksynch_wait(KSYNCH_TYPE *var, int mustbe, int timeout_ms);
diff --git a/core/unix/ksynch_macos.c b/core/unix/ksynch_macos.c
index 5fb7be7df..a00e167b5 100644
--- a/core/unix/ksynch_macos.c
+++ b/core/unix/ksynch_macos.c
@@ -125,7 +125,16 @@ ksynch_wait(mac_synch_t *synch, int mustbe, int timeout_ms)
         res = semaphore_timedwait(synch->sem, timeout);
     } else
         res = semaphore_wait(synch->sem);
-    return (res == KERN_SUCCESS ? 0 : -1);
+
+    /* Conform to the API specified in ksynch.h */
+    switch (res) {
+    case KERN_SUCCESS:
+        return 0;
+    case KERN_OPERATION_TIMED_OUT:
+        return -ETIMEDOUT;
+    default:
+        return -1;
+    }
 }
 
 ptr_int_t
diff --git a/core/unix/os.c b/core/unix/os.c
index 0512ae422..9036e6699 100644
--- a/core/unix/os.c
+++ b/core/unix/os.c
@@ -3440,7 +3440,7 @@ os_thread_sleep(uint64 milliseconds)
 }
 
 bool
-os_thread_suspend(thread_record_t *tr)
+os_thread_suspend(thread_record_t *tr, int timeout_ms)
 {
     os_thread_data_t *ostd = (os_thread_data_t *) tr->dcontext->os_field;
     ASSERT(ostd != NULL);
@@ -3478,7 +3478,7 @@ os_thread_suspend(thread_record_t *tr)
          * than a timeout value, the return value doesn't matter because the
          * flag will be re-checked.
          */
-        if (ksynch_wait(&ostd->suspended, 0, 5000) == -ETIMEDOUT) {
+        if (ksynch_wait(&ostd->suspended, 0, timeout_ms) == -ETIMEDOUT) {
             mutex_lock(&ostd->suspend_lock);
             ASSERT(ostd->suspend_count > 0);
             ostd->suspend_count--;
diff --git a/core/win32/os.c b/core/win32/os.c
index cfa9f039d..5381cc8da 100644
--- a/core/win32/os.c
+++ b/core/win32/os.c
@@ -4905,7 +4905,7 @@ os_timeout(int time_in_milliseconds)
 }
 
 bool
-os_thread_suspend(thread_record_t *tr)
+os_thread_suspend(thread_record_t *tr, int timeout_ms)
 {
     return nt_thread_suspend(tr->handle, NULL);
 }
-- 
GitLab


From 95b2a24ea7be8f2fc8436c00937ba353bc1fbf6a Mon Sep 17 00:00:00 2001
From: Kevin Malachowski <kevin@chowski.com>
Date: Thu, 14 Dec 2017 15:52:55 -0800
Subject: [PATCH 3/4] Fix windows build

---
 core/win32/os.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/win32/os.c b/core/win32/os.c
index 5381cc8da..892ace828 100644
--- a/core/win32/os.c
+++ b/core/win32/os.c
@@ -7679,7 +7679,7 @@ os_dump_core_live_dump(const char *msg, char *path OUT, size_t path_sz)
                         nt_get_handle_access_rights(tr->handle);
                     TEB *teb_addr = get_teb(tr->handle);
                     DEBUG_DECLARE(bool res = )
-                        os_thread_suspend(tr);
+                        os_thread_suspend(tr, 0);
                     /* we can't assert here (could infinite loop) */
                     DODEBUG({ suspend_failures = suspend_failures || !res; });
                     if (thread_get_context(tr, &cxt)) {
-- 
GitLab


From 9594e901a5cf2b3a3d0168d136fb3f363ca28ee0 Mon Sep 17 00:00:00 2001
From: Kevin Malachowski <kevin@chowski.com>
Date: Fri, 15 Dec 2017 07:30:40 -0800
Subject: [PATCH 4/4] Fix comment format

---
 core/unix/ksynch.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/core/unix/ksynch.h b/core/unix/ksynch.h
index 8a642670d..ad0ccfba4 100644
--- a/core/unix/ksynch.h
+++ b/core/unix/ksynch.h
@@ -88,7 +88,8 @@ KSYNCH_TYPE *
 mutex_get_contended_event(mutex_t *lock);
 
 /* These return 0 on success and a negative value on failure. ksynch_wait
- * returns -ETIMEDOUT if there was a timeout condition. */
+ * returns -ETIMEDOUT if there was a timeout condition.
+ */
 
 ptr_int_t
 ksynch_wait(KSYNCH_TYPE *var, int mustbe, int timeout_ms);
-- 
GitLab