diff --git a/api/samples/memtrace_simple.c b/api/samples/memtrace_simple.c
index f1b2a5de1e5a861f6e4872ff9a9f95cce479a12b..8f950bd801b8dacb6c89a507a1e46fc0762ec626 100644
--- a/api/samples/memtrace_simple.c
+++ b/api/samples/memtrace_simple.c
@@ -37,7 +37,11 @@
  * Collects the memory reference information and dumps it to a file as text.
  *
  * (1) It fills a per-thread-buffer with inlined instrumentation.
- * (2) It calls a clean call to dump the buffer into a file.
+ * (2) It calls a clean call to dump the buffer into a file. On AArch64, clean
+ *     calls add too many additional instructions. To reduce the number of
+ *     instructions added to each instrumented basic block, the clean call
+ *     is placed in a separate code cache page and jumps to that code page
+ *     are inserted instead of clean calls.
  *
  * The profile consists of list of <type, size, addr> entries representing
  * - mem ref instr: e.g., { type = 42 (call), size = 5, addr = 0x7f59c2d002d3 }
@@ -108,6 +112,10 @@ enum {
 static reg_id_t tls_seg;
 static uint     tls_offs;
 static int      tls_idx;
+#if defined(AARCH64)
+static size_t page_size;  /* dr_page_size(), cached in dr_client_main() */
+static app_pc code_cache; /* page holding the out-of-line clean call routine */
+#endif
 #define TLS_SLOT(tls_base, enum_val) (void **)((byte *)(tls_base)+tls_offs+(enum_val))
 #define BUF_PTR(tls_base) *(mem_ref_t **)TLS_SLOT(tls_base, MEMTRACE_TLS_OFFS_BUF_PTR)
 
@@ -147,6 +155,42 @@ clean_call(void)
     memtrace(drcontext);
 }
 
+#if defined(AARCH64)
+static void
+code_cache_init(void)
+{
+    void         *drcontext;
+    instrlist_t  *ilist;
+    instr_t      *where;
+    byte         *end;
+
+    drcontext  = dr_get_current_drcontext();
+    code_cache = dr_nonheap_alloc(page_size,
+                                  DR_MEMPROT_READ  |
+                                  DR_MEMPROT_WRITE |
+                                  DR_MEMPROT_EXEC);
+    ilist = instrlist_create(drcontext);
+    /* The lean procedure simply performs a clean call and then jumps back to
+     * DR's code cache via the return address the caller placed in X14. */
+    where = INSTR_CREATE_br(drcontext, opnd_create_reg(DR_REG_X14));
+    instrlist_meta_append(ilist, where);
+    /* The clean call is inserted before the branch, so it runs first. */
+    dr_insert_clean_call(drcontext, ilist, where, (void *)clean_call, false, 0);
+    /* Encode the instructions into the code cache page, then free the list. */
+    end = instrlist_encode(drcontext, ilist, code_cache, false);
+    DR_ASSERT((size_t)(end - code_cache) < page_size);
+    instrlist_clear_and_destroy(drcontext, ilist);
+    /* Drop write permission: the page is only read and executed from now on. */
+    dr_memory_protect(code_cache, page_size, DR_MEMPROT_READ | DR_MEMPROT_EXEC);
+}
+
+static void
+code_cache_exit(void)
+{
+    dr_nonheap_free(code_cache, page_size); /* page from code_cache_init() */
+}
+#endif
+
 static void
 insert_load_buf_ptr(void *drcontext, instrlist_t *ilist, instr_t *where,
                     reg_id_t reg_ptr)
@@ -231,17 +275,9 @@ insert_save_addr(void *drcontext, instrlist_t *ilist, instr_t *where,
 
 /* insert inline code to add an instruction entry into the buffer */
 static void
-instrument_instr(void *drcontext, instrlist_t *ilist, instr_t *where)
+instrument_instr(void *drcontext, instrlist_t *ilist, instr_t *where,
+                 reg_id_t reg_ptr, reg_id_t reg_tmp)
 {
-    /* We need two scratch registers */
-    reg_id_t reg_ptr, reg_tmp;
-    if (drreg_reserve_register(drcontext, ilist, where, NULL, &reg_ptr) !=
-        DRREG_SUCCESS ||
-        drreg_reserve_register(drcontext, ilist, where, NULL, &reg_tmp) !=
-        DRREG_SUCCESS) {
-        DR_ASSERT(false); /* cannot recover */
-        return;
-    }
     insert_load_buf_ptr(drcontext, ilist, where, reg_ptr);
     insert_save_type(drcontext, ilist, where, reg_ptr, reg_tmp,
                      (ushort)instr_get_opcode(where));
@@ -250,26 +286,13 @@ instrument_instr(void *drcontext, instrlist_t *ilist, instr_t *where)
     insert_save_pc(drcontext, ilist, where, reg_ptr, reg_tmp,
                    instr_get_app_pc(where));
     insert_update_buf_ptr(drcontext, ilist, where, reg_ptr, sizeof(mem_ref_t));
-    /* Restore scratch registers */
-    if (drreg_unreserve_register(drcontext, ilist, where, reg_ptr) != DRREG_SUCCESS ||
-        drreg_unreserve_register(drcontext, ilist, where, reg_tmp) != DRREG_SUCCESS)
-        DR_ASSERT(false);
 }
 
 /* insert inline code to add a memory reference info entry into the buffer */
 static void
 instrument_mem(void *drcontext, instrlist_t *ilist, instr_t *where,
-               opnd_t ref, bool write)
+               opnd_t ref, bool write, reg_id_t reg_ptr, reg_id_t reg_tmp)
 {
-    /* We need two scratch registers */
-    reg_id_t reg_ptr, reg_tmp;
-    if (drreg_reserve_register(drcontext, ilist, where, NULL, &reg_ptr) !=
-        DRREG_SUCCESS ||
-        drreg_reserve_register(drcontext, ilist, where, NULL, &reg_tmp) !=
-        DRREG_SUCCESS) {
-        DR_ASSERT(false); /* cannot recover */
-        return;
-    }
     /* save_addr should be called first as reg_ptr or reg_tmp maybe used in ref */
     insert_save_addr(drcontext, ilist, where, ref, reg_ptr, reg_tmp);
     insert_save_type(drcontext, ilist, where, reg_ptr, reg_tmp,
@@ -277,12 +300,37 @@ instrument_mem(void *drcontext, instrlist_t *ilist, instr_t *where,
     insert_save_size(drcontext, ilist, where, reg_ptr, reg_tmp,
                      (ushort)drutil_opnd_mem_size_in_bytes(ref, where));
     insert_update_buf_ptr(drcontext, ilist, where, reg_ptr, sizeof(mem_ref_t));
-    /* Restore scratch registers */
-    if (drreg_unreserve_register(drcontext, ilist, where, reg_ptr) != DRREG_SUCCESS ||
-        drreg_unreserve_register(drcontext, ilist, where, reg_tmp) != DRREG_SUCCESS)
-        DR_ASSERT(false);
 }
 
+#if defined(AARCH64)
+static void
+insert_lean_call(void *drcontext, instrlist_t *ilist, instr_t *where,
+                 app_pc pc, reg_id_t scratch1, reg_id_t scratch2)
+{
+    /* Instead of an inline clean call, jump to the shared lean procedure, which
+     * performs the full context switch and clean call invocation; this reduces
+     * code cache size. scratch1 must be X14, and pc is currently unused. */
+    DR_ASSERT(scratch1 == DR_REG_X14);
+
+    instr_t *restore = INSTR_CREATE_label(drcontext);
+
+    /* X14 = address of the restore label: the lean procedure's return address */
+    MINSERT(ilist, where,
+        INSTR_CREATE_adr(drcontext,
+                         opnd_create_reg(DR_REG_X14),
+                         opnd_create_instr(restore)));
+
+    /* Load the address of the lean procedure into scratch2 and branch to it */
+    instrlist_insert_mov_immed_ptrsz(drcontext, (ptr_int_t)code_cache,
+                                    opnd_create_reg(scratch2),
+                                    ilist, where, NULL, NULL);
+    MINSERT(ilist, where,
+        INSTR_CREATE_br(drcontext, opnd_create_reg(scratch2)));
+
+    MINSERT(ilist, where, restore); /* the lean procedure branches back here */
+}
+#endif
+
 /* For each memory reference app instr, we insert inline code to fill the buffer
  * with an instruction entry and memory reference entries.
  */
@@ -292,24 +340,45 @@ event_app_instruction(void *drcontext, void *tag, instrlist_t *bb,
                       bool translating, void *user_data)
 {
     int i;
+    reg_id_t reg_ptr, reg_tmp;
+    IF_AARCH64(drvector_t allowed;)
 
     if (!instr_is_app(instr))
         return DR_EMIT_DEFAULT;
+
     if (!instr_reads_memory(instr) && !instr_writes_memory(instr))
         return DR_EMIT_DEFAULT;
 
+#if defined(AARCH64)
+    drreg_init_and_fill_vector(&allowed, false);
+    drreg_set_vector_entry(&allowed, DR_REG_X14, true);
+#endif
+    /* We need two scratch registers */
+    if (drreg_reserve_register(drcontext, bb, instr,
+                               IF_AARCH64_ELSE(&allowed, NULL), &reg_ptr) !=
+        DRREG_SUCCESS ||
+        drreg_reserve_register(drcontext, bb, instr, NULL, &reg_tmp) !=
+        DRREG_SUCCESS) {
+        IF_AARCH64(drvector_delete(&allowed));
+        DR_ASSERT(false); /* cannot recover */
+        return DR_EMIT_DEFAULT;
+    }
+    IF_AARCH64(drvector_delete(&allowed));
+
     /* insert code to add an entry for app instruction */
-    instrument_instr(drcontext, bb, instr);
+    instrument_instr(drcontext, bb, instr, reg_ptr, reg_tmp);
 
     /* insert code to add an entry for each memory reference opnd */
     for (i = 0; i < instr_num_srcs(instr); i++) {
         if (opnd_is_memory_reference(instr_get_src(instr, i)))
-            instrument_mem(drcontext, bb, instr, instr_get_src(instr, i), false);
+            instrument_mem(drcontext, bb, instr, instr_get_src(instr, i), false,
+                           reg_ptr, reg_tmp);
     }
 
     for (i = 0; i < instr_num_dsts(instr); i++) {
         if (opnd_is_memory_reference(instr_get_dst(instr, i)))
-            instrument_mem(drcontext, bb, instr, instr_get_dst(instr, i), true);
+            instrument_mem(drcontext, bb, instr, instr_get_dst(instr, i), true,
+                           reg_ptr, reg_tmp);
     }
 
     /* insert code to call clean_call for processing the buffer */
@@ -329,7 +398,17 @@ event_app_instruction(void *drcontext, void *tag, instrlist_t *bb,
          * forthcoming buffer filling API (i#513) will provide that.
          */
         IF_AARCHXX(&& !instr_is_exclusive_store(instr)))
+#if defined(AARCH64)
+        insert_lean_call(drcontext, bb, instr,
+                 instr_get_app_pc(instr), reg_ptr, reg_tmp);
+#else
         dr_insert_clean_call(drcontext, bb, instr, (void *)clean_call, false, 0);
+#endif
+
+    /* Restore scratch registers */
+    if (drreg_unreserve_register(drcontext, bb, instr, reg_ptr) != DRREG_SUCCESS ||
+        drreg_unreserve_register(drcontext, bb, instr, reg_tmp) != DRREG_SUCCESS)
+        DR_ASSERT(false);
 
     return DR_EMIT_DEFAULT;
 }
@@ -401,6 +480,7 @@ event_thread_exit(void *drcontext)
 static void
 event_exit(void)
 {
+    IF_AARCH64(code_cache_exit());
     dr_log(NULL, LOG_ALL, 1, "Client 'memtrace' num refs seen: "SZFMT"\n", num_refs);
     if (!dr_raw_tls_cfree(tls_offs, MEMTRACE_TLS_COUNT))
         DR_ASSERT(false);
@@ -425,6 +505,7 @@ dr_client_main(client_id_t id, int argc, const char *argv[])
     drreg_options_t ops = {sizeof(ops), 3, false};
     dr_set_client_name("DynamoRIO Sample Client 'memtrace'",
                        "http://dynamorio.org/issues");
+    IF_AARCH64(page_size = dr_page_size());
     if (!drmgr_init() || drreg_init(&ops) != DRREG_SUCCESS || !drutil_init())
         DR_ASSERT(false);
 
@@ -450,6 +531,7 @@ dr_client_main(client_id_t id, int argc, const char *argv[])
     if (!dr_raw_tls_calloc(&tls_seg, &tls_offs, MEMTRACE_TLS_COUNT, 0))
         DR_ASSERT(false);
 
+    IF_AARCH64(code_cache_init());
     /* make it easy to tell, by looking at log file, which client executed */
     dr_log(NULL, LOG_ALL, 1, "Client 'memtrace' initializing\n");
 }
diff --git a/core/arch/aarch64/codec.c b/core/arch/aarch64/codec.c
index ecdf6d251034e6016fde3e696c66d14732a3ebb6..e8c17331747abd3bbe2307e5e40a8a36bb1fc298 100644
--- a/core/arch/aarch64/codec.c
+++ b/core/arch/aarch64/codec.c
@@ -431,15 +431,21 @@ decode_opnd_adr_page(int scale, uint enc, byte *pc, OUT opnd_t *opnd)
 }
 
 static bool
-encode_opnd_adr_page(int scale, byte *pc, opnd_t opnd, OUT uint *enc_out)
+encode_opnd_adr_page(int scale, byte *pc, opnd_t opnd, OUT uint *enc_out,
+                     instr_t *instr)
 {
-    void *addr;
+    ptr_int_t offset;
     uint bits;
-    if (!opnd_is_rel_addr(opnd))
+    if (opnd_is_rel_addr(opnd))
+        offset = (ptr_int_t)opnd_get_addr(opnd) -
+             (ptr_int_t)((ptr_uint_t)pc >> scale << scale);
+    else if (opnd.kind == INSTR_kind)
+        offset = (ptr_int_t)
+            ((byte *)opnd_get_instr(opnd)->note - (byte *)instr->note);
+    else
         return false;
-    addr = opnd_get_addr(opnd);
-    if (!try_encode_int(&bits, 21, scale,
-                        (ptr_int_t)addr - (ptr_int_t)((ptr_uint_t)pc >> scale << scale)))
+
+    if (!try_encode_int(&bits, 21, scale, offset))
         return false;
     *enc_out = (bits & 3) << 29 | (bits & 0x1ffffc) << 3;
     return true;
@@ -846,9 +852,10 @@ decode_opnd_adr(uint enc, int opcode, byte *pc, OUT opnd_t *opnd)
 }
 
 static inline bool
-encode_opnd_adr(uint enc, int opcode, byte *pc, opnd_t opnd, OUT uint *enc_out)
+encode_opnd_adr(uint enc, int opcode, byte *pc, opnd_t opnd, OUT uint *enc_out,
+                instr_t *instr) /* instr: the instruction being encoded */
 {
-    return encode_opnd_adr_page(0, pc, opnd, enc_out);
+    return encode_opnd_adr_page(0, pc, opnd, enc_out, instr); /* scale 0 */
 }
 
 /* adrp: operand of ADRP */
@@ -860,9 +867,10 @@ decode_opnd_adrp(uint enc, int opcode, byte *pc, OUT opnd_t *opnd)
 }
 
 static inline bool
-encode_opnd_adrp(uint enc, int opcode, byte *pc, opnd_t opnd, OUT uint *enc_out)
+encode_opnd_adrp(uint enc, int opcode, byte *pc, opnd_t opnd, OUT uint *enc_out,
+                 instr_t *instr) /* instr: the instruction being encoded */
 {
-    return encode_opnd_adr_page(12, pc, opnd, enc_out);
+    return encode_opnd_adr_page(12, pc, opnd, enc_out, instr); /* scale 12 */
 }
 
 /* b0: B register at bit position 0 */
diff --git a/core/arch/aarch64/codec.py b/core/arch/aarch64/codec.py
index 3c3054bcf10939aecd0db2e786cd30fa6bd9a537..9f1d7e3e096143c4ae520c2019d08396f61b2dad 100755
--- a/core/arch/aarch64/codec.py
+++ b/core/arch/aarch64/codec.py
@@ -124,6 +124,14 @@ def generate_decoder(patterns, opndsgen, opndtypes):
     c.append('}')
     return '\n'.join(c) + '\n'
 
+
+def maybe_instr(opnd):
+    """Return ', instr' for operand types whose encoder also takes instr."""
+    takes_instr = opnd in ('adr', 'adrp')
+    suffix = ', instr' if takes_instr else ''
+    return suffix
+
+
 def generate_encoder(patterns, opndsgen, opndtypes):
     c = []
     for name in sorted(opndsgen):
@@ -144,11 +152,11 @@ def generate_encoder(patterns, opndsgen, opndtypes):
             tests = (['instr_num_dsts(instr) == %d && instr_num_srcs(instr) == %d' %
                       (len(dsts), len(srcs))] +
                      ['encode_opnd_%s(enc & 0x%08x, opcode, '
-                      'pc, instr_get_dst(instr, %d), &dst%d)' %
-                      (dsts[i], f | opndtypes[dsts[i]], i, i) for i in range(len(dsts))] +
+                      'pc, instr_get_dst(instr, %d), &dst%d%s)' %
+                      (dsts[i], f | opndtypes[dsts[i]], i, i, maybe_instr(dsts[i])) for i in range(len(dsts))] +
                      ['encode_opnd_%s(enc & 0x%08x, opcode, '
-                      'pc, instr_get_src(instr, %d), &src%d)' %
-                      (srcs[i], f | opndtypes[srcs[i]], i, i) for i in range(len(srcs))])
+                      'pc, instr_get_src(instr, %d), &src%d%s)' %
+                      (srcs[i], f | opndtypes[srcs[i]], i, i, maybe_instr(srcs[i])) for i in range(len(srcs))])
             tests2 = (['dst%d == (enc & 0x%08x)' % (i, opndtypes[dsts[i]])
                        for i in range(len(dsts))] +
                       ['src%d == (enc & 0x%08x)' % (i, opndtypes[srcs[i]])
diff --git a/core/arch/aarch64/codec.txt b/core/arch/aarch64/codec.txt
index 732914e3adffd24ac9f9681fa85c74a6d1f015b3..b17563572042da05760f65ee63428a997b301bda 100644
--- a/core/arch/aarch64/codec.txt
+++ b/core/arch/aarch64/codec.txt
@@ -143,6 +143,8 @@ x---------------------xxxxx-----  wx5sp      # W/X register or WSP/XSP
 x----------------xxxxx----------  wx10       # W/X register (or WZR/XZR)
 x----------xxxxx----------------  wx16       # W/X register (or WZR/XZR)
 
+# Note: The encoders for adr and adrp take the current instruction as argument
+#       in order to support calculating offsets for instruction operands.
 ################################################################################
 # Instruction patterns
 
diff --git a/core/arch/aarch64/encode_gen.h b/core/arch/aarch64/encode_gen.h
index c7dc2d65de352c72d096b72850e7faffcaf016cf..4a24f31d66be48ca05e67afd9a056ccf581e6164 100644
--- a/core/arch/aarch64/encode_gen.h
+++ b/core/arch/aarch64/encode_gen.h
@@ -2959,7 +2959,7 @@ encode_opndsgen_10000000(byte *pc, instr_t *instr, uint enc)
     uint dst0, src0;
     if (instr_num_dsts(instr) == 1 && instr_num_srcs(instr) == 1 &&
         encode_opnd_x0(enc & 0x9f00001f, opcode, pc, instr_get_dst(instr, 0), &dst0) &&
-        encode_opnd_adr(enc & 0xffffffe0, opcode, pc, instr_get_src(instr, 0), &src0)) {
+        encode_opnd_adr(enc & 0xffffffe0, opcode, pc, instr_get_src(instr, 0), &src0, instr)) {
         ASSERT((dst0 & 0xffffffe0) == 0);
         ASSERT((src0 & 0x9f00001f) == 0);
         enc |= dst0 | src0;
@@ -5077,7 +5077,7 @@ encode_opndsgen_90000000(byte *pc, instr_t *instr, uint enc)
     uint dst0, src0;
     if (instr_num_dsts(instr) == 1 && instr_num_srcs(instr) == 1 &&
         encode_opnd_x0(enc & 0x9f00001f, opcode, pc, instr_get_dst(instr, 0), &dst0) &&
-        encode_opnd_adrp(enc & 0xffffffe0, opcode, pc, instr_get_src(instr, 0), &src0)) {
+        encode_opnd_adrp(enc & 0xffffffe0, opcode, pc, instr_get_src(instr, 0), &src0, instr)) {
         ASSERT((dst0 & 0xffffffe0) == 0);
         ASSERT((src0 & 0x9f00001f) == 0);
         enc |= dst0 | src0;
diff --git a/core/arch/aarch64/instr_create.h b/core/arch/aarch64/instr_create.h
index 8a7fa192a9b99073d68950d6e494373b5dd3014d..5725fc8b174201fe1e18e38b09e6f38578220c8b 100644
--- a/core/arch/aarch64/instr_create.h
+++ b/core/arch/aarch64/instr_create.h
@@ -334,6 +334,8 @@
   instr_create_1dst_4src((dc), OP_sub, (rd), (rn), (rm_or_imm), (sht), (sha))
 #define INSTR_CREATE_svc(dc, imm) \
   instr_create_0dst_1src((dc), OP_svc, (imm))
+#define INSTR_CREATE_adr(dc, rt, imm) \
+  instr_create_1dst_1src((dc), OP_adr, (rt), (imm))
 
 /* FIXME i#1569: these two should perhaps not be provided */
 #define INSTR_CREATE_add_shimm(dc, rd, rn, rm_or_imm, sht, sha) \