diff --git a/api/samples/memtrace_simple.c b/api/samples/memtrace_simple.c index f1b2a5de1e5a861f6e4872ff9a9f95cce479a12b..8f950bd801b8dacb6c89a507a1e46fc0762ec626 100644 --- a/api/samples/memtrace_simple.c +++ b/api/samples/memtrace_simple.c @@ -37,7 +37,11 @@ * Collects the memory reference information and dumps it to a file as text. * * (1) It fills a per-thread-buffer with inlined instrumentation. - * (2) It calls a clean call to dump the buffer into a file. + * (2) It calls a clean call to dump the buffer into a file. On AArch64, clean + * calls add too many additional instructions. To reduce the number of + * instructions added to each instrumented basic block, the clean call + * is placed in a separate code cache page and jumps to that code page + * are inserted instead of clean calls. * * The profile consists of list of <type, size, addr> entries representing * - mem ref instr: e.g., { type = 42 (call), size = 5, addr = 0x7f59c2d002d3 } @@ -108,6 +112,10 @@ enum { static reg_id_t tls_seg; static uint tls_offs; static int tls_idx; +#if defined(AARCH64) +static size_t page_size; +static app_pc code_cache; +#endif #define TLS_SLOT(tls_base, enum_val) (void **)((byte *)(tls_base)+tls_offs+(enum_val)) #define BUF_PTR(tls_base) *(mem_ref_t **)TLS_SLOT(tls_base, MEMTRACE_TLS_OFFS_BUF_PTR) @@ -147,6 +155,42 @@ clean_call(void) memtrace(drcontext); } +#if defined(AARCH64) +static void +code_cache_init(void) +{ + void *drcontext; + instrlist_t *ilist; + instr_t *where; + byte *end; + + drcontext = dr_get_current_drcontext(); + code_cache = dr_nonheap_alloc(page_size, + DR_MEMPROT_READ | + DR_MEMPROT_WRITE | + DR_MEMPROT_EXEC); + ilist = instrlist_create(drcontext); + /* The lean procedure simply performs a clean call, and then jumps back */ + /* jump back to the DR's code cache */ + where = INSTR_CREATE_br(drcontext, opnd_create_reg(DR_REG_X14)); + instrlist_meta_append(ilist, where); + /* clean call */ + dr_insert_clean_call(drcontext, ilist, where, (void 
*)clean_call, false, 0); + /* Encodes the instructions into memory and then cleans up. */ + end = instrlist_encode(drcontext, ilist, code_cache, false); + DR_ASSERT((size_t)(end - code_cache) < page_size); + instrlist_clear_and_destroy(drcontext, ilist); + /* set the memory as just +rx now */ + dr_memory_protect(code_cache, page_size, DR_MEMPROT_READ | DR_MEMPROT_EXEC); +} + +static void +code_cache_exit(void) +{ + dr_nonheap_free(code_cache, page_size); +} +#endif + static void insert_load_buf_ptr(void *drcontext, instrlist_t *ilist, instr_t *where, reg_id_t reg_ptr) @@ -231,17 +275,9 @@ insert_save_addr(void *drcontext, instrlist_t *ilist, instr_t *where, /* insert inline code to add an instruction entry into the buffer */ static void -instrument_instr(void *drcontext, instrlist_t *ilist, instr_t *where) +instrument_instr(void *drcontext, instrlist_t *ilist, instr_t *where, + reg_id_t reg_ptr, reg_id_t reg_tmp) { - /* We need two scratch registers */ - reg_id_t reg_ptr, reg_tmp; - if (drreg_reserve_register(drcontext, ilist, where, NULL, &reg_ptr) != - DRREG_SUCCESS || - drreg_reserve_register(drcontext, ilist, where, NULL, &reg_tmp) != - DRREG_SUCCESS) { - DR_ASSERT(false); /* cannot recover */ - return; - } insert_load_buf_ptr(drcontext, ilist, where, reg_ptr); insert_save_type(drcontext, ilist, where, reg_ptr, reg_tmp, (ushort)instr_get_opcode(where)); @@ -250,26 +286,13 @@ instrument_instr(void *drcontext, instrlist_t *ilist, instr_t *where) insert_save_pc(drcontext, ilist, where, reg_ptr, reg_tmp, instr_get_app_pc(where)); insert_update_buf_ptr(drcontext, ilist, where, reg_ptr, sizeof(mem_ref_t)); - /* Restore scratch registers */ - if (drreg_unreserve_register(drcontext, ilist, where, reg_ptr) != DRREG_SUCCESS || - drreg_unreserve_register(drcontext, ilist, where, reg_tmp) != DRREG_SUCCESS) - DR_ASSERT(false); } /* insert inline code to add a memory reference info entry into the buffer */ static void instrument_mem(void *drcontext, instrlist_t *ilist, instr_t 
*where, - opnd_t ref, bool write) + opnd_t ref, bool write, reg_id_t reg_ptr, reg_id_t reg_tmp) { - /* We need two scratch registers */ - reg_id_t reg_ptr, reg_tmp; - if (drreg_reserve_register(drcontext, ilist, where, NULL, &reg_ptr) != - DRREG_SUCCESS || - drreg_reserve_register(drcontext, ilist, where, NULL, &reg_tmp) != - DRREG_SUCCESS) { - DR_ASSERT(false); /* cannot recover */ - return; - } /* save_addr should be called first as reg_ptr or reg_tmp maybe used in ref */ insert_save_addr(drcontext, ilist, where, ref, reg_ptr, reg_tmp); insert_save_type(drcontext, ilist, where, reg_ptr, reg_tmp, @@ -277,12 +300,37 @@ instrument_mem(void *drcontext, instrlist_t *ilist, instr_t *where, insert_save_size(drcontext, ilist, where, reg_ptr, reg_tmp, (ushort)drutil_opnd_mem_size_in_bytes(ref, where)); insert_update_buf_ptr(drcontext, ilist, where, reg_ptr, sizeof(mem_ref_t)); - /* Restore scratch registers */ - if (drreg_unreserve_register(drcontext, ilist, where, reg_ptr) != DRREG_SUCCESS || - drreg_unreserve_register(drcontext, ilist, where, reg_tmp) != DRREG_SUCCESS) - DR_ASSERT(false); } +#if defined(AARCH64) +static void +insert_lean_call(void *drcontext, instrlist_t *ilist, instr_t *where, + app_pc pc, reg_id_t scratch1, reg_id_t scratch2) +{ + /* We jump to lean procedure which performs full context switch and + * clean call invocation. This is to reduce the code cache size. 
+ */ + DR_ASSERT(scratch1 == DR_REG_X14); + + instr_t *restore = INSTR_CREATE_label(drcontext); + + /* this is the return address for jumping back from lean procedure */ + MINSERT(ilist, where, + INSTR_CREATE_adr(drcontext, + opnd_create_reg(DR_REG_X14), + opnd_create_instr(restore))); + + /* Jump to clean call in code cache */ + instrlist_insert_mov_immed_ptrsz(drcontext, (ptr_int_t)code_cache, + opnd_create_reg(scratch2), + ilist, where, NULL, NULL); + MINSERT(ilist, where, + INSTR_CREATE_br(drcontext, opnd_create_reg(scratch2))); + + MINSERT(ilist, where, restore); +} +#endif + /* For each memory reference app instr, we insert inline code to fill the buffer * with an instruction entry and memory reference entries. */ @@ -292,24 +340,45 @@ event_app_instruction(void *drcontext, void *tag, instrlist_t *bb, bool translating, void *user_data) { int i; + reg_id_t reg_ptr, reg_tmp; + IF_AARCH64(drvector_t allowed;) if (!instr_is_app(instr)) return DR_EMIT_DEFAULT; + if (!instr_reads_memory(instr) && !instr_writes_memory(instr)) return DR_EMIT_DEFAULT; +#if defined(AARCH64) + drreg_init_and_fill_vector(&allowed, false); + drreg_set_vector_entry(&allowed, DR_REG_X14, true); +#endif + /* We need two scratch registers */ + if (drreg_reserve_register(drcontext, bb, instr, + IF_AARCH64_ELSE(&allowed, NULL), &reg_ptr) != + DRREG_SUCCESS || + drreg_reserve_register(drcontext, bb, instr, NULL, &reg_tmp) != + DRREG_SUCCESS) { + IF_AARCH64(drvector_delete(&allowed)); + DR_ASSERT(false); /* cannot recover */ + return DR_EMIT_DEFAULT; + } + IF_AARCH64(drvector_delete(&allowed)); + /* insert code to add an entry for app instruction */ - instrument_instr(drcontext, bb, instr); + instrument_instr(drcontext, bb, instr, reg_ptr, reg_tmp); /* insert code to add an entry for each memory reference opnd */ for (i = 0; i < instr_num_srcs(instr); i++) { if (opnd_is_memory_reference(instr_get_src(instr, i))) - instrument_mem(drcontext, bb, instr, instr_get_src(instr, i), false); + 
instrument_mem(drcontext, bb, instr, instr_get_src(instr, i), false, + reg_ptr, reg_tmp); } for (i = 0; i < instr_num_dsts(instr); i++) { if (opnd_is_memory_reference(instr_get_dst(instr, i))) - instrument_mem(drcontext, bb, instr, instr_get_dst(instr, i), true); + instrument_mem(drcontext, bb, instr, instr_get_dst(instr, i), true, + reg_ptr, reg_tmp); } /* insert code to call clean_call for processing the buffer */ @@ -329,7 +398,17 @@ event_app_instruction(void *drcontext, void *tag, instrlist_t *bb, * forthcoming buffer filling API (i#513) will provide that. */ IF_AARCHXX(&& !instr_is_exclusive_store(instr))) +#if defined(AARCH64) + insert_lean_call(drcontext, bb, instr, + instr_get_app_pc(instr), reg_ptr, reg_tmp); +#else dr_insert_clean_call(drcontext, bb, instr, (void *)clean_call, false, 0); +#endif + + /* Restore scratch registers */ + if (drreg_unreserve_register(drcontext, bb, instr, reg_ptr) != DRREG_SUCCESS || + drreg_unreserve_register(drcontext, bb, instr, reg_tmp) != DRREG_SUCCESS) + DR_ASSERT(false); return DR_EMIT_DEFAULT; } @@ -401,6 +480,7 @@ event_thread_exit(void *drcontext) static void event_exit(void) { + IF_AARCH64(code_cache_exit()); dr_log(NULL, LOG_ALL, 1, "Client 'memtrace' num refs seen: "SZFMT"\n", num_refs); if (!dr_raw_tls_cfree(tls_offs, MEMTRACE_TLS_COUNT)) DR_ASSERT(false); @@ -425,6 +505,7 @@ dr_client_main(client_id_t id, int argc, const char *argv[]) drreg_options_t ops = {sizeof(ops), 3, false}; dr_set_client_name("DynamoRIO Sample Client 'memtrace'", "http://dynamorio.org/issues"); + IF_AARCH64(page_size = dr_page_size()); if (!drmgr_init() || drreg_init(&ops) != DRREG_SUCCESS || !drutil_init()) DR_ASSERT(false); @@ -450,6 +531,7 @@ dr_client_main(client_id_t id, int argc, const char *argv[]) if (!dr_raw_tls_calloc(&tls_seg, &tls_offs, MEMTRACE_TLS_COUNT, 0)) DR_ASSERT(false); + IF_AARCH64(code_cache_init()); /* make it easy to tell, by looking at log file, which client executed */ dr_log(NULL, LOG_ALL, 1, "Client 'memtrace' 
initializing\n"); } diff --git a/core/arch/aarch64/codec.c b/core/arch/aarch64/codec.c index ecdf6d251034e6016fde3e696c66d14732a3ebb6..e8c17331747abd3bbe2307e5e40a8a36bb1fc298 100644 --- a/core/arch/aarch64/codec.c +++ b/core/arch/aarch64/codec.c @@ -431,15 +431,21 @@ decode_opnd_adr_page(int scale, uint enc, byte *pc, OUT opnd_t *opnd) } static bool -encode_opnd_adr_page(int scale, byte *pc, opnd_t opnd, OUT uint *enc_out) +encode_opnd_adr_page(int scale, byte *pc, opnd_t opnd, OUT uint *enc_out, + instr_t *instr) { - void *addr; + ptr_int_t offset; uint bits; - if (!opnd_is_rel_addr(opnd)) + if (opnd_is_rel_addr(opnd)) + offset = (ptr_int_t)opnd_get_addr(opnd) - + (ptr_int_t)((ptr_uint_t)pc >> scale << scale); + else if (opnd.kind == INSTR_kind) + offset = (ptr_int_t) + ((byte *)opnd_get_instr(opnd)->note - (byte *)instr->note); + else return false; - addr = opnd_get_addr(opnd); - if (!try_encode_int(&bits, 21, scale, - (ptr_int_t)addr - (ptr_int_t)((ptr_uint_t)pc >> scale << scale))) + + if (!try_encode_int(&bits, 21, scale, offset)) return false; *enc_out = (bits & 3) << 29 | (bits & 0x1ffffc) << 3; return true; @@ -846,9 +852,10 @@ decode_opnd_adr(uint enc, int opcode, byte *pc, OUT opnd_t *opnd) } static inline bool -encode_opnd_adr(uint enc, int opcode, byte *pc, opnd_t opnd, OUT uint *enc_out) +encode_opnd_adr(uint enc, int opcode, byte *pc, opnd_t opnd, OUT uint *enc_out, + instr_t *instr) { - return encode_opnd_adr_page(0, pc, opnd, enc_out); + return encode_opnd_adr_page(0, pc, opnd, enc_out, instr); } /* adrp: operand of ADRP */ @@ -860,9 +867,10 @@ decode_opnd_adrp(uint enc, int opcode, byte *pc, OUT opnd_t *opnd) } static inline bool -encode_opnd_adrp(uint enc, int opcode, byte *pc, opnd_t opnd, OUT uint *enc_out) +encode_opnd_adrp(uint enc, int opcode, byte *pc, opnd_t opnd, OUT uint *enc_out, + instr_t *instr) { - return encode_opnd_adr_page(12, pc, opnd, enc_out); + return encode_opnd_adr_page(12, pc, opnd, enc_out, instr); } /* b0: B register at 
bit position 0 */ diff --git a/core/arch/aarch64/codec.py b/core/arch/aarch64/codec.py index 3c3054bcf10939aecd0db2e786cd30fa6bd9a537..9f1d7e3e096143c4ae520c2019d08396f61b2dad 100755 --- a/core/arch/aarch64/codec.py +++ b/core/arch/aarch64/codec.py @@ -124,6 +124,14 @@ def generate_decoder(patterns, opndsgen, opndtypes): c.append('}') return '\n'.join(c) + '\n' + +def maybe_instr(opnd): + if opnd in ('adr', 'adrp'): + return ', instr' + else: + return '' + + def generate_encoder(patterns, opndsgen, opndtypes): c = [] for name in sorted(opndsgen): @@ -144,11 +152,11 @@ def generate_encoder(patterns, opndsgen, opndtypes): tests = (['instr_num_dsts(instr) == %d && instr_num_srcs(instr) == %d' % (len(dsts), len(srcs))] + ['encode_opnd_%s(enc & 0x%08x, opcode, ' - 'pc, instr_get_dst(instr, %d), &dst%d)' % - (dsts[i], f | opndtypes[dsts[i]], i, i) for i in range(len(dsts))] + + 'pc, instr_get_dst(instr, %d), &dst%d%s)' % + (dsts[i], f | opndtypes[dsts[i]], i, i, maybe_instr(dsts[i])) for i in range(len(dsts))] + ['encode_opnd_%s(enc & 0x%08x, opcode, ' - 'pc, instr_get_src(instr, %d), &src%d)' % - (srcs[i], f | opndtypes[srcs[i]], i, i) for i in range(len(srcs))]) + 'pc, instr_get_src(instr, %d), &src%d%s)' % + (srcs[i], f | opndtypes[srcs[i]], i, i, maybe_instr(srcs[i])) for i in range(len(srcs))]) tests2 = (['dst%d == (enc & 0x%08x)' % (i, opndtypes[dsts[i]]) for i in range(len(dsts))] + ['src%d == (enc & 0x%08x)' % (i, opndtypes[srcs[i]]) diff --git a/core/arch/aarch64/codec.txt b/core/arch/aarch64/codec.txt index 732914e3adffd24ac9f9681fa85c74a6d1f015b3..b17563572042da05760f65ee63428a997b301bda 100644 --- a/core/arch/aarch64/codec.txt +++ b/core/arch/aarch64/codec.txt @@ -143,6 +143,8 @@ x---------------------xxxxx----- wx5sp # W/X register or WSP/XSP x----------------xxxxx---------- wx10 # W/X register (or WZR/XZR) x----------xxxxx---------------- wx16 # W/X register (or WZR/XZR) +# Note: The encoders for adr and adrp take the current instruction as argument +# in 
order to support calculating offsets for instruction operands. ################################################################################ # Instruction patterns diff --git a/core/arch/aarch64/encode_gen.h b/core/arch/aarch64/encode_gen.h index c7dc2d65de352c72d096b72850e7faffcaf016cf..4a24f31d66be48ca05e67afd9a056ccf581e6164 100644 --- a/core/arch/aarch64/encode_gen.h +++ b/core/arch/aarch64/encode_gen.h @@ -2959,7 +2959,7 @@ encode_opndsgen_10000000(byte *pc, instr_t *instr, uint enc) uint dst0, src0; if (instr_num_dsts(instr) == 1 && instr_num_srcs(instr) == 1 && encode_opnd_x0(enc & 0x9f00001f, opcode, pc, instr_get_dst(instr, 0), &dst0) && - encode_opnd_adr(enc & 0xffffffe0, opcode, pc, instr_get_src(instr, 0), &src0)) { + encode_opnd_adr(enc & 0xffffffe0, opcode, pc, instr_get_src(instr, 0), &src0, instr)) { ASSERT((dst0 & 0xffffffe0) == 0); ASSERT((src0 & 0x9f00001f) == 0); enc |= dst0 | src0; @@ -5077,7 +5077,7 @@ encode_opndsgen_90000000(byte *pc, instr_t *instr, uint enc) uint dst0, src0; if (instr_num_dsts(instr) == 1 && instr_num_srcs(instr) == 1 && encode_opnd_x0(enc & 0x9f00001f, opcode, pc, instr_get_dst(instr, 0), &dst0) && - encode_opnd_adrp(enc & 0xffffffe0, opcode, pc, instr_get_src(instr, 0), &src0)) { + encode_opnd_adrp(enc & 0xffffffe0, opcode, pc, instr_get_src(instr, 0), &src0, instr)) { ASSERT((dst0 & 0xffffffe0) == 0); ASSERT((src0 & 0x9f00001f) == 0); enc |= dst0 | src0; diff --git a/core/arch/aarch64/instr_create.h b/core/arch/aarch64/instr_create.h index 8a7fa192a9b99073d68950d6e494373b5dd3014d..5725fc8b174201fe1e18e38b09e6f38578220c8b 100644 --- a/core/arch/aarch64/instr_create.h +++ b/core/arch/aarch64/instr_create.h @@ -334,6 +334,8 @@ instr_create_1dst_4src((dc), OP_sub, (rd), (rn), (rm_or_imm), (sht), (sha)) #define INSTR_CREATE_svc(dc, imm) \ instr_create_0dst_1src((dc), OP_svc, (imm)) +#define INSTR_CREATE_adr(dc, rt, imm) \ + instr_create_1dst_1src(dc, OP_adr, rt, imm) /* FIXME i#1569: these two should perhaps not be 
provided */ #define INSTR_CREATE_add_shimm(dc, rd, rn, rm_or_imm, sht, sha) \