mirror of https://github.com/python/cpython.git
1265 lines
50 KiB
C
1265 lines
50 KiB
C
/*
|
|
* Python Perf Trampoline Support - JIT Dump Implementation
|
|
*
|
|
* This file implements the perf jitdump API for Python's performance profiling
|
|
* integration. It allows perf (Linux performance analysis tool) to understand
|
|
* and profile dynamically generated Python bytecode by creating JIT dump files
|
|
* that perf can inject into its analysis.
|
|
*
|
|
*
|
|
* IMPORTANT: This file exports specific callback functions that are part of
|
|
* Python's internal API. Do not modify the function signatures or behavior
|
|
* of exported functions without coordinating with the Python core team.
|
|
*
|
|
* Usually the binary and libraries are mapped in separate regions like below:
|
|
*
|
|
* address ->
|
|
* --+---------------------+--//--+---------------------+--
|
|
* | .text | .data | ... | | .text | .data | ... |
|
|
* --+---------------------+--//--+---------------------+--
|
|
* myprog libc.so
|
|
*
|
|
* So it'd be easy and straight-forward to find a mapped binary or library from an
|
|
* address.
|
|
*
|
|
* But for JIT code, the code arena only cares about the code section. But the
|
|
* resulting DSOs (which are generated by perf inject -j) contain ELF headers and
|
|
* unwind info too. Then it'd generate following address space with synthesized
|
|
* MMAP events. Let's say it has a sample between address B and C.
|
|
*
|
|
* sample
|
|
* |
|
|
* address -> A B v C
|
|
* ---------------------------------------------------------------------------------------------------
|
|
* /tmp/jitted-PID-0.so | (headers) | .text | unwind info |
|
|
* /tmp/jitted-PID-1.so | (headers) | .text | unwind info |
|
|
* /tmp/jitted-PID-2.so | (headers) | .text | unwind info |
|
|
* ...
|
|
* ---------------------------------------------------------------------------------------------------
|
|
*
|
|
* If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
|
|
* the unwind info. If it maps both .text section and unwind sections, the sample
|
|
* could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
|
|
* which one is right. So to make perf happy we have non-overlapping ranges for each
|
|
* DSO:
|
|
*
|
|
* address ->
|
|
* -------------------------------------------------------------------------------------------------------
|
|
* /tmp/jitted-PID-0.so | (headers) | .text | unwind info |
|
|
* /tmp/jitted-PID-1.so | (headers) | .text | unwind info |
|
|
* /tmp/jitted-PID-2.so | (headers) | .text | unwind info |
|
|
* ...
|
|
* -------------------------------------------------------------------------------------------------------
|
|
*
|
|
* As the trampolines are constant, we add a constant padding but in general the padding needs to have the
|
|
* size of the unwind info rounded to 16 bytes. In general, for our trampolines this is 0x50
|
|
*/
|
|
|
|
|
|
|
|
#include "Python.h"
|
|
#include "pycore_ceval.h" // _PyPerf_Callbacks
|
|
#include "pycore_frame.h"
|
|
#include "pycore_interp.h"
|
|
#include "pycore_runtime.h" // _PyRuntime
|
|
|
|
#ifdef PY_HAVE_PERF_TRAMPOLINE
|
|
|
|
/* Standard library includes for perf jitdump implementation */
|
|
#include <elf.h> // ELF architecture constants
|
|
#include <fcntl.h> // File control operations
|
|
#include <stdio.h> // Standard I/O operations
|
|
#include <stdlib.h> // Standard library functions
|
|
#include <sys/mman.h> // Memory mapping functions (mmap)
|
|
#include <sys/types.h> // System data types
|
|
#include <unistd.h> // System calls (sysconf, getpid)
|
|
#include <sys/time.h> // Time functions (gettimeofday)
|
|
#include <sys/syscall.h> // System call interface
|
|
|
|
// =============================================================================
|
|
// CONSTANTS AND CONFIGURATION
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Memory layout considerations for perf jitdump:
|
|
*
|
|
* Perf expects non-overlapping memory regions for each JIT-compiled function.
|
|
* When perf processes the jitdump file, it creates synthetic DSO (Dynamic
|
|
* Shared Object) files that contain:
|
|
* - ELF headers
|
|
* - .text section (actual machine code)
|
|
* - Unwind information (for stack traces)
|
|
*
|
|
* To ensure proper address space layout, we add padding between code regions.
|
|
* This prevents address conflicts when perf maps the synthesized DSOs.
|
|
*
|
|
* Memory layout example:
|
|
* /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
|
|
* /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding]
|
|
*
|
|
* The padding size (0x100) is chosen to accommodate typical unwind info sizes
|
|
* while maintaining 16-byte alignment requirements.
|
|
*/
|
|
#define PERF_JIT_CODE_PADDING 0x100  // 256 bytes of padding per code region
|
|
|
|
/* Convenient access to the global trampoline API state */
|
|
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api  // Shorthand for the field inside the global _PyRuntime
|
|
|
|
/* Type aliases for clarity and portability */
|
|
typedef uint64_t uword; // Unsigned 64-bit integer (fixed width, independent of the native word size)
|
|
typedef const char* CodeComments; // Pointer to read-only code comment text
|
|
|
|
/* Memory size constants */
|
|
#define MB (1024 * 1024) // 1 MiB (1,048,576 bytes), used for buffer sizing
|
|
|
|
// =============================================================================
|
|
// ARCHITECTURE-SPECIFIC DEFINITIONS
|
|
// =============================================================================
|
|
|
|
/*
 * Map the compile-time target architecture to its ELF machine constant
 * (the EM_* values from <elf.h>).
 *
 * The selection is done entirely by the preprocessor, so exactly one branch
 * is compiled in.  The result is stored in the `elf_mach_target` field of the
 * jitdump header so that perf knows which architecture the dumped code is for.
 *
 * Returns: the EM_* constant for the current target.  On a target that none
 *          of the branches recognise, the function hits Py_UNREACHABLE().
 */
static uint64_t GetElfMachineArchitecture(void) {
    uint64_t elf_machine;

#if defined(__x86_64__) || defined(_M_X64)
    elf_machine = EM_X86_64;
#elif defined(__i386__) || defined(_M_IX86)
    elf_machine = EM_386;
#elif defined(__aarch64__)
    elf_machine = EM_AARCH64;
#elif defined(__arm__) || defined(_M_ARM)
    elf_machine = EM_ARM;
#elif defined(__riscv)
    elf_machine = EM_RISCV;
#else
    Py_UNREACHABLE();  // No ELF machine constant is known for this target
    elf_machine = 0;
#endif

    return elf_machine;
}
|
|
|
|
// =============================================================================
|
|
// PERF JITDUMP DATA STRUCTURES
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Perf jitdump file format structures
|
|
*
|
|
* These structures define the binary format that perf expects for JIT dump files.
|
|
* The format is documented in the Linux perf tools source code and must match
|
|
* exactly for proper perf integration.
|
|
*/
|
|
|
|
/*
 * Jitdump file header.
 *
 * A single instance is written at the very beginning of each jitdump file
 * and carries the metadata describing the process and the format version.
 * The field order and widths define the on-disk layout: do not reorder.
 */
typedef struct {
    /* File magic: 0x4A695444, i.e. the characters "JiTD" */
    uint32_t magic;
    /* Jitdump format version (currently 1) */
    uint32_t version;
    /* Size in bytes of this header structure */
    uint32_t size;
    /* Target architecture, as returned by GetElfMachineArchitecture() */
    uint32_t elf_mach_target;
    /* Reserved; must be 0 */
    uint32_t reserved;
    /* Process ID of the JIT compiler */
    uint32_t process_id;
    /* Timestamp taken when the jitdump was created */
    uint64_t time_stamp;
    /* Feature flags (currently unused) */
    uint64_t flags;
} Header;
|
|
|
|
/*
 * Record types of the jitdump format.
 *
 * The numeric value is stored in the `event` field of struct BaseEvent;
 * each record type has its own payload layout.
 */
enum PerfEvent {
    /* A new JIT function has been loaded */
    PerfLoad = 0,
    /* A function has been relocated */
    PerfMove = 1,
    /* Debug information record */
    PerfDebugInfo = 2,
    /* The JIT session is being closed */
    PerfClose = 3,
    /* Stack unwinding information record */
    PerfUnwindingInfo = 4
};
|
|
|
|
/*
 * Common record header.
 *
 * Every record in the jitdump file begins with this structure; the
 * record-specific payload follows it.
 */
struct BaseEvent {
    /* Record type, one of the PerfEvent values */
    uint32_t event;
    /* Total size of the record in bytes, payload included */
    uint32_t size;
    /* Timestamp of when the event occurred */
    uint64_t time_stamp;
};
|
|
|
|
/*
|
|
* Code load event - indicates a new JIT-compiled function is available
|
|
* This is the most important event type for Python profiling
|
|
*/
|
|
typedef struct {
|
|
struct BaseEvent base; // Common event header
|
|
uint32_t process_id; // Process ID where code was generated
|
|
uint32_t thread_id; // Thread ID where code was generated
|
|
uint64_t vma; // Virtual memory address where code is loaded
|
|
uint64_t code_address; // Address of the actual machine code
|
|
uint64_t code_size; // Size of the machine code in bytes
|
|
uint64_t code_id; // Unique identifier for this code region
|
|
/* Followed by:
|
|
* - null-terminated function name string
|
|
* - raw machine code bytes
|
|
*/
|
|
} CodeLoadEvent;
|
|
|
|
/*
|
|
* Code unwinding information event - provides DWARF data for stack traces
|
|
* Essential for proper stack unwinding during profiling
|
|
*/
|
|
typedef struct {
|
|
struct BaseEvent base; // Common event header
|
|
uint64_t unwind_data_size; // Size of the unwinding data
|
|
uint64_t eh_frame_hdr_size; // Size of the EH frame header
|
|
uint64_t mapped_size; // Total mapped size (with padding)
|
|
/* Followed by:
|
|
* - EH frame header
|
|
* - DWARF unwinding information
|
|
* - Padding to alignment boundary
|
|
*/
|
|
} CodeUnwindingInfoEvent;
|
|
|
|
// =============================================================================
|
|
// GLOBAL STATE MANAGEMENT
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Global state for the perf jitdump implementation
|
|
*
|
|
* This structure maintains all the state needed for generating jitdump files.
|
|
* It's designed as a singleton since there's typically only one jitdump file
|
|
* per Python process.
|
|
*/
|
|
typedef struct {
|
|
FILE* perf_map; // File handle for the jitdump file
|
|
PyThread_type_lock map_lock; // Thread synchronization lock
|
|
void* mapped_buffer; // Memory-mapped region (signals perf we're active)
|
|
size_t mapped_size; // Size of the mapped region
|
|
int code_id; // Counter for unique code region identifiers
|
|
} PerfMapJitState;
|
|
|
|
/* Global singleton instance */
|
|
static PerfMapJitState perf_jit_map_state;  // File-scope singleton; zero-initialized because it has static storage duration
|
|
|
|
// =============================================================================
|
|
// TIME UTILITIES
|
|
// =============================================================================
|
|
|
|
/* Time conversion constant */
|
|
static const intptr_t nanoseconds_per_second = 1000000000;  // 10^9; scales tv_sec in get_current_monotonic_ticks()
|
|
|
|
/*
|
|
* Get current monotonic time in nanoseconds
|
|
*
|
|
* Monotonic time is preferred for event timestamps because it's not affected
|
|
* by system clock adjustments. This ensures consistent timing relationships
|
|
* between events even if the system clock is changed.
|
|
*
|
|
* Returns: Current monotonic time in nanoseconds since an arbitrary epoch
|
|
*/
|
|
static int64_t get_current_monotonic_ticks(void) {
|
|
struct timespec ts;
|
|
if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
|
|
Py_UNREACHABLE(); // Should never fail on supported systems
|
|
return 0;
|
|
}
|
|
|
|
/* Convert to nanoseconds for maximum precision */
|
|
int64_t result = ts.tv_sec;
|
|
result *= nanoseconds_per_second;
|
|
result += ts.tv_nsec;
|
|
return result;
|
|
}
|
|
|
|
/*
 * Read the wall clock with gettimeofday() and return it in microseconds.
 *
 * This value is used for the timestamp in the jitdump file header.  Being
 * wall clock time, it can be correlated with other system events.
 *
 * Returns: microseconds since the Unix epoch.  A failing gettimeofday() is
 *          treated as unreachable.
 */
static int64_t get_current_time_microseconds(void) {
    struct timeval now;

    if (gettimeofday(&now, NULL) < 0) {
        Py_UNREACHABLE();  // Should never fail on supported systems
        return 0;
    }

    const int64_t whole_seconds_us = (int64_t)(now.tv_sec) * 1000000;
    return whole_seconds_us + now.tv_usec;
}
|
|
|
|
// =============================================================================
|
|
// UTILITY FUNCTIONS
|
|
// =============================================================================
|
|
|
|
/*
 * Round `value` up to the next multiple of `multiple`.
 *
 * Used to satisfy the alignment requirements of the jitdump format, where
 * structures are padded to fixed boundaries (typically 8 or 16 bytes).
 *
 * Args:
 *   value:    The value to round up
 *   multiple: The granularity to round to; 0 means "no rounding"
 *
 * Returns: `value` unchanged when `multiple` is 0 or `value` is already a
 *          multiple; otherwise, for a non-negative `value` and a positive
 *          `multiple`, the smallest multiple of `multiple` above `value`.
 */
static size_t round_up(int64_t value, int64_t multiple) {
    /* Guard first: a zero granularity would divide by zero below */
    if (multiple == 0) {
        return value;
    }

    const int64_t excess = value % multiple;
    if (excess != 0) {
        /* Add the distance to the next boundary */
        value += multiple - excess;
    }

    return value;
}
|
|
|
|
// =============================================================================
|
|
// FILE I/O UTILITIES
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Write data to the jitdump file with error handling
|
|
*
|
|
* This function ensures that all data is written to the file, handling
|
|
* partial writes that can occur with large buffers or when the system
|
|
* is under load.
|
|
*
|
|
* Args:
|
|
* buffer: Pointer to data to write
|
|
* size: Number of bytes to write
|
|
*/
|
|
static void perf_map_jit_write_fully(const void* buffer, size_t size) {
|
|
FILE* out_file = perf_jit_map_state.perf_map;
|
|
const char* ptr = (const char*)(buffer);
|
|
|
|
while (size > 0) {
|
|
const size_t written = fwrite(ptr, 1, size, out_file);
|
|
if (written == 0) {
|
|
Py_UNREACHABLE(); // Write failure - should be very rare
|
|
break;
|
|
}
|
|
size -= written;
|
|
ptr += written;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Write the jitdump file header
|
|
*
|
|
* The header must be written exactly once at the beginning of each jitdump
|
|
* file. It provides metadata that perf uses to parse the rest of the file.
|
|
*
|
|
* Args:
|
|
* pid: Process ID to include in the header
|
|
* out_file: File handle to write to (currently unused, uses global state)
|
|
*/
|
|
static void perf_map_jit_write_header(int pid, FILE* out_file) {
|
|
Header header;
|
|
|
|
/* Initialize header with required values */
|
|
header.magic = 0x4A695444; // "JiTD" magic number
|
|
header.version = 1; // Current jitdump version
|
|
header.size = sizeof(Header); // Header size for validation
|
|
header.elf_mach_target = GetElfMachineArchitecture(); // Target architecture
|
|
header.process_id = pid; // Process identifier
|
|
header.time_stamp = get_current_time_microseconds(); // Creation time
|
|
header.flags = 0; // No special flags currently used
|
|
|
|
perf_map_jit_write_fully(&header, sizeof(header));
|
|
}
|
|
|
|
// =============================================================================
|
|
// DWARF CONSTANTS AND UTILITIES
|
|
// =============================================================================
|
|
|
|
/*
|
|
* DWARF (Debug With Arbitrary Record Formats) constants
|
|
*
|
|
* DWARF is a debugging data format used to provide stack unwinding information.
|
|
* These constants define the various encoding types and opcodes used in
|
|
* DWARF Call Frame Information (CFI) records.
|
|
*/
|
|
|
|
/* DWARF Call Frame Information version */
|
|
#define DWRF_CIE_VERSION 1  // Written as the version byte of the CIE by elf_init_ehframe()
|
|
|
|
/*
 * DWARF call frame instruction opcodes (DW_CFA_*).
 *
 * DWRF_CFA_advance_loc and DWRF_CFA_offset are used by OR-ing an operand
 * into the low bits (see elf_init_ehframe()); the remaining values are
 * complete opcode bytes.
 */
enum {
    /* No operation; also used as padding */
    DWRF_CFA_nop = 0x0,
    /* Extended offset instruction */
    DWRF_CFA_offset_extended = 0x5,
    /* Define the CFA rule (register and offset) */
    DWRF_CFA_def_cfa = 0xc,
    /* Redefine only the CFA offset */
    DWRF_CFA_def_cfa_offset = 0xe,
    /* Extended offset instruction, signed operand */
    DWRF_CFA_offset_extended_sf = 0x11,
    /* Advance the location counter */
    DWRF_CFA_advance_loc = 0x40,
    /* Simple offset instruction */
    DWRF_CFA_offset = 0x80
};
|
|
|
|
/*
 * DWARF exception-handling pointer encodings (DW_EH_PE_*).
 *
 * A data-format value may be combined with a reference-type value using a
 * bitwise OR, as elf_init_ehframe() does with
 * DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4.
 */
enum {
    /* Data formats */
    DWRF_EH_PE_absptr = 0x00,    /* Absolute pointer */
    DWRF_EH_PE_uleb128 = 0x01,   /* Unsigned LEB128 */
    DWRF_EH_PE_udata2 = 0x02,    /* Unsigned 2-byte */
    DWRF_EH_PE_udata4 = 0x03,    /* Unsigned 4-byte */
    DWRF_EH_PE_udata8 = 0x04,    /* Unsigned 8-byte */
    DWRF_EH_PE_signed = 0x08,    /* Signed flag */
    DWRF_EH_PE_sleb128 = 0x09,   /* Signed LEB128 */
    DWRF_EH_PE_sdata2 = 0x0a,    /* Signed 2-byte */
    DWRF_EH_PE_sdata4 = 0x0b,    /* Signed 4-byte */
    DWRF_EH_PE_sdata8 = 0x0c,    /* Signed 8-byte */

    /* Reference types */
    DWRF_EH_PE_pcrel = 0x10,     /* PC-relative */
    DWRF_EH_PE_textrel = 0x20,   /* Text-relative */
    DWRF_EH_PE_datarel = 0x30,   /* Data-relative */
    DWRF_EH_PE_funcrel = 0x40,   /* Function-relative */
    DWRF_EH_PE_aligned = 0x50,   /* Aligned */
    DWRF_EH_PE_indirect = 0x80,  /* Indirect */

    /* Marker for "no value present" */
    DWRF_EH_PE_omit = 0xff
};
|
|
|
|
/* Additional DWARF constants for debug information */

/* Tag of a compilation-unit entry */
enum {
    DWRF_TAG_compile_unit = 0x11
};

/* Whether an entry has children */
enum {
    DWRF_children_no = 0,
    DWRF_children_yes = 1
};

/* Attribute names */
enum {
    DWRF_AT_name = 0x03,       /* Name attribute */
    DWRF_AT_stmt_list = 0x10,  /* Statement list */
    DWRF_AT_low_pc = 0x11,     /* Low PC address */
    DWRF_AT_high_pc = 0x12     /* High PC address */
};

/* Attribute value forms */
enum {
    DWRF_FORM_addr = 0x01,     /* Address form */
    DWRF_FORM_data4 = 0x06,    /* 4-byte data */
    DWRF_FORM_string = 0x08    /* String form */
};

/* Line number program: standard opcodes */
enum {
    DWRF_LNS_extended_op = 0,  /* Extended opcode follows */
    DWRF_LNS_copy = 1,         /* Copy operation */
    DWRF_LNS_advance_pc = 2,   /* Advance program counter */
    DWRF_LNS_advance_line = 3  /* Advance line number */
};

/* Line number program: extended opcodes */
enum {
    DWRF_LNE_end_sequence = 1, /* End of sequence */
    DWRF_LNE_set_address = 2   /* Set address */
};
|
|
|
|
/*
 * DWARF register numbers for the supported architectures.
 *
 * DWARF identifies registers by number, and the numbering is fixed by each
 * architecture's ABI, so these values must match the ABI exactly for stack
 * unwinding to work.  Only x86_64 and little-endian LP64 AArch64 are
 * supported; any other target fails to compile.
 */
enum {
#ifdef __x86_64__
    /* x86_64: numbering follows the x86_64 ABI (note DX/CX precede BX) */
    DWRF_REG_AX = 0,   /* RAX */
    DWRF_REG_DX = 1,   /* RDX */
    DWRF_REG_CX = 2,   /* RCX */
    DWRF_REG_BX = 3,   /* RBX */
    DWRF_REG_SI = 4,   /* RSI */
    DWRF_REG_DI = 5,   /* RDI */
    DWRF_REG_BP = 6,   /* RBP */
    DWRF_REG_SP = 7,   /* RSP */
    DWRF_REG_8 = 8,    /* R8 */
    DWRF_REG_9 = 9,    /* R9 */
    DWRF_REG_10 = 10,  /* R10 */
    DWRF_REG_11 = 11,  /* R11 */
    DWRF_REG_12 = 12,  /* R12 */
    DWRF_REG_13 = 13,  /* R13 */
    DWRF_REG_14 = 14,  /* R14 */
    DWRF_REG_15 = 15,  /* R15 */
    DWRF_REG_RA = 16,  /* Return address (RIP) */
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
    /* AArch64: only the registers the unwind tables refer to */
    DWRF_REG_FP = 29,  /* Frame pointer (x29) */
    DWRF_REG_RA = 30,  /* Link register (x30), holds the return address */
    DWRF_REG_SP = 31,  /* Stack pointer */
#else
#    error "Unsupported target architecture"
#endif
};
|
|
|
|
/* DWARF encoding constants used in EH frame headers */
|
|
static const uint8_t DwarfUData4 = 0x03; // Unsigned 4-byte data (same value as DWRF_EH_PE_udata4)
|
|
static const uint8_t DwarfSData4 = 0x0b; // Signed 4-byte data (same value as DWRF_EH_PE_sdata4)
|
|
static const uint8_t DwarfPcRel = 0x10; // PC-relative encoding (same value as DWRF_EH_PE_pcrel)
|
|
static const uint8_t DwarfDataRel = 0x30; // Data-relative encoding (same value as DWRF_EH_PE_datarel)
|
|
|
|
// =============================================================================
|
|
// ELF OBJECT CONTEXT
|
|
// =============================================================================
|
|
|
|
/*
 * Write cursor used while building the DWARF unwind data.
 *
 * The emitters (elfctx_append_* and elf_init_ehframe()) write at `p` and
 * advance it; none of them checks for the end of the buffer, so the caller
 * has to provide enough room.
 */
typedef struct ELFObjectContext {
    /* Current write position in the buffer */
    uint8_t* p;
    /* Start of the buffer; offsets are computed relative to it */
    uint8_t* startp;
    /* Start of the EH frame data, used for relative offsets */
    uint8_t* eh_frame_p;
    /* Size of the code being described */
    uint32_t code_size;
} ELFObjectContext;
|
|
|
|
/*
 * EH frame header for DWARF unwinding.
 *
 * Holds the metadata describing the DWARF unwinding information that follows
 * it.  The perf jitdump format requires it so that stacks can be unwound
 * during profiling.  Four encoding bytes are followed by four 32-bit values.
 */
typedef struct {
    /* EH frame version (always 1) */
    unsigned char version;
    /* Encoding of the EH frame pointer */
    unsigned char eh_frame_ptr_enc;
    /* Encoding of the FDE count */
    unsigned char fde_count_enc;
    /* Encoding of the table entries */
    unsigned char table_enc;
    /* Pointer to the EH frame data */
    int32_t eh_frame_ptr;
    /* Number of FDEs (Frame Description Entries) */
    int32_t eh_fde_count;
    /* Start address of the code range */
    int32_t from;
    /* End address of the code range */
    int32_t to;
} EhFrameHeader;
|
|
|
|
// =============================================================================
|
|
// DWARF GENERATION UTILITIES
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Append a null-terminated string to the ELF context buffer
|
|
*
|
|
* Args:
|
|
* ctx: ELF object context
|
|
* str: String to append (must be null-terminated)
|
|
*
|
|
* Returns: Offset from start of buffer where string was written
|
|
*/
|
|
static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) {
|
|
uint8_t* p = ctx->p;
|
|
uint32_t ofs = (uint32_t)(p - ctx->startp);
|
|
|
|
/* Copy string including null terminator */
|
|
do {
|
|
*p++ = (uint8_t)*str;
|
|
} while (*str++);
|
|
|
|
ctx->p = p;
|
|
return ofs;
|
|
}
|
|
|
|
/*
|
|
* Append a SLEB128 (Signed Little Endian Base 128) value
|
|
*
|
|
* SLEB128 is a variable-length encoding used extensively in DWARF.
|
|
* It efficiently encodes small numbers in fewer bytes.
|
|
*
|
|
* Args:
|
|
* ctx: ELF object context
|
|
* v: Signed value to encode
|
|
*/
|
|
static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) {
|
|
uint8_t* p = ctx->p;
|
|
|
|
/* Encode 7 bits at a time, with continuation bit in MSB */
|
|
for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) {
|
|
*p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit
|
|
}
|
|
*p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit
|
|
|
|
ctx->p = p;
|
|
}
|
|
|
|
/*
|
|
* Append a ULEB128 (Unsigned Little Endian Base 128) value
|
|
*
|
|
* Similar to SLEB128 but for unsigned values.
|
|
*
|
|
* Args:
|
|
* ctx: ELF object context
|
|
* v: Unsigned value to encode
|
|
*/
|
|
static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) {
|
|
uint8_t* p = ctx->p;
|
|
|
|
/* Encode 7 bits at a time, with continuation bit in MSB */
|
|
for (; v >= 0x80; v >>= 7) {
|
|
*p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit
|
|
}
|
|
*p++ = (char)v; // Final byte without continuation bit
|
|
|
|
ctx->p = p;
|
|
}
|
|
|
|
/*
 * Emitter macros for building DWARF data
 *
 * All of them expect a local `uint8_t* p` in the expanding scope: it is the
 * write cursor, and each macro stores its value at `p` and advances `p` past
 * it.  DWRF_UV, DWRF_SV and DWRF_STR additionally expect a local
 * `ELFObjectContext* ctx`; they hand the cursor to the elfctx_append_*
 * helper through ctx->p and read it back afterwards.
 *
 * The multi-byte stores write through a casted pointer at whatever address
 * `p` currently holds.
 */

/* Fixed-width values */
#define DWRF_U8(x) (*p++ = (x))
#define DWRF_I8(x) (*(int8_t*)p = (x), p++)
#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2)
#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4)

/* Pointer-sized address */
#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t))

/* Variable-length values: ULEB128, SLEB128 and null-terminated string */
#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p)
#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p)
#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p)

/*
 * Pad with DWRF_CFA_nop bytes until the address in `p` is a multiple of `s`.
 * The mask arithmetic requires `s` to be a power of two.
 */
#define DWRF_ALIGNNOP(s)                \
    while ((uintptr_t)p & ((s)-1)) {    \
        *p++ = DWRF_CFA_nop;            \
    }

/*
 * Emit a length-prefixed section: reserve a 32-bit length field, run `stmt`
 * to emit the contents, then store the number of bytes `stmt` produced (the
 * length field itself is not counted) into the reserved field.  `name` only
 * serves to give the temporary pointer a unique identifier.
 */
#define DWRF_SECTION(name, stmt)                                    \
    {                                                               \
        uint32_t* szp_##name = (uint32_t*)p;                        \
        p += 4;                                                     \
        stmt;                                                       \
        *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4);   \
    }
|
|
|
|
// =============================================================================
|
|
// DWARF EH FRAME GENERATION
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Initialize DWARF .eh_frame section for a code region
|
|
*
|
|
* The .eh_frame section contains Call Frame Information (CFI) that describes
|
|
* how to unwind the stack at any point in the code. This is essential for
|
|
* proper profiling as it allows perf to generate accurate call graphs.
|
|
*
|
|
* The function generates two main components:
|
|
* 1. CIE (Common Information Entry) - describes calling conventions
|
|
* 2. FDE (Frame Description Entry) - describes specific function unwinding
|
|
*
|
|
* Args:
|
|
* ctx: ELF object context containing code size and buffer pointers
|
|
*/
|
|
static void elf_init_ehframe(ELFObjectContext* ctx) {
|
|
uint8_t* p = ctx->p;
|
|
uint8_t* framep = p; // Remember start of frame data
|
|
|
|
/*
|
|
* DWARF Unwind Table for Trampoline Function
|
|
*
|
|
* This section defines DWARF Call Frame Information (CFI) using encoded macros
|
|
* like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function
|
|
* preserves and restores registers. This is used by profiling tools (e.g., `perf`)
|
|
* and debuggers for stack unwinding in JIT-compiled code.
|
|
*
|
|
* -------------------------------------------------
|
|
* TO REGENERATE THIS TABLE FROM GCC OBJECTS:
|
|
* -------------------------------------------------
|
|
*
|
|
* 1. Create a trampoline source file (e.g., `trampoline.c`):
|
|
*
|
|
* #include <Python.h>
|
|
* typedef PyObject* (*py_evaluator)(void*, void*, int);
|
|
* PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) {
|
|
* return evaluator(ts, f, throwflag);
|
|
* }
|
|
*
|
|
* 2. Compile to an object file with frame pointer preservation:
|
|
*
|
|
* gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c
|
|
*
|
|
* 3. Extract DWARF unwind info from the object file:
|
|
*
|
|
* readelf -w trampoline.o
|
|
*
|
|
* Example output from `.eh_frame`:
|
|
*
|
|
* 00000000 CIE
|
|
* Version: 1
|
|
* Augmentation: "zR"
|
|
* Code alignment factor: 4
|
|
* Data alignment factor: -8
|
|
* Return address column: 30
|
|
* DW_CFA_def_cfa: r31 (sp) ofs 0
|
|
*
|
|
* 00000014 FDE cie=00000000 pc=0..14
|
|
* DW_CFA_advance_loc: 4
|
|
* DW_CFA_def_cfa_offset: 16
|
|
* DW_CFA_offset: r29 at cfa-16
|
|
* DW_CFA_offset: r30 at cfa-8
|
|
* DW_CFA_advance_loc: 12
|
|
* DW_CFA_restore: r30
|
|
* DW_CFA_restore: r29
|
|
* DW_CFA_def_cfa_offset: 0
|
|
*
|
|
* -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`.
|
|
*
|
|
* ----------------------------------
|
|
* HOW TO TRANSLATE TO DWRF_* MACROS:
|
|
* ----------------------------------
|
|
*
|
|
* After compiling your trampoline with:
|
|
*
|
|
* gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c
|
|
*
|
|
* run:
|
|
*
|
|
* readelf -w trampoline.o
|
|
*
|
|
* to inspect the generated `.eh_frame` data. You will see two main components:
|
|
*
|
|
* 1. A CIE (Common Information Entry): shared configuration used by all FDEs.
|
|
* 2. An FDE (Frame Description Entry): function-specific unwind instructions.
|
|
*
|
|
* ---------------------
|
|
* Translating the CIE:
|
|
* ---------------------
|
|
* From `readelf -w`, you might see:
|
|
*
|
|
* 00000000 0000000000000010 00000000 CIE
|
|
* Version: 1
|
|
* Augmentation: "zR"
|
|
* Code alignment factor: 4
|
|
* Data alignment factor: -8
|
|
* Return address column: 30
|
|
* Augmentation data: 1b
|
|
* DW_CFA_def_cfa: r31 (sp) ofs 0
|
|
*
|
|
* Map this to:
|
|
*
|
|
* DWRF_SECTION(CIE,
|
|
* DWRF_U32(0); // CIE ID (always 0 for CIEs)
|
|
* DWRF_U8(DWRF_CIE_VERSION); // Version: 1
|
|
* DWRF_STR("zR"); // Augmentation string "zR"
|
|
* DWRF_UV(4); // Code alignment factor = 4
|
|
* DWRF_SV(-8); // Data alignment factor = -8
|
|
* DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30)
|
|
* DWRF_UV(1); // Augmentation data length = 1
|
|
* DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers
|
|
*
|
|
* DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa
|
|
* DWRF_UV(DWRF_REG_SP); // Register: SP (r31)
|
|
* DWRF_UV(0); // Offset = 0
|
|
*
|
|
* DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary
|
|
* )
|
|
*
|
|
* Notes:
|
|
* - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128.
|
|
* - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants.
|
|
*
|
|
* ---------------------
|
|
* Translating the FDE:
|
|
* ---------------------
|
|
* From `readelf -w`:
|
|
*
|
|
* 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014
|
|
* DW_CFA_advance_loc: 4
|
|
* DW_CFA_def_cfa_offset: 16
|
|
* DW_CFA_offset: r29 at cfa-16
|
|
* DW_CFA_offset: r30 at cfa-8
|
|
* DW_CFA_advance_loc: 12
|
|
* DW_CFA_restore: r30
|
|
* DW_CFA_restore: r29
|
|
* DW_CFA_def_cfa_offset: 0
|
|
*
|
|
* Map the FDE header and instructions to:
|
|
*
|
|
* DWRF_SECTION(FDE,
|
|
* DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here)
|
|
* DWRF_U32(-0x30); // Initial PC-relative location of the code
|
|
* DWRF_U32(ctx->code_size); // Code range covered by this FDE
|
|
* DWRF_U8(0); // Augmentation data length (none)
|
|
*
|
|
* DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes)
|
|
* DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16
|
|
* DWRF_UV(16);
|
|
*
|
|
* DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer)
|
|
* DWRF_UV(2); // At offset 2 * 8 = 16 bytes
|
|
*
|
|
* DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address)
|
|
* DWRF_UV(1); // At offset 1 * 8 = 8 bytes
|
|
*
|
|
* DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes)
|
|
*
|
|
* DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30
|
|
* DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29
|
|
*
|
|
* DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP
|
|
* DWRF_UV(0);
|
|
* )
|
|
*
|
|
* To regenerate:
|
|
* 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE.
|
|
* 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as
|
|
* the code is in a different address space every time.
|
|
* 3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro:
|
|
* - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value)
|
|
* - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset)
|
|
* - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset
|
|
* - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor))
|
|
* 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers.
|
|
* 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment.
|
|
*/
|
|
|
|
/*
|
|
* Emit DWARF EH CIE (Common Information Entry)
|
|
*
|
|
* The CIE describes the calling conventions and basic unwinding rules
|
|
* that apply to all functions in this compilation unit.
|
|
*/
|
|
DWRF_SECTION(CIE,
|
|
DWRF_U32(0); // CIE ID (0 indicates this is a CIE)
|
|
DWRF_U8(DWRF_CIE_VERSION); // CIE version (1)
|
|
DWRF_STR("zR"); // Augmentation string ("zR" = has LSDA)
|
|
DWRF_UV(1); // Code alignment factor
|
|
DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative)
|
|
DWRF_U8(DWRF_REG_RA); // Return address register number
|
|
DWRF_UV(1); // Augmentation data length
|
|
DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding
|
|
|
|
/* Initial CFI instructions - describe default calling convention */
|
|
DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address)
|
|
DWRF_UV(DWRF_REG_SP); // CFA = SP register
|
|
DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size
|
|
DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved
|
|
DWRF_UV(1); // At offset 1 from CFA
|
|
|
|
DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary
|
|
)
|
|
|
|
ctx->eh_frame_p = p; // Remember start of FDE data
|
|
|
|
/*
|
|
* Emit DWARF EH FDE (Frame Description Entry)
|
|
*
|
|
* The FDE describes unwinding information specific to this function.
|
|
* It references the CIE and provides function-specific CFI instructions.
|
|
*/
|
|
DWRF_SECTION(FDE,
|
|
DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference)
|
|
DWRF_U32(-0x30); // Machine code offset relative to .text
|
|
DWRF_U32(ctx->code_size); // Address range covered by this FDE (code lenght)
|
|
DWRF_U8(0); // Augmentation data length (none)
|
|
|
|
/*
|
|
* Architecture-specific CFI instructions
|
|
*
|
|
* These instructions describe how registers are saved and restored
|
|
* during function calls. Each architecture has different calling
|
|
* conventions and register usage patterns.
|
|
*/
|
|
#ifdef __x86_64__
|
|
/* x86_64 calling convention unwinding rules */
|
|
# if defined(__CET__) && (__CET__ & 1)
|
|
DWRF_U8(DWRF_CFA_advance_loc | 8); // Advance location by 8 bytes when CET protection is enabled
|
|
# else
|
|
DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance location by 4 bytes
|
|
# endif
|
|
DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset
|
|
DWRF_UV(16); // New offset: SP + 16
|
|
DWRF_U8(DWRF_CFA_advance_loc | 6); // Advance location by 6 bytes
|
|
DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset
|
|
DWRF_UV(8); // New offset: SP + 8
|
|
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
|
|
/* AArch64 calling convention unwinding rules */
|
|
DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 instruction (stp x29, x30)
|
|
DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset
|
|
DWRF_UV(16); // CFA = SP + 16 (stack pointer after push)
|
|
DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Frame pointer (x29) saved
|
|
DWRF_UV(2); // At offset 2 from CFA (2 * 8 = 16 bytes)
|
|
DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Link register (x30) saved
|
|
DWRF_UV(1); // At offset 1 from CFA (1 * 8 = 8 bytes)
|
|
DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (mov x16, x3; mov x29, sp; ldp...)
|
|
DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore frame pointer (x29)
|
|
DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore link register (x30)
|
|
DWRF_U8(DWRF_CFA_def_cfa_offset); // Final CFA adjustment
|
|
DWRF_UV(0); // CFA = SP + 0 (stack restored)
|
|
|
|
#else
|
|
# error "Unsupported target architecture"
|
|
#endif
|
|
|
|
DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary
|
|
)
|
|
|
|
ctx->p = p; // Update context pointer to end of generated data
|
|
}
|
|
|
|
// =============================================================================
|
|
// JITDUMP INITIALIZATION
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Initialize the perf jitdump interface
|
|
*
|
|
* This function sets up everything needed to generate jitdump files:
|
|
* 1. Creates the jitdump file with a unique name
|
|
* 2. Maps the first page to signal perf that we're using the interface
|
|
* 3. Writes the jitdump header
|
|
* 4. Initializes synchronization primitives
|
|
*
|
|
* The memory mapping is crucial - perf detects jitdump files by scanning
|
|
* for processes that have mapped files matching the pattern /tmp/jit-*.dump
|
|
*
|
|
* Returns: Pointer to initialized state, or NULL on failure
|
|
*/
|
|
static void* perf_map_jit_init(void) {
|
|
char filename[100];
|
|
int pid = getpid();
|
|
|
|
/* Create unique filename based on process ID */
|
|
snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid);
|
|
|
|
/* Create/open the jitdump file with appropriate permissions */
|
|
const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666);
|
|
if (fd == -1) {
|
|
return NULL; // Failed to create file
|
|
}
|
|
|
|
/* Get system page size for memory mapping */
|
|
const long page_size = sysconf(_SC_PAGESIZE);
|
|
if (page_size == -1) {
|
|
close(fd);
|
|
return NULL; // Failed to get page size
|
|
}
|
|
|
|
/*
|
|
* Map the first page of the jitdump file
|
|
*
|
|
* This memory mapping serves as a signal to perf that this process
|
|
* is generating JIT code. Perf scans /proc/.../maps looking for mapped
|
|
* files that match the jitdump naming pattern.
|
|
*
|
|
* The mapping must be PROT_READ | PROT_EXEC to be detected by perf.
|
|
*/
|
|
perf_jit_map_state.mapped_buffer = mmap(
|
|
NULL, // Let kernel choose address
|
|
page_size, // Map one page
|
|
PROT_READ | PROT_EXEC, // Read and execute permissions (required by perf)
|
|
MAP_PRIVATE, // Private mapping
|
|
fd, // File descriptor
|
|
0 // Offset 0 (first page)
|
|
);
|
|
|
|
if (perf_jit_map_state.mapped_buffer == NULL) {
|
|
close(fd);
|
|
return NULL; // Memory mapping failed
|
|
}
|
|
|
|
perf_jit_map_state.mapped_size = page_size;
|
|
|
|
/* Convert file descriptor to FILE* for easier I/O operations */
|
|
perf_jit_map_state.perf_map = fdopen(fd, "w+");
|
|
if (perf_jit_map_state.perf_map == NULL) {
|
|
close(fd);
|
|
return NULL; // Failed to create FILE*
|
|
}
|
|
|
|
/*
|
|
* Set up file buffering for better performance
|
|
*
|
|
* We use a large buffer (2MB) because jitdump files can be written
|
|
* frequently during program execution. Buffering reduces system call
|
|
* overhead and improves overall performance.
|
|
*/
|
|
setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB);
|
|
|
|
/* Write the jitdump file header */
|
|
perf_map_jit_write_header(pid, perf_jit_map_state.perf_map);
|
|
|
|
/*
|
|
* Initialize thread synchronization lock
|
|
*
|
|
* Multiple threads may attempt to write to the jitdump file
|
|
* simultaneously. This lock ensures thread-safe access to the
|
|
* global jitdump state.
|
|
*/
|
|
perf_jit_map_state.map_lock = PyThread_allocate_lock();
|
|
if (perf_jit_map_state.map_lock == NULL) {
|
|
fclose(perf_jit_map_state.perf_map);
|
|
return NULL; // Failed to create lock
|
|
}
|
|
|
|
/* Initialize code ID counter */
|
|
perf_jit_map_state.code_id = 0;
|
|
|
|
/* Configure trampoline API with padding information */
|
|
trampoline_api.code_padding = PERF_JIT_CODE_PADDING;
|
|
|
|
return &perf_jit_map_state;
|
|
}
|
|
|
|
// =============================================================================
|
|
// MAIN JITDUMP ENTRY WRITING
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Write a complete jitdump entry for a Python function
|
|
*
|
|
* This is the main function called by Python's trampoline system whenever
|
|
* a new piece of JIT-compiled code needs to be recorded. It writes both
|
|
* the unwinding information and the code load event to the jitdump file.
|
|
*
|
|
* The function performs these steps:
|
|
* 1. Initialize jitdump system if not already done
|
|
* 2. Extract function name and filename from Python code object
|
|
* 3. Generate DWARF unwinding information
|
|
* 4. Write unwinding info event to jitdump file
|
|
* 5. Write code load event to jitdump file
|
|
*
|
|
* Args:
|
|
* state: Jitdump state (currently unused, uses global state)
|
|
* code_addr: Address where the compiled code resides
|
|
* code_size: Size of the compiled code in bytes
|
|
* co: Python code object containing metadata
|
|
*
|
|
* IMPORTANT: This function signature is part of Python's internal API
|
|
* and must not be changed without coordinating with core Python development.
|
|
*/
|
|
static void perf_map_jit_write_entry(void *state, const void *code_addr,
|
|
unsigned int code_size, PyCodeObject *co)
|
|
{
|
|
/* Initialize jitdump system on first use */
|
|
if (perf_jit_map_state.perf_map == NULL) {
|
|
void* ret = perf_map_jit_init();
|
|
if(ret == NULL){
|
|
return; // Initialization failed, silently abort
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Extract function information from Python code object
|
|
*
|
|
* We create a human-readable function name by combining the qualified
|
|
* name (includes class/module context) with the filename. This helps
|
|
* developers identify functions in perf reports.
|
|
*/
|
|
const char *entry = "";
|
|
if (co->co_qualname != NULL) {
|
|
entry = PyUnicode_AsUTF8(co->co_qualname);
|
|
}
|
|
|
|
const char *filename = "";
|
|
if (co->co_filename != NULL) {
|
|
filename = PyUnicode_AsUTF8(co->co_filename);
|
|
}
|
|
|
|
/*
|
|
* Create formatted function name for perf display
|
|
*
|
|
* Format: "py::<function_name>:<filename>"
|
|
* The "py::" prefix helps identify Python functions in mixed-language
|
|
* profiles (e.g., when profiling C extensions alongside Python code).
|
|
*/
|
|
size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1;
|
|
char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
|
|
if (perf_map_entry == NULL) {
|
|
return; // Memory allocation failed
|
|
}
|
|
snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);
|
|
|
|
const size_t name_length = strlen(perf_map_entry);
|
|
uword base = (uword)code_addr;
|
|
uword size = code_size;
|
|
|
|
/*
|
|
* Generate DWARF unwinding information
|
|
*
|
|
* DWARF data is essential for proper stack unwinding during profiling.
|
|
* Without it, perf cannot generate accurate call graphs, especially
|
|
* in optimized code where frame pointers may be omitted.
|
|
*/
|
|
ELFObjectContext ctx;
|
|
char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient)
|
|
ctx.code_size = code_size;
|
|
ctx.startp = ctx.p = (uint8_t*)buffer;
|
|
|
|
/* Generate EH frame (Exception Handling frame) data */
|
|
elf_init_ehframe(&ctx);
|
|
int eh_frame_size = ctx.p - ctx.startp;
|
|
|
|
/*
|
|
* Write Code Unwinding Information Event
|
|
*
|
|
* This event must be written before the code load event to ensure
|
|
* perf has the unwinding information available when it processes
|
|
* the code region.
|
|
*/
|
|
CodeUnwindingInfoEvent ev2;
|
|
ev2.base.event = PerfUnwindingInfo;
|
|
ev2.base.time_stamp = get_current_monotonic_ticks();
|
|
ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
|
|
|
|
/* Verify we don't exceed our padding budget */
|
|
assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING);
|
|
|
|
ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
|
|
ev2.mapped_size = round_up(ev2.unwind_data_size, 16); // 16-byte alignment
|
|
|
|
/* Calculate total event size with padding */
|
|
int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size;
|
|
int padding_size = round_up(content_size, 8) - content_size; // 8-byte align
|
|
ev2.base.size = content_size + padding_size;
|
|
|
|
/* Write the unwinding info event header */
|
|
perf_map_jit_write_fully(&ev2, sizeof(ev2));
|
|
|
|
/*
|
|
* Write EH Frame Header
|
|
*
|
|
* The EH frame header provides metadata about the DWARF unwinding
|
|
* information that follows. It includes pointers and counts that
|
|
* help perf navigate the unwinding data efficiently.
|
|
*/
|
|
EhFrameHeader f;
|
|
f.version = 1;
|
|
f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; // PC-relative signed 4-byte
|
|
f.fde_count_enc = DwarfUData4; // Unsigned 4-byte count
|
|
f.table_enc = DwarfSData4 | DwarfDataRel; // Data-relative signed 4-byte
|
|
|
|
/* Calculate relative offsets for EH frame navigation */
|
|
f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char));
|
|
f.eh_fde_count = 1; // We generate exactly one FDE per function
|
|
f.from = -(round_up(code_size, 8) + eh_frame_size);
|
|
|
|
int cie_size = ctx.eh_frame_p - ctx.startp;
|
|
f.to = -(eh_frame_size - cie_size);
|
|
|
|
/* Write EH frame data and header */
|
|
perf_map_jit_write_fully(ctx.startp, eh_frame_size);
|
|
perf_map_jit_write_fully(&f, sizeof(f));
|
|
|
|
/* Write padding to maintain alignment */
|
|
char padding_bytes[] = "\0\0\0\0\0\0\0\0";
|
|
perf_map_jit_write_fully(&padding_bytes, padding_size);
|
|
|
|
/*
|
|
* Write Code Load Event
|
|
*
|
|
* This event tells perf about the new code region. It includes:
|
|
* - Memory addresses and sizes
|
|
* - Process and thread identification
|
|
* - Function name for symbol resolution
|
|
* - The actual machine code bytes
|
|
*/
|
|
CodeLoadEvent ev;
|
|
ev.base.event = PerfLoad;
|
|
ev.base.size = sizeof(ev) + (name_length+1) + size;
|
|
ev.base.time_stamp = get_current_monotonic_ticks();
|
|
ev.process_id = getpid();
|
|
ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call
|
|
ev.vma = base; // Virtual memory address
|
|
ev.code_address = base; // Same as VMA for our use case
|
|
ev.code_size = size;
|
|
|
|
/* Assign unique code ID and increment counter */
|
|
perf_jit_map_state.code_id += 1;
|
|
ev.code_id = perf_jit_map_state.code_id;
|
|
|
|
/* Write code load event and associated data */
|
|
perf_map_jit_write_fully(&ev, sizeof(ev));
|
|
perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator
|
|
perf_map_jit_write_fully((void*)(base), size); // Copy actual machine code
|
|
|
|
/* Clean up allocated memory */
|
|
PyMem_RawFree(perf_map_entry);
|
|
}
|
|
|
|
// =============================================================================
|
|
// CLEANUP AND FINALIZATION
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Finalize and cleanup the perf jitdump system
|
|
*
|
|
* This function is called when Python is shutting down or when the
|
|
* perf trampoline system is being disabled. It ensures all resources
|
|
* are properly released and all buffered data is flushed to disk.
|
|
*
|
|
* Args:
|
|
* state: Jitdump state (currently unused, uses global state)
|
|
*
|
|
* Returns: 0 on success
|
|
*
|
|
* IMPORTANT: This function signature is part of Python's internal API
|
|
* and must not be changed without coordinating with core Python development.
|
|
*/
|
|
static int perf_map_jit_fini(void* state) {
|
|
/*
|
|
* Close jitdump file with proper synchronization
|
|
*
|
|
* We need to acquire the lock to ensure no other threads are
|
|
* writing to the file when we close it. This prevents corruption
|
|
* and ensures all data is properly flushed.
|
|
*/
|
|
if (perf_jit_map_state.perf_map != NULL) {
|
|
PyThread_acquire_lock(perf_jit_map_state.map_lock, 1);
|
|
fclose(perf_jit_map_state.perf_map); // This also flushes buffers
|
|
PyThread_release_lock(perf_jit_map_state.map_lock);
|
|
|
|
/* Clean up synchronization primitive */
|
|
PyThread_free_lock(perf_jit_map_state.map_lock);
|
|
perf_jit_map_state.perf_map = NULL;
|
|
}
|
|
|
|
/*
|
|
* Unmap the memory region
|
|
*
|
|
* This removes the signal to perf that we were generating JIT code.
|
|
* After this point, perf will no longer detect this process as
|
|
* having JIT capabilities.
|
|
*/
|
|
if (perf_jit_map_state.mapped_buffer != NULL) {
|
|
munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size);
|
|
perf_jit_map_state.mapped_buffer = NULL;
|
|
}
|
|
|
|
/* Clear global state reference */
|
|
trampoline_api.state = NULL;
|
|
|
|
return 0; // Success
|
|
}
|
|
|
|
// =============================================================================
|
|
// PUBLIC API EXPORT
|
|
// =============================================================================
|
|
|
|
/*
|
|
* Python Perf Callbacks Structure
|
|
*
|
|
* This structure defines the callback interface that Python's trampoline
|
|
* system uses to integrate with perf profiling. It contains function
|
|
* pointers for initialization, event writing, and cleanup.
|
|
*
|
|
* CRITICAL: This structure and its contents are part of Python's internal
|
|
* API. The function signatures and behavior must remain stable to maintain
|
|
* compatibility with the Python interpreter's perf integration system.
|
|
*
|
|
* Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h
|
|
*/
|
|
_PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
|
|
&perf_map_jit_init, // Initialization function
|
|
&perf_map_jit_write_entry, // Event writing function
|
|
&perf_map_jit_fini, // Cleanup function
|
|
};
|
|
|
|
#endif /* PY_HAVE_PERF_TRAMPOLINE */ |