From 4db16a9480af2c4f36eb8023193cd54545efbe54 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Tue, 11 Oct 2011 15:59:03 -0700 Subject: [PATCH] intel: Add .aub file output support. This will allow the driver to capture all of its execution state to a file for later debugging. intel_gpu_dump is limited in that it only captures batchbuffers, and Mesa's captures, while more complete, still capture only a portion of the state involved in execution. This is a squash commit of a long series of hacking as we tried to get the resulting traces to work in the internal simulator. It contains contributions by Yuanhan Liu and Kenneth Graunke. v2: Drop the MI_FLUSH_ENABLE setup. Reviewed-by: Kenneth Graunke Signed-off-by: Eric Anholt Signed-off-by: Yuanhan Liu Signed-off-by: Kenneth Graunke --- intel/Makefile.am | 1 + intel/intel_aub.h | 123 ++++++++++++++++ intel/intel_bufmgr.h | 14 ++ intel/intel_bufmgr_gem.c | 305 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 443 insertions(+) create mode 100644 intel/intel_aub.h diff --git a/intel/Makefile.am b/intel/Makefile.am index 06362b69..dc01a96d 100644 --- a/intel/Makefile.am +++ b/intel/Makefile.am @@ -53,6 +53,7 @@ intel_bufmgr_gem_o_CFLAGS = $(AM_CFLAGS) -c99 libdrm_intelincludedir = ${includedir}/libdrm libdrm_intelinclude_HEADERS = intel_bufmgr.h \ + intel_aub.h \ intel_debug.h # This may be interesting even outside of "make check", due to the -dump option. diff --git a/intel/intel_aub.h b/intel/intel_aub.h new file mode 100644 index 00000000..a36fd53a --- /dev/null +++ b/intel/intel_aub.h @@ -0,0 +1,123 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt + * + */ + +/** @file intel_aub.h + * + * The AUB file is a file format used by Intel's internal simulation + * and other validation tools. It can be used at various levels by a + * driver to input state to the simulated hardware or a replaying + * debugger. + * + * We choose to dump AUB files using the trace block format for ease + * of implementation -- dump out the blocks of memory as plain blobs + * and insert ring commands to execute the batchbuffer blob. + */ + +#ifndef _INTEL_AUB_H +#define _INTEL_AUB_H + +#define AUB_MI_NOOP (0) +#define AUB_MI_BATCH_BUFFER_START (0x31 << 23) +#define AUB_PIPE_CONTROL (0x7a000002) + +/* DW0: instruction type. */ + +#define CMD_AUB (7 << 29) + +#define CMD_AUB_HEADER (CMD_AUB | (1 << 23) | (0x05 << 16)) +/* DW1 */ +# define AUB_HEADER_MAJOR_SHIFT 24 +# define AUB_HEADER_MINOR_SHIFT 16 + +#define CMD_AUB_TRACE_HEADER_BLOCK (CMD_AUB | (1 << 23) | (0x41 << 16)) +#define CMD_AUB_DUMP_BMP (CMD_AUB | (1 << 23) | (0x9e << 16)) + +/* DW1 */ +#define AUB_TRACE_OPERATION_MASK 0x000000ff +#define AUB_TRACE_OP_COMMENT 0x00000000 +#define AUB_TRACE_OP_DATA_WRITE 0x00000001 +#define AUB_TRACE_OP_COMMAND_WRITE 0x00000002 +#define AUB_TRACE_OP_MMIO_WRITE 0x00000003 +// operation = TRACE_DATA_WRITE, Type +#define AUB_TRACE_TYPE_MASK 0x0000ff00 +#define AUB_TRACE_TYPE_NOTYPE (0 << 8) +#define AUB_TRACE_TYPE_BATCH (1 << 8) +#define AUB_TRACE_TYPE_VERTEX_BUFFER (5 << 8) +#define AUB_TRACE_TYPE_2D_MAP (6 << 8) +#define AUB_TRACE_TYPE_CUBE_MAP (7 << 8) +#define AUB_TRACE_TYPE_VOLUME_MAP (9 << 8) +#define AUB_TRACE_TYPE_1D_MAP (10 << 8) +#define AUB_TRACE_TYPE_CONSTANT_BUFFER (11 << 8) +#define AUB_TRACE_TYPE_CONSTANT_URB (12 << 8) +#define AUB_TRACE_TYPE_INDEX_BUFFER (13 << 8) +#define AUB_TRACE_TYPE_GENERAL (14 << 8) +#define AUB_TRACE_TYPE_SURFACE (15 << 8) + + +// operation = TRACE_COMMAND_WRITE, Type = +#define AUB_TRACE_TYPE_RING_HWB (1 << 8) +#define AUB_TRACE_TYPE_RING_PRB0 (2 << 8) +#define AUB_TRACE_TYPE_RING_PRB1 (3 << 8) +#define AUB_TRACE_TYPE_RING_PRB2 (4 << 8) + +// Address space +#define AUB_TRACE_ADDRESS_SPACE_MASK 0x00ff0000 +#define AUB_TRACE_MEMTYPE_GTT (0 << 16) +#define AUB_TRACE_MEMTYPE_LOCAL (1 << 16) +#define AUB_TRACE_MEMTYPE_NONLOCAL (2 << 16) +#define AUB_TRACE_MEMTYPE_PCI (3 << 16) +#define AUB_TRACE_MEMTYPE_GTT_ENTRY (4 << 16) + +/* DW2 */ +// operation = TRACE_DATA_WRITE, Type = TRACE_DATA_WRITE_GENERAL_STATE +#define AUB_TRACE_GENERAL_STATE_MASK 0x000000ff + +#define AUB_TRACE_VS_STATE 0x00000001 +#define AUB_TRACE_GS_STATE 0x00000002 +#define AUB_TRACE_CL_STATE 0x00000003 +#define AUB_TRACE_SF_STATE 0x00000004 +#define AUB_TRACE_WM_STATE 0x00000005 +#define AUB_TRACE_CC_STATE 0x00000006 +#define AUB_TRACE_CL_VP 0x00000007 +#define AUB_TRACE_SF_VP 0x00000008 +#define AUB_TRACE_CC_VP 0x00000009 +#define AUB_TRACE_SAMPLER_STATE 0x0000000a +#define AUB_TRACE_KERNEL 0x0000000b +#define AUB_TRACE_SCRATCH 0x0000000c +#define AUB_TRACE_SDC 0x0000000d +#define AUB_TRACE_BLEND_STATE 0x00000016 +#define AUB_TRACE_DEPTH_STENCIL_STATE 0x00000017 + +// operation = TRACE_DATA_WRITE, Type = TRACE_DATA_WRITE_SURFACE_STATE +#define AUB_TRACE_SURFACE_STATE_MASK 0x00000ff00 +#define AUB_TRACE_BINDING_TABLE 0x000000100 +#define AUB_TRACE_SURFACE_STATE 0x000000200 + +/* DW3: address */ +/* DW4: len */ + +#endif /* _INTEL_AUB_H */ diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h index 8036031c..fa6f2b8e 100644 --- a/intel/intel_bufmgr.h +++ b/intel/intel_bufmgr.h @@ -36,6 +36,7 @@ #include #include +#include struct drm_clip_rect; @@ -84,6 +85,13 @@ struct _drm_intel_bo { int handle; }; +enum aub_dump_bmp_format { + AUB_DUMP_BMP_FORMAT_8BIT = 1, + AUB_DUMP_BMP_FORMAT_ARGB_4444 = 4, + AUB_DUMP_BMP_FORMAT_ARGB_0888 = 6, + AUB_DUMP_BMP_FORMAT_ARGB_8888 = 7, +}; + #define BO_ALLOC_FOR_RENDER (1<<0) drm_intel_bo *drm_intel_bo_alloc(drm_intel_bufmgr *bufmgr, const char *name, @@ -154,6 +162,12 @@ int drm_intel_gem_bo_get_reloc_count(drm_intel_bo *bo); void drm_intel_gem_bo_clear_relocs(drm_intel_bo *bo, int start); void drm_intel_gem_bo_start_gtt_access(drm_intel_bo *bo, int write_enable); +void drm_intel_bufmgr_gem_set_aub_dump(drm_intel_bufmgr *bufmgr, int enable); +void drm_intel_gem_bo_aub_dump_bmp(drm_intel_bo *bo, + int x1, int y1, int width, int height, + enum aub_dump_bmp_format format, + int pitch, int offset); + int drm_intel_get_pipe_from_crtc_id(drm_intel_bufmgr *bufmgr, int crtc_id); int drm_intel_get_aperture_sizes(int fd, size_t *mappable, size_t *total); diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c index ba38e503..d56593a5 100644 --- a/intel/intel_bufmgr_gem.c +++ b/intel/intel_bufmgr_gem.c @@ -58,6 +58,7 @@ #include "intel_bufmgr.h" #include "intel_bufmgr_priv.h" #include "intel_chipset.h" +#include "intel_aub.h" #include "string.h" #include "i915_drm.h" @@ -121,6 +122,9 @@ typedef struct _drm_intel_bufmgr_gem { unsigned int bo_reuse : 1; unsigned int no_exec : 1; bool fenced_relocs; + + FILE *aub_file; + uint32_t aub_offset; } drm_intel_bufmgr_gem; #define DRM_INTEL_RELOC_FENCE (1<<0) @@ -215,6 +219,8 @@ struct _drm_intel_bo_gem { /** Flags that we may need to do the SW_FINSIH ioctl on unmap. */ bool mapped_cpu_write; + + uint32_t aub_offset; }; static unsigned int @@ -1715,6 +1721,247 @@ drm_intel_update_buffer_offsets2 (drm_intel_bufmgr_gem *bufmgr_gem) } } +static void +aub_out(drm_intel_bufmgr_gem *bufmgr_gem, uint32_t data) +{ + fwrite(&data, 1, 4, bufmgr_gem->aub_file); +} + +static void +aub_out_data(drm_intel_bufmgr_gem *bufmgr_gem, void *data, size_t size) +{ + fwrite(data, 1, size, bufmgr_gem->aub_file); +} + +static void +aub_write_bo_data(drm_intel_bo *bo, uint32_t offset, uint32_t size) +{ + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; + drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo; + uint32_t *data; + unsigned int i; + + data = malloc(bo->size); + drm_intel_bo_get_subdata(bo, offset, size, data); + + /* Easy mode: write out bo with no relocations */ + if (!bo_gem->reloc_count) { + aub_out_data(bufmgr_gem, data, size); + free(data); + return; + } + + /* Otherwise, handle the relocations while writing. */ + for (i = 0; i < size / 4; i++) { + int r; + for (r = 0; r < bo_gem->reloc_count; r++) { + struct drm_i915_gem_relocation_entry *reloc; + drm_intel_reloc_target *info; + + reloc = &bo_gem->relocs[r]; + info = &bo_gem->reloc_target_info[r]; + + if (reloc->offset == offset + i * 4) { + drm_intel_bo_gem *target_gem; + uint32_t val; + + target_gem = (drm_intel_bo_gem *)info->bo; + + val = reloc->delta; + val += target_gem->aub_offset; + + aub_out(bufmgr_gem, val); + data[i] = val; + break; + } + } + if (r == bo_gem->reloc_count) { + /* no relocation, just the data */ + aub_out(bufmgr_gem, data[i]); + } + } + + free(data); +} + +static void +aub_bo_get_address(drm_intel_bo *bo) +{ + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; + drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo; + + /* Give the object a graphics address in the AUB file. We + * don't just use the GEM object address because we do AUB + * dumping before execution -- we want to successfully log + * when the hardware might hang, and we might even want to aub + * capture for a driver trying to execute on a different + * generation of hardware by disabling the actual kernel exec + * call. + */ + bo_gem->aub_offset = bufmgr_gem->aub_offset; + bufmgr_gem->aub_offset += bo->size; + /* XXX: Handle aperture overflow. */ + assert(bufmgr_gem->aub_offset < 256 * 1024 * 1024); +} + +static void +aub_write_trace_block(drm_intel_bo *bo, uint32_t type, uint32_t subtype, + uint32_t offset, uint32_t size) +{ + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; + drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo; + + aub_out(bufmgr_gem, + CMD_AUB_TRACE_HEADER_BLOCK | + (5 - 2)); + aub_out(bufmgr_gem, + AUB_TRACE_MEMTYPE_GTT | type | AUB_TRACE_OP_DATA_WRITE); + aub_out(bufmgr_gem, subtype); + aub_out(bufmgr_gem, bo_gem->aub_offset + offset); + aub_out(bufmgr_gem, size); + aub_write_bo_data(bo, offset, size); +} + +static void +aub_write_bo(drm_intel_bo *bo) +{ + uint32_t block_size; + uint32_t offset; + + aub_bo_get_address(bo); + + /* Break up large objects into multiple writes. Otherwise a + * 128kb VBO would overflow the 16 bits of size field in the + * packet header and everything goes badly after that. + */ + for (offset = 0; offset < bo->size; offset += block_size) { + block_size = bo->size - offset; + + if (block_size > 8 * 4096) + block_size = 8 * 4096; + + aub_write_trace_block(bo, AUB_TRACE_TYPE_NOTYPE, 0, + offset, block_size); + } +} + +/* + * Make a ringbuffer on fly and dump it + */ +static void +aub_build_dump_ringbuffer(drm_intel_bufmgr_gem *bufmgr_gem, + uint32_t batch_buffer, int ring_flag) +{ + uint32_t ringbuffer[4096]; + int ring = AUB_TRACE_TYPE_RING_PRB0; /* The default ring */ + int ring_count = 0; + + if (ring_flag == I915_EXEC_BSD) + ring = AUB_TRACE_TYPE_RING_PRB1; + + /* Make a ring buffer to execute our batchbuffer. */ + memset(ringbuffer, 0, sizeof(ringbuffer)); + ringbuffer[ring_count++] = AUB_MI_BATCH_BUFFER_START; + ringbuffer[ring_count++] = batch_buffer; + + /* Write out the ring. This appears to trigger execution of + * the ring in the simulator. + */ + aub_out(bufmgr_gem, + CMD_AUB_TRACE_HEADER_BLOCK | + (5 - 2)); + aub_out(bufmgr_gem, + AUB_TRACE_MEMTYPE_GTT | ring | AUB_TRACE_OP_COMMAND_WRITE); + aub_out(bufmgr_gem, 0); /* general/surface subtype */ + aub_out(bufmgr_gem, bufmgr_gem->aub_offset); + aub_out(bufmgr_gem, ring_count * 4); + + /* FIXME: Need some flush operations here? */ + aub_out_data(bufmgr_gem, ringbuffer, ring_count * 4); + + /* Update offset pointer */ + bufmgr_gem->aub_offset += 4096; +} + +void +drm_intel_gem_bo_aub_dump_bmp(drm_intel_bo *bo, + int x1, int y1, int width, int height, + enum aub_dump_bmp_format format, + int pitch, int offset) +{ + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; + drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *)bo; + uint32_t cpp; + + switch (format) { + case AUB_DUMP_BMP_FORMAT_8BIT: + cpp = 1; + break; + case AUB_DUMP_BMP_FORMAT_ARGB_4444: + cpp = 2; + break; + case AUB_DUMP_BMP_FORMAT_ARGB_0888: + case AUB_DUMP_BMP_FORMAT_ARGB_8888: + cpp = 4; + break; + default: + printf("Unknown AUB dump format %d\n", format); + return; + } + + if (!bufmgr_gem->aub_file) + return; + + aub_out(bufmgr_gem, CMD_AUB_DUMP_BMP | 4); + aub_out(bufmgr_gem, (y1 << 16) | x1); + aub_out(bufmgr_gem, + (format << 24) | + (cpp << 19) | + pitch / 4); + aub_out(bufmgr_gem, (height << 16) | width); + aub_out(bufmgr_gem, bo_gem->aub_offset + offset); + aub_out(bufmgr_gem, + ((bo_gem->tiling_mode != I915_TILING_NONE) ? (1 << 2) : 0) | + ((bo_gem->tiling_mode == I915_TILING_Y) ? (1 << 3) : 0)); +} + +static void +aub_exec(drm_intel_bo *bo, int ring_flag, int used) +{ + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; + drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo; + int i; + + if (!bufmgr_gem->aub_file) + return; + + /* Write out all but the batchbuffer to AUB memory */ + for (i = 0; i < bufmgr_gem->exec_count - 1; i++) { + if (bufmgr_gem->exec_bos[i] != bo) + aub_write_bo(bufmgr_gem->exec_bos[i]); + } + + aub_bo_get_address(bo); + + /* Dump the batchbuffer. */ + aub_write_trace_block(bo, AUB_TRACE_TYPE_BATCH, 0, + 0, used); + aub_write_trace_block(bo, AUB_TRACE_TYPE_NOTYPE, 0, + used, bo->size - used); + + /* Dump ring buffer */ + aub_build_dump_ringbuffer(bufmgr_gem, bo_gem->aub_offset, ring_flag); + + fflush(bufmgr_gem->aub_file); + + /* + * One frame has been dumped. So reset the aub_offset for the next frame. + * + * FIXME: Can we do this? + */ + bufmgr_gem->aub_offset = 0x10000; +} + static int drm_intel_gem_bo_exec(drm_intel_bo *bo, int used, drm_clip_rect_t * cliprects, int num_cliprects, int DR4) @@ -1830,6 +2077,8 @@ drm_intel_gem_bo_mrb_exec2(drm_intel_bo *bo, int used, execbuf.rsvd1 = 0; execbuf.rsvd2 = 0; + aub_exec(bo, flags, used); + if (bufmgr_gem->no_exec) goto skip_execution; @@ -2359,6 +2608,62 @@ drm_intel_bufmgr_gem_get_devid(drm_intel_bufmgr *bufmgr) return bufmgr_gem->pci_device; } +/** + * Sets up AUB dumping. + * + * This is a trace file format that can be used with the simulator. + * Packets are emitted in a format somewhat like GPU command packets. + * You can set up a GTT and upload your objects into the referenced + * space, then send off batchbuffers and get BMPs out the other end. + */ +void +drm_intel_bufmgr_gem_set_aub_dump(drm_intel_bufmgr *bufmgr, int enable) +{ + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *)bufmgr; + int entry = 0x200003; + int i; + int gtt_size = 0x10000; + + if (!enable) { + if (bufmgr_gem->aub_file) { + fclose(bufmgr_gem->aub_file); + bufmgr_gem->aub_file = NULL; + } + } + + if (geteuid() != getuid()) + return; + + bufmgr_gem->aub_file = fopen("intel.aub", "w+"); + if (!bufmgr_gem->aub_file) + return; + + /* Start allocating objects from just after the GTT. */ + bufmgr_gem->aub_offset = gtt_size; + + /* Start with a (required) version packet. */ + aub_out(bufmgr_gem, CMD_AUB_HEADER | (13 - 2)); + aub_out(bufmgr_gem, + (4 << AUB_HEADER_MAJOR_SHIFT) | + (0 << AUB_HEADER_MINOR_SHIFT)); + for (i = 0; i < 8; i++) { + aub_out(bufmgr_gem, 0); /* app name */ + } + aub_out(bufmgr_gem, 0); /* timestamp */ + aub_out(bufmgr_gem, 0); /* timestamp */ + aub_out(bufmgr_gem, 0); /* comment len */ + + /* Set up the GTT. The max we can handle is 256M */ + aub_out(bufmgr_gem, CMD_AUB_TRACE_HEADER_BLOCK | (5 - 2)); + aub_out(bufmgr_gem, AUB_TRACE_MEMTYPE_NONLOCAL | 0 | AUB_TRACE_OP_DATA_WRITE); + aub_out(bufmgr_gem, 0); /* subtype */ + aub_out(bufmgr_gem, 0); /* offset */ + aub_out(bufmgr_gem, gtt_size); /* size */ + for (i = 0x000; i < gtt_size; i += 4, entry += 0x1000) { + aub_out(bufmgr_gem, entry); + } +} + /** * Initializes the GEM buffer manager, which uses the kernel to allocate, map, * and manage map buffer objections.