diff --git a/include/drm-uapi/v3d_drm.h b/include/drm-uapi/v3d_drm.h index 3dfc0af8756..1a7d7a689de 100644 --- a/include/drm-uapi/v3d_drm.h +++ b/include/drm-uapi/v3d_drm.h @@ -319,6 +319,11 @@ struct drm_v3d_submit_tfu { /* Pointer to an array of ioctl extensions*/ __u64 extensions; + + struct { + __u32 ioc; + __u32 pad; + } v71; }; /* Submits a compute shader for dispatch. This job will block on any diff --git a/src/broadcom/cle/meson.build b/src/broadcom/cle/meson.build index 31a0d5bfa94..8ac32b313e4 100644 --- a/src/broadcom/cle/meson.build +++ b/src/broadcom/cle/meson.build @@ -23,7 +23,8 @@ v3d_versions = [ [21, 21], [33, 33], [41, 33], - [42, 33] + [42, 33], + [71, 33] ] v3d_xml_files = [] diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml index a0242b5f1c2..624353ca2bf 100644 --- a/src/broadcom/cle/v3d_packet_v33.xml +++ b/src/broadcom/cle/v3d_packet_v33.xml @@ -1,4 +1,4 @@ - + @@ -167,13 +167,36 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1099,7 +1263,7 @@ - + @@ -1108,6 +1272,15 @@ + + + + + + + + + @@ -1117,7 +1290,7 @@ - + @@ -1126,6 +1299,19 @@ + + + + + + + + + + + + + @@ -1135,7 +1321,7 @@ - + @@ -1144,6 +1330,13 @@ + + + + + + + @@ -1155,7 +1348,7 @@ - + @@ -1166,6 +1359,13 @@ + + + + + + + @@ -1240,7 +1440,7 @@ - + @@ -1299,6 +1499,63 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1543,7 +1800,7 @@ - + @@ -1558,6 +1815,23 @@ + + + + + + + + + + + + + + + + + @@ -1611,7 +1885,7 @@ - + @@ -1652,6 +1926,82 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/broadcom/cle/v3dx_pack.h b/src/broadcom/cle/v3dx_pack.h index 5762e5aaa70..e5a1eb26698 100644 --- a/src/broadcom/cle/v3dx_pack.h +++ b/src/broadcom/cle/v3dx_pack.h @@ -37,6 +37,8 @@ # include "cle/v3d_packet_v41_pack.h" #elif (V3D_VERSION == 42) # include "cle/v3d_packet_v42_pack.h" +#elif (V3D_VERSION == 71) +# include "cle/v3d_packet_v71_pack.h" #else # error "Need to add a pack header include for this v3d version" #endif diff --git a/src/broadcom/clif/clif_private.h b/src/broadcom/clif/clif_private.h index 6ace62b0310..cda407a00bf 100644 --- a/src/broadcom/clif/clif_private.h +++ b/src/broadcom/clif/clif_private.h @@ -101,6 +101,8 @@ bool v3d41_clif_dump_packet(struct clif_dump *clif, uint32_t offset, const uint8_t *cl, uint32_t *size, bool reloc_mode); bool v3d42_clif_dump_packet(struct clif_dump *clif, uint32_t offset, const uint8_t *cl, uint32_t *size, bool reloc_mode); +bool v3d71_clif_dump_packet(struct clif_dump *clif, uint32_t offset, + const uint8_t *cl, uint32_t *size, bool reloc_mode); static inline void out(struct clif_dump *clif, const char *fmt, ...) 
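Not part of the patch: a minimal sketch of how user space might assemble the new v71.ioc word of drm_v3d_submit_tfu from the V3D71_TFU_IOC_* definitions added later in this series. The helper name and its num_mipmaps/output_stride parameters are hypothetical; only the macros and the v71.ioc field come from the diff.

static void
fill_tfu_ioc_v71_example(struct drm_v3d_submit_tfu *tfu,
                         uint32_t num_mipmaps, uint32_t output_stride)
{
        /* Output as LINEARTILE, keep the level 0 write enabled
         * (setting V3D71_TFU_IOC_DIMTW would disable it), and place the
         * mip count and output stride in their respective fields.
         */
        tfu->v71.ioc =
                (V3D71_TFU_IOC_FORMAT_LINEARTILE << V3D71_TFU_IOC_FORMAT_SHIFT) |
                (num_mipmaps << V3D71_TFU_IOC_NUMMM_SHIFT) |
                (output_stride << V3D71_TFU_IOC_STRIDE_SHIFT);
}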
diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c index 272190eb2e5..7bc2b662cfc 100644 --- a/src/broadcom/common/v3d_device_info.c +++ b/src/broadcom/common/v3d_device_info.c @@ -36,6 +36,9 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i struct drm_v3d_get_param ident1 = { .param = DRM_V3D_PARAM_V3D_CORE0_IDENT1, }; + struct drm_v3d_get_param hub_ident3 = { + .param = DRM_V3D_PARAM_V3D_HUB_IDENT3, + }; int ret; ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0); @@ -62,10 +65,13 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i int qups = (ident1.value >> 8) & 0xf; devinfo->qpu_count = nslc * qups; + devinfo->has_accumulators = devinfo->ver < 71; + switch (devinfo->ver) { case 33: case 41: case 42: + case 71: break; default: fprintf(stderr, @@ -75,5 +81,14 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i return false; } - return true; + ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &hub_ident3); + if (ret != 0) { + fprintf(stderr, "Couldn't get V3D core HUB IDENT3: %s\n", + strerror(errno)); + return false; + } + + devinfo->rev = (hub_ident3.value >> 8) & 0xff; + + return true; } diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h index 97abd9b8d9f..8dfc7858727 100644 --- a/src/broadcom/common/v3d_device_info.h +++ b/src/broadcom/common/v3d_device_info.h @@ -34,11 +34,17 @@ struct v3d_device_info { /** Simple V3D version: major * 10 + minor */ uint8_t ver; + /** V3D revision number */ + uint8_t rev; + /** Size of the VPM, in bytes. */ int vpm_size; /* NSLC * QUPS from the core's IDENT registers. */ int qpu_count; + + /* If the hw has accumulator registers */ + bool has_accumulators; }; typedef int (*v3d_ioctl_fun)(int fd, unsigned long request, void *arg); diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h index 46f38bd7484..354c8784914 100644 --- a/src/broadcom/common/v3d_limits.h +++ b/src/broadcom/common/v3d_limits.h @@ -42,7 +42,8 @@ #define V3D_MAX_SAMPLES 4 -#define V3D_MAX_DRAW_BUFFERS 4 +#define V3D_MAX_DRAW_BUFFERS 8 +#define V3D_MAX_RENDER_TARGETS(ver) (ver < 71 ? 
4 : 8) #define V3D_MAX_POINT_SIZE 512.0f #define V3D_MAX_LINE_WIDTH 32 diff --git a/src/broadcom/common/v3d_macros.h b/src/broadcom/common/v3d_macros.h index fe89398208a..b4291fb5350 100644 --- a/src/broadcom/common/v3d_macros.h +++ b/src/broadcom/common/v3d_macros.h @@ -41,6 +41,9 @@ #elif (V3D_VERSION == 42) # define V3DX(x) V3D42_##x # define v3dX(x) v3d42_##x +#elif (V3D_VERSION == 71) +# define V3DX(x) V3D71_##x +# define v3dX(x) v3d71_##x #else # error "Need to add prefixing macros for this v3d version" #endif diff --git a/src/broadcom/common/v3d_performance_counters.h b/src/broadcom/common/v3d_performance_counters.h index 08d750c2cbe..a8f0cff8784 100644 --- a/src/broadcom/common/v3d_performance_counters.h +++ b/src/broadcom/common/v3d_performance_counters.h @@ -28,6 +28,110 @@ #define V3D_PERFCNT_NAME 1 #define V3D_PERFCNT_DESCRIPTION 2 +#ifndef V3D_VERSION +# error "The V3D_VERSION macro must be defined" +#endif + +#if (V3D_VERSION >= 71) + +static const char *v3d_performance_counters[][3] = { + {"CORE", "cycle-count", "[CORE] Cycle counter"}, + {"CORE", "core-active", "[CORE] Bin/Render/Compute active cycles"}, + {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"}, + {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"}, + {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"}, + {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"}, + {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"}, + {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"}, + {"FEP", "FEP-valid-quads", "[FEP] Valid quads"}, + {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"}, + {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"}, + {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"}, + {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"}, + {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"}, + {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"}, + {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"}, + {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"}, + {"PTB", "PTB-primitives-discarded-reversed", "[PTB] Primitives that are discarded because they are reversed"}, + {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"}, + {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"}, + {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"}, + {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"}, + {"TMU", "TMU-active-cycles", "[TMU] Active cycles"}, + {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"}, + {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"}, + {"TMU", "TMU-cache-x4-active-cycles", "[TMU] Cache active cycles for x4 access"}, + {"TMU", "TMU-cache-x4-stalled-cycles", "[TMU] Cache stalled cycles for x4 access"}, + {"TMU", "TMU-total-text-quads-x4-access", "[TMU] Total texture cache x4 access"}, + {"L2T", 
"L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"}, + {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"}, + {"L2T", "L2T-local", "[L2T] Local mode access"}, + {"L2T", "L2T-writeback", "[L2T] Writeback"}, + {"L2T", "L2T-zero", "[L2T] Zero"}, + {"L2T", "L2T-merge", "[L2T] Merge"}, + {"L2T", "L2T-fill", "[L2T] Fill"}, + {"L2T", "L2T-stalls-no-wid", "[L2T] Stalls because no WID available"}, + {"L2T", "L2T-stalls-no-rid", "[L2T] Stalls because no RID available"}, + {"L2T", "L2T-stalls-queue-full", "[L2T] Stalls because internal queue full"}, + {"L2T", "L2T-stalls-wrightback", "[L2T] Stalls because writeback in flight"}, + {"L2T", "L2T-stalls-mem", "[L2T] Stalls because AXI blocks read"}, + {"L2T", "L2T-stalls-fill", "[L2T] Stalls because fill pending for victim cache-line"}, + {"L2T", "L2T-hitq", "[L2T] Sent request via hit queue"}, + {"L2T", "L2T-hitq-full", "[L2T] Sent request via main queue because hit queue is full"}, + {"L2T", "L2T-stalls-read-data", "[L2T] Stalls because waiting for data from SDRAM"}, + {"L2T", "L2T-TMU-read-hits", "[L2T] TMU read hits"}, + {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"}, + {"L2T", "L2T-VCD-read-hits", "[L2T] VCD read hits"}, + {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"}, + {"L2T", "L2T-SLC-read-hits", "[L2T] SLC read hits (all slices)"}, + {"L2T", "L2T-SLC-read-miss", "[L2T] SLC read misses (all slices)"}, + {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"}, + {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"}, + {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"}, + {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"}, + {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"}, + {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"}, + {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"}, + {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"}, + {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"}, + {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"}, + {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"}, + {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"}, + {"CORE", "core-memory-writes", "[CORE] Total memory writes"}, + {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"}, + {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"}, + {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"}, + {"CORE", "core-memory-reads", "[CORE] Total memory reads"}, + {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"}, + {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"}, + {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"}, + {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"}, + {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"}, + {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"}, + {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"}, + {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"}, + {"AXI", "AXI-read-trans", "[AXI] Read transaction count"}, + {"AXI", "AXI-write-trans", "[AXI] Write transaction count"}, + {"AXI", "AXI-read-wait-cycles", "[AXI] Read total wait cycles"}, + {"AXI", "AXI-write-wait-cycles", "[AXI] Write total wait cycles"}, + {"AXI", "AXI-max-outstanding-reads", "[AXI] Maximium outstanding read transactions"}, + {"AXI", 
"AXI-max-outstanding-writes", "[AXI] Maximum outstanding write transactions"}, + {"QPU", "QPU-wait-bubble", "[QPU] Pipeline bubble in qcycles due all threads waiting"}, + {"QPU", "QPU-ic-miss-bubble", "[QPU] Pipeline bubble in qcycles due instruction-cache miss"}, + {"QPU", "QPU-active", "[QPU] Executed shader instruction"}, + {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"}, + {"QPU", "QPU-stalls", "[QPU] Stalled qcycles executing shader instruction"}, + {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"}, + {"QPU", "QPU-stalls-TMU", "[QPU] Stalled qcycles waiting for TMU"}, + {"QPU", "QPU-stalls-TLB", "[QPU] Stalled qcycles waiting for TLB"}, + {"QPU", "QPU-stalls-VPM", "[QPU] Stalled qcycles waiting for VPM"}, + {"QPU", "QPU-stalls-uniforms", "[QPU] Stalled qcycles waiting for uniforms"}, + {"QPU", "QPU-stalls-SFU", "[QPU] Stalled qcycles waiting for SFU"}, + {"QPU", "QPU-stalls-other", "[QPU] Stalled qcycles waiting for any other reason (vary/W/Z)"}, +}; + +#elif (V3D_VERSION >= 41) + static const char *v3d_performance_counters[][3] = { {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"}, {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"}, @@ -118,4 +222,8 @@ static const char *v3d_performance_counters[][3] = { {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"}, }; +#else +static const char *v3d_performance_counters[][3] = { }; +#endif + #endif diff --git a/src/broadcom/common/v3d_tfu.h b/src/broadcom/common/v3d_tfu.h index 80da224ca2d..572d0074794 100644 --- a/src/broadcom/common/v3d_tfu.h +++ b/src/broadcom/common/v3d_tfu.h @@ -48,4 +48,27 @@ #define V3D33_TFU_ICFG_FORMAT_UIF_NO_XOR 14 #define V3D33_TFU_ICFG_FORMAT_UIF_XOR 15 +/* Disable level 0 write, just write following mipmaps */ +#define V3D71_TFU_IOC_DIMTW (1 << 0) +#define V3D71_TFU_IOC_FORMAT_SHIFT 12 +#define V3D71_TFU_IOC_FORMAT_LINEARTILE 3 +#define V3D71_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4 +#define V3D71_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5 +#define V3D71_TFU_IOA_FORMAT_UIF_NO_XOR 6 +#define V3D71_TFU_IOA_FORMAT_UIF_XOR 7 + +#define V3D71_TFU_IOC_STRIDE_SHIFT 16 +#define V3D71_TFU_IOC_NUMMM_SHIFT 4 + +#define V3D71_TFU_ICFG_OTYPE_SHIFT 16 +#define V3D71_TFU_ICFG_IFORMAT_SHIFT 23 +#define V3D71_TFU_ICFG_FORMAT_RASTER 0 +#define V3D71_TFU_ICFG_FORMAT_SAND_128 1 +#define V3D71_TFU_ICFG_FORMAT_SAND_256 2 +#define V3D71_TFU_ICFG_FORMAT_LINEARTILE 11 +#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12 +#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13 +#define V3D71_TFU_ICFG_FORMAT_UIF_NO_XOR 14 +#define V3D71_TFU_ICFG_FORMAT_UIF_XOR 15 + #endif diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c index 57872a923d3..8a50d279985 100644 --- a/src/broadcom/common/v3d_util.c +++ b/src/broadcom/common/v3d_util.c @@ -87,10 +87,37 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, return best_wgs_per_sg; } +#define V3D71_TLB_COLOR_SIZE (16 * 1024) +#define V3D71_TLB_DETPH_SIZE (16 * 1024) +#define V3D71_TLB_AUX_DETPH_SIZE (8 * 1024) + +static bool +tile_size_valid(uint32_t pixel_count, uint32_t color_bpp, uint32_t depth_bpp) +{ + /* First, we check if we can fit this tile size allocating the depth + * TLB 
memory to color. + */ + if (pixel_count * depth_bpp <= V3D71_TLB_AUX_DETPH_SIZE && + pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE + V3D71_TLB_DETPH_SIZE) { + return true; + } + + /* Otherwise the tile must fit in the main TLB buffers */ + return pixel_count * depth_bpp <= V3D71_TLB_DETPH_SIZE && + pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE; +} + void -v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp, - bool msaa, bool double_buffer, - uint32_t *width, uint32_t *height) +v3d_choose_tile_size(const struct v3d_device_info *devinfo, + uint32_t color_attachment_count, + /* V3D 4.x max internal bpp of all RTs */ + uint32_t max_internal_bpp, + /* V3D 7.x accumulated bpp for all RTs (in bytes) */ + uint32_t total_color_bpp, + bool msaa, + bool double_buffer, + uint32_t *width, + uint32_t *height) { static const uint8_t tile_sizes[] = { 64, 64, @@ -103,19 +130,65 @@ v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp, }; uint32_t idx = 0; - if (color_attachment_count > 2) - idx += 2; - else if (color_attachment_count > 1) - idx += 1; + if (devinfo->ver >= 71) { + /* In V3D 7.x, we use the actual bpp used by color attachments to compute + * the tile size instead of the maximum bpp. This may allow us to choose a + * larger tile size than we would in 4.x in scenarios with multiple RTs + * with different bpps. + * + * Also, the TLB has an auxiliary buffer of 8KB that will be automatically + * used for depth instead of the main 16KB depth TLB buffer when the depth + * tile fits in the auxiliary buffer, allowing the hardware to allocate + * the 16KB from the main depth TLB to the color TLB. If we can do that, + * then we are effectively doubling the memory we have for color and we + * can also select a larger tile size. This is necessary to support + * the most expensive configuration: 8x128bpp RTs + MSAA. + * + * FIXME: the docs state that depth TLB memory can be used for color + * if depth testing is not used by setting the 'depth disable' bit in the + * rendering configuration. However, this comes with a requirement that + * occlussion queries must not be active. We need to clarify if this means + * active at the point at which we emit a tile rendering configuration + * item, meaning that the we have a query spanning a full render pass + * (this is something we can tell before we emit the rendering + * configuration item) or active in the subpass for which we are enabling + * the bit (which we can't tell until later, when we record commands for + * the subpass). If it is the latter, then we cannot use this feature. + * + * FIXME: pending handling double_buffer. + */ + const uint32_t color_bpp = total_color_bpp * (msaa ? 4 : 1); + const uint32_t depth_bpp = 4 * (msaa ? 4 : 1); + do { + const uint32_t tile_w = tile_sizes[idx * 2]; + const uint32_t tile_h = tile_sizes[idx * 2 + 1]; + if (tile_size_valid(tile_w * tile_h, color_bpp, depth_bpp)) + break; + idx++; + } while (idx < ARRAY_SIZE(tile_sizes) / 2); + + /* FIXME: pending handling double_buffer */ + assert(!double_buffer); + } else { + /* On V3D 4.x tile size is selected based on the number of RTs, the + * maximum bpp across all of them and whether 4x MSAA is used. 
+ */ + if (color_attachment_count > 4) + idx += 3; + else if (color_attachment_count > 2) + idx += 2; + else if (color_attachment_count > 1) + idx += 1; - /* MSAA and double-buffer are mutually exclusive */ - assert(!msaa || !double_buffer); - if (msaa) - idx += 2; - else if (double_buffer) - idx += 1; + /* MSAA and double-buffer are mutually exclusive */ + assert(!msaa || !double_buffer); + if (msaa) + idx += 2; + else if (double_buffer) + idx += 1; - idx += max_color_bpp; + idx += max_internal_bpp; + } assert(idx < ARRAY_SIZE(tile_sizes) / 2); @@ -170,3 +243,30 @@ v3d_hw_prim_type(enum mesa_prim prim_type) unreachable("Unsupported primitive type"); } } + +uint32_t +v3d_internal_bpp_words(uint32_t internal_bpp) +{ + switch (internal_bpp) { + case 0 /* V3D_INTERNAL_BPP_32 */: + return 1; + case 1 /* V3D_INTERNAL_BPP_64 */: + return 2; + case 2 /* V3D_INTERNAL_BPP_128 */: + return 4; + default: + unreachable("Unsupported internal BPP"); + } +} + +uint32_t +v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width, + uint32_t bpp) +{ + /* stride in multiples of 128 bits, and covers 2 rows. This is the + * reason we divide by 2 instead of 4, as we divide number of 32-bit + * words per row by 2. + */ + + return (tile_width * bpp) / 2; +} diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h index eb802b77f67..d02d41dd089 100644 --- a/src/broadcom/common/v3d_util.h +++ b/src/broadcom/common/v3d_util.h @@ -24,6 +24,7 @@ #ifndef V3D_UTIL_H #define V3D_UTIL_H +#include "util/macros.h" #include "common/v3d_device_info.h" #include "pipe/p_defines.h" @@ -36,9 +37,14 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, uint32_t wg_size); void -v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp, - bool msaa, bool double_buffer, - uint32_t *width, uint32_t *height); +v3d_choose_tile_size(const struct v3d_device_info *devinfo, + uint32_t color_attachment_count, + uint32_t max_internal_bpp, + uint32_t total_color_bpp, + bool msaa, + bool double_buffer, + uint32_t *width, + uint32_t *height); uint32_t v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle); @@ -46,4 +52,30 @@ v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle); uint32_t v3d_hw_prim_type(enum mesa_prim prim_type); +uint32_t +v3d_internal_bpp_words(uint32_t internal_bpp); + +/* Some configuration packets want the size on log2, but starting at 0 for + * size 8. 
+ */ +static inline uint8_t +log2_tile_size(uint32_t size) +{ + switch(size) { + case 8: + return 0; + case 16: + return 1; + case 32: + return 2; + case 64: + return 3; + default: + unreachable("Unsupported tile width/height"); + } +} + +uint32_t +v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width, + uint32_t bpp); #endif diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index ad461dbe24c..4536d3bc67b 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -1005,32 +1005,36 @@ emit_fragcoord_input(struct v3d_compile *c, int attr) static struct qreg emit_smooth_varying(struct v3d_compile *c, - struct qreg vary, struct qreg w, struct qreg r5) + struct qreg vary, struct qreg w, struct qreg c_reg) { - return vir_FADD(c, vir_FMUL(c, vary, w), r5); + return vir_FADD(c, vir_FMUL(c, vary, w), c_reg); } static struct qreg emit_noperspective_varying(struct v3d_compile *c, - struct qreg vary, struct qreg r5) + struct qreg vary, struct qreg c_reg) { - return vir_FADD(c, vir_MOV(c, vary), r5); + return vir_FADD(c, vir_MOV(c, vary), c_reg); } static struct qreg emit_flat_varying(struct v3d_compile *c, - struct qreg vary, struct qreg r5) + struct qreg vary, struct qreg c_reg) { vir_MOV_dest(c, c->undef, vary); - return vir_MOV(c, r5); + return vir_MOV(c, c_reg); } static struct qreg emit_fragment_varying(struct v3d_compile *c, nir_variable *var, int8_t input_idx, uint8_t swizzle, int array_index) { - struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); - struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + struct qreg c_reg; /* C coefficient */ + + if (c->devinfo->has_accumulators) + c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); + else + c_reg = vir_reg(QFILE_REG, 0); struct qinst *ldvary = NULL; struct qreg vary; @@ -1041,7 +1045,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, vary = vir_emit_def(c, ldvary); } else { vir_NOP(c)->qpu.sig.ldvary = true; - vary = r3; + vary = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); } /* Store the input value before interpolation so we can implement @@ -1050,7 +1054,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, if (input_idx >= 0) { assert(var); c->interp[input_idx].vp = vary; - c->interp[input_idx].C = vir_MOV(c, r5); + c->interp[input_idx].C = vir_MOV(c, c_reg); c->interp[input_idx].mode = var->data.interpolation; } @@ -1060,7 +1064,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, */ if (!var) { assert(input_idx < 0); - return emit_smooth_varying(c, vary, c->payload_w, r5); + return emit_smooth_varying(c, vary, c->payload_w, c_reg); } int i = c->num_inputs++; @@ -1075,20 +1079,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var, if (var->data.centroid) { BITSET_SET(c->centroid_flags, i); result = emit_smooth_varying(c, vary, - c->payload_w_centroid, r5); + c->payload_w_centroid, c_reg); } else { - result = emit_smooth_varying(c, vary, c->payload_w, r5); + result = emit_smooth_varying(c, vary, c->payload_w, c_reg); } break; case INTERP_MODE_NOPERSPECTIVE: BITSET_SET(c->noperspective_flags, i); - result = emit_noperspective_varying(c, vary, r5); + result = emit_noperspective_varying(c, vary, c_reg); break; case INTERP_MODE_FLAT: BITSET_SET(c->flat_shade_flags, i); - result = emit_flat_varying(c, vary, r5); + result = emit_flat_varying(c, vary, c_reg); break; default: @@ -1685,6 +1689,22 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) result = vir_VFPACK(c, src[0], src[1]); break; + case 
nir_op_vpack_v3d: + result = vir_VPACK(c, src[0], src[1]); + break; + + case nir_op_v11fpack_v3d: + result = vir_V11FPACK(c, src[0], src[1]); + break; + + case nir_op_v10pack_v3d: + result = vir_V10PACK(c, src[0], src[1]); + break; + + case nir_op_v8pack_v3d: + result = vir_V8PACK(c, src[0], src[1]); + break; + case nir_op_unpack_half_2x16_split_x: result = vir_FMOV(c, src[0]); vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); @@ -1715,6 +1735,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero)); break; } + case nir_op_vftounorm8_v3d: + result = vir_VFTOUNORM8(c, src[0]); + break; + + case nir_op_vftosnorm8_v3d: + result = vir_VFTOSNORM8(c, src[0]); + break; + + case nir_op_vftounorm10lo_v3d: + result = vir_VFTOUNORM10LO(c, src[0]); + break; + + case nir_op_vftounorm10hi_v3d: + result = vir_VFTOUNORM10HI(c, src[0]); + break; + + case nir_op_ftounorm16_v3d: + result = vir_FTOUNORM16(c, src[0]); + break; + + case nir_op_ftosnorm16_v3d: + result = vir_FTOSNORM16(c, src[0]); + break; default: fprintf(stderr, "unknown NIR ALU inst: "); @@ -2440,15 +2483,17 @@ ntq_setup_outputs(struct v3d_compile *c) switch (var->data.location) { case FRAG_RESULT_COLOR: - c->output_color_var[0] = var; - c->output_color_var[1] = var; - c->output_color_var[2] = var; - c->output_color_var[3] = var; + for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) + c->output_color_var[i] = var; break; case FRAG_RESULT_DATA0: case FRAG_RESULT_DATA1: case FRAG_RESULT_DATA2: case FRAG_RESULT_DATA3: + case FRAG_RESULT_DATA4: + case FRAG_RESULT_DATA5: + case FRAG_RESULT_DATA6: + case FRAG_RESULT_DATA7: c->output_color_var[var->data.location - FRAG_RESULT_DATA0] = var; break; @@ -4321,7 +4366,11 @@ nir_to_vir(struct v3d_compile *c) { switch (c->s->info.stage) { case MESA_SHADER_FRAGMENT: - c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + if (c->devinfo->ver < 71) + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); + else + c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3)); + c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); @@ -4361,8 +4410,13 @@ nir_to_vir(struct v3d_compile *c) V3D_QPU_WADDR_SYNC)); } - c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); - c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + if (c->devinfo->ver <= 42) { + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + } else if (c->devinfo->ver >= 71) { + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); + } /* Set up the division between gl_LocalInvocationIndex and * wg_in_mem in the payload reg. 
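Not part of the patch: an illustrative caller of the v3d_choose_tile_size()/log2_tile_size() helpers reworked earlier in this series. The attachment count and bpp values are made up for the example; the function name is hypothetical.

static void
choose_tile_size_example(const struct v3d_device_info *devinfo)
{
        uint32_t tile_w, tile_h;

        /* Two render targets totalling 8 bytes of color per pixel (the
         * value 7.x uses); max_internal_bpp of 1 is what 4.x would use
         * instead; no MSAA and no double buffering.
         */
        v3d_choose_tile_size(devinfo, 2,
                             1 /* max_internal_bpp (4.x) */,
                             8 /* total_color_bpp (7.x) */,
                             false, false,
                             &tile_w, &tile_h);

        /* Some config packets want the dimensions as log2, with 8
         * encoded as 0.
         */
        uint8_t w_log2 = log2_tile_size(tile_w);
        uint8_t h_log2 = log2_tile_size(tile_h);
        (void)w_log2;
        (void)h_log2;
}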
@@ -4541,8 +4595,8 @@ vir_check_payload_w(struct v3d_compile *c) vir_for_each_inst_inorder(inst, c) { for (int i = 0; i < vir_get_nsrc(inst); i++) { - if (inst->src[i].file == QFILE_REG && - inst->src[i].index == 0) { + if (inst->src[i].file == c->payload_w.file && + inst->src[i].index == c->payload_w.index) { c->uses_center_w = true; return; } diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index 3b32b48f86f..4f767296860 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -155,12 +155,13 @@ static void process_mux_deps(struct schedule_state *state, struct schedule_node *n, enum v3d_qpu_mux mux) { + assert(state->devinfo->ver < 71); switch (mux) { case V3D_QPU_MUX_A: add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); break; case V3D_QPU_MUX_B: - if (!n->inst->qpu.sig.small_imm) { + if (!n->inst->qpu.sig.small_imm_b) { add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n); } @@ -171,6 +172,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n, } } + +static void +process_raddr_deps(struct schedule_state *state, struct schedule_node *n, + uint8_t raddr, bool is_small_imm) +{ + assert(state->devinfo->ver >= 71); + + if (!is_small_imm) + add_read_dep(state, state->last_rf[raddr], n); +} + static bool tmu_write_is_sequence_terminator(uint32_t waddr) { @@ -285,6 +297,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) /* If the input and output segments are shared, then all VPM reads to * a location need to happen before all writes. We handle this by * serializing all VPM operations for now. + * + * FIXME: we are assuming that the segments are shared. That is + * correct right now as we are only using shared, but technically you + * can choose. 
*/ bool separate_vpm_segment = false; @@ -305,15 +321,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) /* XXX: LOAD_IMM */ - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) - process_mux_deps(state, n, inst->alu.add.a); - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) - process_mux_deps(state, n, inst->alu.add.b); + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.add.a.mux); + } else { + process_raddr_deps(state, n, inst->alu.add.a.raddr, + inst->sig.small_imm_a); + } + } + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.add.b.mux); + } else { + process_raddr_deps(state, n, inst->alu.add.b.raddr, + inst->sig.small_imm_b); + } + } - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) - process_mux_deps(state, n, inst->alu.mul.a); - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) - process_mux_deps(state, n, inst->alu.mul.b); + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.mul.a.mux); + } else { + process_raddr_deps(state, n, inst->alu.mul.a.raddr, + inst->sig.small_imm_c); + } + } + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { + if (devinfo->ver < 71) { + process_mux_deps(state, n, inst->alu.mul.b.mux); + } else { + process_raddr_deps(state, n, inst->alu.mul.b.raddr, + inst->sig.small_imm_d); + } + } switch (inst->alu.add.op) { case V3D_QPU_A_VPMSETUP: @@ -386,6 +426,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) add_write_dep(state, &state->last_r[4], n); if (v3d_qpu_writes_r5(devinfo, inst)) add_write_dep(state, &state->last_r[5], n); + if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) + add_write_dep(state, &state->last_rf[0], n); /* If we add any more dependencies here we should consider whether we * also need to update qpu_inst_after_thrsw_valid_in_delay_slot. 
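Not part of the patch: a small helper sketch (hypothetical, not in the tree) making explicit the v71 mapping between ALU sources and the four small-immediate signals that calculate_deps() above relies on.

static bool
qpu_src_is_small_imm_v71(const struct v3d_qpu_instr *inst,
                         bool is_add, int src)
{
        /* add.a -> small_imm_a, add.b -> small_imm_b,
         * mul.a -> small_imm_c, mul.b -> small_imm_d
         */
        if (is_add)
                return src == 0 ? inst->sig.small_imm_a
                                : inst->sig.small_imm_b;
        return src == 0 ? inst->sig.small_imm_c
                        : inst->sig.small_imm_d;
}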
@@ -500,6 +542,10 @@ struct choose_scoreboard { int ldvary_count; int pending_ldtmu_count; bool first_ldtmu_after_thrsw; + + /* V3D 7.x */ + int last_implicit_rf0_write_tick; + bool has_rf0_flops_conflict; }; static bool @@ -524,7 +570,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard, } static bool -reads_too_soon_after_write(struct choose_scoreboard *scoreboard, +reads_too_soon(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, uint8_t raddr) +{ + switch (raddr) { + case 0: /* ldvary delayed write of C coefficient to rf0 */ + if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1) + return true; + break; + default: + break; + } + + return false; +} + +static bool +reads_too_soon_after_write(const struct v3d_device_info *devinfo, + struct choose_scoreboard *scoreboard, struct qinst *qinst) { const struct v3d_qpu_instr *inst = &qinst->qpu; @@ -536,24 +599,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, assert(inst->type == V3D_QPU_INSTR_TYPE_ALU); if (inst->alu.add.op != V3D_QPU_A_NOP) { - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 && - mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) { - return true; + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr)) + return true; + } } - if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 && - mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) { - return true; + if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr)) + return true; + } } } if (inst->alu.mul.op != V3D_QPU_M_NOP) { - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 && - mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) { - return true; + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr)) + return true; + } } - if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 && - mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) { - return true; + if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) { + if (devinfo->ver < 71) { + if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) + return true; + } else { + if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr)) + return true; + } } } @@ -577,6 +660,21 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo, v3d_qpu_writes_r4(devinfo, inst)) return true; + if (devinfo->ver <= 42) + return false; + + /* Don't schedule anything that writes rf0 right after ldvary, since + * that would clash with the ldvary's delayed rf0 write (the exception + * is another ldvary, since its implicit rf0 write would also have + * one cycle of delay and would not clash). 
+ */ + if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick && + (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || + (v3d_qpu_writes_rf0_implicitly(devinfo, inst) && + !inst->sig.ldvary))) { + return true; + } + return false; } @@ -604,29 +702,36 @@ pixel_scoreboard_too_soon(struct v3d_compile *c, } static bool -qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, +qpu_instruction_uses_rf(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst, uint32_t waddr) { if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; - if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && - inst->raddr_a == waddr) - return true; + if (devinfo->ver < 71) { + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && + inst->raddr_a == waddr) + return true; - if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && - !inst->sig.small_imm && (inst->raddr_b == waddr)) - return true; + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && + !inst->sig.small_imm_b && (inst->raddr_b == waddr)) + return true; + } else { + if (v3d71_qpu_reads_raddr(inst, waddr)) + return true; + } return false; } static bool -mux_read_stalls(struct choose_scoreboard *scoreboard, - const struct v3d_qpu_instr *inst) +read_stalls(const struct v3d_device_info *devinfo, + struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) { return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && - qpu_instruction_uses_rf(inst, + qpu_instruction_uses_rf(devinfo, inst, scoreboard->last_stallable_sfu_reg); } @@ -692,7 +797,8 @@ enum { V3D_PERIPHERAL_TMU_WAIT = (1 << 6), V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7), V3D_PERIPHERAL_TSY = (1 << 8), - V3D_PERIPHERAL_TLB = (1 << 9), + V3D_PERIPHERAL_TLB_READ = (1 << 9), + V3D_PERIPHERAL_TLB_WRITE = (1 << 10), }; static uint32_t @@ -717,8 +823,10 @@ qpu_peripherals(const struct v3d_device_info *devinfo, if (v3d_qpu_uses_sfu(inst)) result |= V3D_PERIPHERAL_SFU; - if (v3d_qpu_uses_tlb(inst)) - result |= V3D_PERIPHERAL_TLB; + if (v3d_qpu_reads_tlb(inst)) + result |= V3D_PERIPHERAL_TLB_READ; + if (v3d_qpu_writes_tlb(inst)) + result |= V3D_PERIPHERAL_TLB_WRITE; if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.op != V3D_QPU_A_NOP && @@ -749,32 +857,75 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, if (devinfo->ver < 41) return false; - /* V3D 4.1+ allow WRTMUC signal with TMU register write (other than - * tmuc). + /* V3D 4.x can't do more than one peripheral access except in a + * few cases: */ - if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && - b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { - return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); + if (devinfo->ver <= 42) { + /* WRTMUC signal with TMU register write (other than tmuc). */ + if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + b_peripherals == V3D_PERIPHERAL_TMU_WRITE) { + return v3d_qpu_writes_tmu_not_tmuc(devinfo, b); + } + if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + a_peripherals == V3D_PERIPHERAL_TMU_WRITE) { + return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); + } + + /* TMU read with VPM read/write. 
*/ + if (a_peripherals == V3D_PERIPHERAL_TMU_READ && + (b_peripherals == V3D_PERIPHERAL_VPM_READ || + b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { + return true; + } + if (b_peripherals == V3D_PERIPHERAL_TMU_READ && + (a_peripherals == V3D_PERIPHERAL_VPM_READ || + a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { + return true; + } + + return false; } - if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE && - b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) { - return v3d_qpu_writes_tmu_not_tmuc(devinfo, a); + /* V3D 7.x can't have more than one of these restricted peripherals */ + const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE | + V3D_PERIPHERAL_TMU_WRTMUC_SIG | + V3D_PERIPHERAL_TSY | + V3D_PERIPHERAL_TLB_READ | + V3D_PERIPHERAL_SFU | + V3D_PERIPHERAL_VPM_READ | + V3D_PERIPHERAL_VPM_WRITE; + + const uint32_t a_restricted = a_peripherals & restricted; + const uint32_t b_restricted = b_peripherals & restricted; + if (a_restricted && b_restricted) { + /* WRTMUC signal with TMU register write (other than tmuc) is + * allowed though. + */ + if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + b_restricted == V3D_PERIPHERAL_TMU_WRITE && + v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) || + (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG && + a_restricted == V3D_PERIPHERAL_TMU_WRITE && + v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) { + return false; + } } - /* V3D 4.1+ allows TMU read with VPM read/write. */ - if (a_peripherals == V3D_PERIPHERAL_TMU_READ && - (b_peripherals == V3D_PERIPHERAL_VPM_READ || - b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { - return true; + /* Only one TMU read per instruction */ + if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) && + (b_peripherals & V3D_PERIPHERAL_TMU_READ)) { + return false; } - if (b_peripherals == V3D_PERIPHERAL_TMU_READ && - (a_peripherals == V3D_PERIPHERAL_VPM_READ || - a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) { - return true; + + /* Only one TLB access per instruction */ + if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE | + V3D_PERIPHERAL_TLB_READ)) && + (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE | + V3D_PERIPHERAL_TLB_READ))) { + return false; } - return false; + return true; } /* Compute a bitmask of which rf registers are used between @@ -790,42 +941,67 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a, uint64_t raddrs_used = 0; if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A)) raddrs_used |= (1ll << a->raddr_a); - if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) + if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B)) raddrs_used |= (1ll << a->raddr_b); if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) raddrs_used |= (1ll << b->raddr_a); - if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) + if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) raddrs_used |= (1ll << b->raddr_b); return raddrs_used; } -/* Take two instructions and attempt to merge their raddr fields - * into one merged instruction. Returns false if the two instructions - * access more than two different rf registers between them, or more - * than one rf register and one small immediate. +/* Takes two instructions and attempts to merge their raddr fields (including + * small immediates) into one merged instruction. For V3D 4.x, returns false + * if the two instructions access more than two different rf registers between + * them, or more than one rf register and one small immediate. For 7.x returns + * false if both instructions use small immediates. 
*/ static bool qpu_merge_raddrs(struct v3d_qpu_instr *result, const struct v3d_qpu_instr *add_instr, - const struct v3d_qpu_instr *mul_instr) + const struct v3d_qpu_instr *mul_instr, + const struct v3d_device_info *devinfo) { + if (devinfo->ver >= 71) { + assert(add_instr->sig.small_imm_a + + add_instr->sig.small_imm_b <= 1); + assert(add_instr->sig.small_imm_c + + add_instr->sig.small_imm_d == 0); + assert(mul_instr->sig.small_imm_a + + mul_instr->sig.small_imm_b == 0); + assert(mul_instr->sig.small_imm_c + + mul_instr->sig.small_imm_d <= 1); + + result->sig.small_imm_a = add_instr->sig.small_imm_a; + result->sig.small_imm_b = add_instr->sig.small_imm_b; + result->sig.small_imm_c = mul_instr->sig.small_imm_c; + result->sig.small_imm_d = mul_instr->sig.small_imm_d; + + return (result->sig.small_imm_a + + result->sig.small_imm_b + + result->sig.small_imm_c + + result->sig.small_imm_d) <= 1; + } + + assert(devinfo->ver <= 42); + uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr); int naddrs = util_bitcount64(raddrs_used); if (naddrs > 2) return false; - if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) { + if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) { if (naddrs > 1) return false; - if (add_instr->sig.small_imm && mul_instr->sig.small_imm) + if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b) if (add_instr->raddr_b != mul_instr->raddr_b) return false; - result->sig.small_imm = true; - result->raddr_b = add_instr->sig.small_imm ? + result->sig.small_imm_b = true; + result->raddr_b = add_instr->sig.small_imm_b ? add_instr->raddr_b : mul_instr->raddr_b; } @@ -836,23 +1012,23 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, raddrs_used &= ~(1ll << raddr_a); result->raddr_a = raddr_a; - if (!result->sig.small_imm) { + if (!result->sig.small_imm_b) { if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) && raddr_a == add_instr->raddr_b) { - if (add_instr->alu.add.a == V3D_QPU_MUX_B) - result->alu.add.a = V3D_QPU_MUX_A; - if (add_instr->alu.add.b == V3D_QPU_MUX_B && + if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B) + result->alu.add.a.mux = V3D_QPU_MUX_A; + if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B && v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { - result->alu.add.b = V3D_QPU_MUX_A; + result->alu.add.b.mux = V3D_QPU_MUX_A; } } if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) && raddr_a == mul_instr->raddr_b) { - if (mul_instr->alu.mul.a == V3D_QPU_MUX_B) - result->alu.mul.a = V3D_QPU_MUX_A; - if (mul_instr->alu.mul.b == V3D_QPU_MUX_B && + if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B) + result->alu.mul.a.mux = V3D_QPU_MUX_A; + if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B && v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { - result->alu.mul.b = V3D_QPU_MUX_A; + result->alu.mul.b.mux = V3D_QPU_MUX_A; } } } @@ -863,20 +1039,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result, result->raddr_b = raddr_b; if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) && raddr_b == add_instr->raddr_a) { - if (add_instr->alu.add.a == V3D_QPU_MUX_A) - result->alu.add.a = V3D_QPU_MUX_B; - if (add_instr->alu.add.b == V3D_QPU_MUX_A && + if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A) + result->alu.add.a.mux = V3D_QPU_MUX_B; + if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A && v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) { - result->alu.add.b = V3D_QPU_MUX_B; + result->alu.add.b.mux = V3D_QPU_MUX_B; } } if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) && raddr_b == mul_instr->raddr_a) { - if (mul_instr->alu.mul.a == V3D_QPU_MUX_A) - result->alu.mul.a = V3D_QPU_MUX_B; - 
if (mul_instr->alu.mul.b == V3D_QPU_MUX_A && + if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A) + result->alu.mul.a.mux = V3D_QPU_MUX_B; + if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A && v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) { - result->alu.mul.b = V3D_QPU_MUX_B; + result->alu.mul.b.mux = V3D_QPU_MUX_B; } } @@ -909,7 +1085,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op) } static void -qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) +qpu_convert_add_to_mul(const struct v3d_device_info *devinfo, + struct v3d_qpu_instr *inst) { STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add)); assert(inst->alu.add.op != V3D_QPU_A_NOP); @@ -927,11 +1104,85 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst) inst->flags.auf = V3D_QPU_UF_NONE; inst->alu.mul.output_pack = inst->alu.add.output_pack; - inst->alu.mul.a_unpack = inst->alu.add.a_unpack; - inst->alu.mul.b_unpack = inst->alu.add.b_unpack; + + inst->alu.mul.a.unpack = inst->alu.add.a.unpack; + inst->alu.mul.b.unpack = inst->alu.add.b.unpack; inst->alu.add.output_pack = V3D_QPU_PACK_NONE; - inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE; - inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; + inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; + inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + + if (devinfo->ver >= 71) { + assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d); + assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1); + if (inst->sig.small_imm_a) { + inst->sig.small_imm_c = true; + inst->sig.small_imm_a = false; + } else if (inst->sig.small_imm_b) { + inst->sig.small_imm_d = true; + inst->sig.small_imm_b = false; + } + } +} + +static bool +can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op) +{ + switch (op) { + case V3D_QPU_M_MOV: + case V3D_QPU_M_FMOV: + return devinfo->ver >= 71; + default: + return false; + } +} + +static enum v3d_qpu_mul_op +mul_op_as_add_op(enum v3d_qpu_mul_op op) +{ + switch (op) { + case V3D_QPU_M_MOV: + return V3D_QPU_A_MOV; + case V3D_QPU_M_FMOV: + return V3D_QPU_A_FMOV; + default: + unreachable("unexpected mov opcode"); + } +} + +static void +qpu_convert_mul_to_add(struct v3d_qpu_instr *inst) +{ + STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul)); + assert(inst->alu.mul.op != V3D_QPU_M_NOP); + assert(inst->alu.add.op == V3D_QPU_A_NOP); + + memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add)); + inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op); + inst->alu.mul.op = V3D_QPU_M_NOP; + + inst->flags.ac = inst->flags.mc; + inst->flags.apf = inst->flags.mpf; + inst->flags.auf = inst->flags.muf; + inst->flags.mc = V3D_QPU_COND_NONE; + inst->flags.mpf = V3D_QPU_PF_NONE; + inst->flags.muf = V3D_QPU_UF_NONE; + + inst->alu.add.output_pack = inst->alu.mul.output_pack; + inst->alu.add.a.unpack = inst->alu.mul.a.unpack; + inst->alu.add.b.unpack = inst->alu.mul.b.unpack; + inst->alu.mul.output_pack = V3D_QPU_PACK_NONE; + inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; + inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + + assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b); + assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1); + if (inst->sig.small_imm_c) { + inst->sig.small_imm_a = true; + inst->sig.small_imm_c = false; + } else if (inst->sig.small_imm_d) { + inst->sig.small_imm_b = true; + inst->sig.small_imm_d = false; + } } static bool @@ -970,20 +1221,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, else if (a->alu.mul.op == V3D_QPU_M_NOP && can_do_add_as_mul(b->alu.add.op)) { mul_inst = *b; - qpu_convert_add_to_mul(&mul_inst); + 
qpu_convert_add_to_mul(devinfo, &mul_inst); merge.alu.mul = mul_inst.alu.mul; - merge.flags.mc = b->flags.ac; - merge.flags.mpf = b->flags.apf; - merge.flags.muf = b->flags.auf; + merge.flags.mc = mul_inst.flags.mc; + merge.flags.mpf = mul_inst.flags.mpf; + merge.flags.muf = mul_inst.flags.muf; add_instr = a; mul_instr = &mul_inst; } else if (a->alu.mul.op == V3D_QPU_M_NOP && can_do_add_as_mul(a->alu.add.op)) { mul_inst = *a; - qpu_convert_add_to_mul(&mul_inst); + qpu_convert_add_to_mul(devinfo, &mul_inst); merge = mul_inst; merge.alu.add = b->alu.add; @@ -999,22 +1250,62 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, } } + struct v3d_qpu_instr add_inst; if (b->alu.mul.op != V3D_QPU_M_NOP) { - if (a->alu.mul.op != V3D_QPU_M_NOP) - return false; - merge.alu.mul = b->alu.mul; + if (a->alu.mul.op == V3D_QPU_M_NOP) { + merge.alu.mul = b->alu.mul; + + merge.flags.mc = b->flags.mc; + merge.flags.mpf = b->flags.mpf; + merge.flags.muf = b->flags.muf; + + mul_instr = b; + add_instr = a; + } + /* If a's mul op is used but its add op is not, then see if we + * can convert either a's mul op or b's mul op to an add op + * so we can merge. + */ + else if (a->alu.add.op == V3D_QPU_A_NOP && + can_do_mul_as_add(devinfo, b->alu.mul.op)) { + add_inst = *b; + qpu_convert_mul_to_add(&add_inst); - merge.flags.mc = b->flags.mc; - merge.flags.mpf = b->flags.mpf; - merge.flags.muf = b->flags.muf; + merge.alu.add = add_inst.alu.add; - mul_instr = b; - add_instr = a; + merge.flags.ac = add_inst.flags.ac; + merge.flags.apf = add_inst.flags.apf; + merge.flags.auf = add_inst.flags.auf; + + mul_instr = a; + add_instr = &add_inst; + } else if (a->alu.add.op == V3D_QPU_A_NOP && + can_do_mul_as_add(devinfo, a->alu.mul.op)) { + add_inst = *a; + qpu_convert_mul_to_add(&add_inst); + + merge = add_inst; + merge.alu.mul = b->alu.mul; + + merge.flags.mc = b->flags.mc; + merge.flags.mpf = b->flags.mpf; + merge.flags.muf = b->flags.muf; + + mul_instr = b; + add_instr = &add_inst; + } else { + return false; + } } + /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and + * they have restrictions on the number of raddrs that can be adressed + * in a single instruction. In V3D 7.x, we don't have that restriction, + * but we are still limited to a single small immediate per instruction. + */ if (add_instr && mul_instr && - !qpu_merge_raddrs(&merge, add_instr, mul_instr)) { - return false; + !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) { + return false; } merge.sig.thrsw |= b->sig.thrsw; @@ -1025,7 +1316,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, merge.sig.ldtmu |= b->sig.ldtmu; merge.sig.ldvary |= b->sig.ldvary; merge.sig.ldvpm |= b->sig.ldvpm; - merge.sig.small_imm |= b->sig.small_imm; merge.sig.ldtlb |= b->sig.ldtlb; merge.sig.ldtlbu |= b->sig.ldtlbu; merge.sig.ucb |= b->sig.ucb; @@ -1108,7 +1398,7 @@ retry: * regfile A or B that was written to by the previous * instruction." */ - if (reads_too_soon_after_write(scoreboard, n->inst)) + if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst)) continue; if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst)) @@ -1122,10 +1412,11 @@ retry: if (pixel_scoreboard_too_soon(c, scoreboard, inst)) continue; - /* ldunif and ldvary both write r5, but ldunif does so a tick - * sooner. If the ldvary's r5 wasn't used, then ldunif might + /* ldunif and ldvary both write the same register (r5 for v42 + * and below, rf0 for v71), but ldunif does so a tick sooner. 
+ * If the ldvary's register wasn't used, then ldunif might * otherwise get scheduled so ldunif and ldvary try to update - * r5 in the same tick. + * the register in the same tick. */ if ((inst->sig.ldunif || inst->sig.ldunifa) && scoreboard->tick == scoreboard->last_ldvary_tick + 1) { @@ -1204,11 +1495,20 @@ retry: * ldvary now if the follow-up fixup would place * it in the delay slots of a thrsw, which is not * allowed and would prevent the fixup from being - * successful. + * successful. In V3D 7.x we can allow this to happen + * as long as it is not the last delay slot. */ - if (inst->sig.ldvary && - scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) { - continue; + if (inst->sig.ldvary) { + if (c->devinfo->ver <= 42 && + scoreboard->last_thrsw_tick + 2 >= + scoreboard->tick - 1) { + continue; + } + if (c->devinfo->ver >= 71 && + scoreboard->last_thrsw_tick + 2 == + scoreboard->tick - 1) { + continue; + } } /* We can emit a new tmu lookup with a previous ldtmu @@ -1243,7 +1543,7 @@ retry: int prio = get_instruction_priority(c->devinfo, inst); - if (mux_read_stalls(scoreboard, inst)) { + if (read_stalls(c->devinfo, scoreboard, inst)) { /* Don't merge an instruction that stalls */ if (prev_inst) continue; @@ -1340,6 +1640,62 @@ update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard, } } +static void +set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, + const struct v3d_device_info *devinfo) +{ + if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick && + v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + !inst->sig_magic) { + scoreboard->has_rf0_flops_conflict = true; + } +} + +static void +update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst, + const struct v3d_device_info *devinfo) +{ + if (devinfo->ver < 71) + return; + + /* Thread switch restrictions: + * + * At the point of a thread switch or thread end (when the actual + * thread switch or thread end happens, not when the signalling + * instruction is processed): + * + * - If the most recent write to rf0 was from a ldunif, ldunifa, or + * ldvary instruction in which another signal also wrote to the + * register file, and the final instruction of the thread section + * contained a signal which wrote to the register file, then the + * value of rf0 is undefined at the start of the new section + * + * Here we use the scoreboard to track if our last rf0 implicit write + * happens at the same time that another signal writes the register + * file (has_rf0_flops_conflict). We will use that information when + * scheduling thrsw instructions to avoid putting anything in their + * last delay slot which has a signal that writes to the register file. + */ + + /* Reset tracking if we have an explicit rf0 write or we are starting + * a new thread section. + */ + if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) || + scoreboard->tick - scoreboard->last_thrsw_tick == 3) { + scoreboard->last_implicit_rf0_write_tick = -10; + scoreboard->has_rf0_flops_conflict = false; + } + + if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) { + scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ? 
+ scoreboard->tick + 1 : scoreboard->tick; + } + + set_has_rf0_flops_conflict(scoreboard, inst, devinfo); +} + static void update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, const struct qinst *qinst, @@ -1383,6 +1739,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, if (inst->sig.ldvary) scoreboard->last_ldvary_tick = scoreboard->tick; + update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo); + update_scoreboard_tmu_tracking(scoreboard, qinst); } @@ -1580,7 +1938,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, if (slot > 0 && qinst->uniform != ~0) return false; - if (v3d_qpu_waits_vpm(inst)) + if (c->devinfo->ver <= 42 && v3d_qpu_waits_vpm(inst)) return false; if (inst->sig.ldvary) @@ -1588,35 +1946,67 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { /* GFXH-1625: TMUWT not allowed in the final instruction. */ - if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT) + if (c->devinfo->ver <= 42 && slot == 2 && + inst->alu.add.op == V3D_QPU_A_TMUWT) { return false; + } - /* No writing physical registers at the end. */ - bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP; - bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP; - if ((!add_is_nop && !inst->alu.add.magic_write) || - (!mul_is_nop && !inst->alu.mul.magic_write)) { - return false; + if (c->devinfo->ver <= 42) { + /* No writing physical registers at the end. */ + bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP; + bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP; + if ((!add_is_nop && !inst->alu.add.magic_write) || + (!mul_is_nop && !inst->alu.mul.magic_write)) { + return false; + } + + if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && + !inst->sig_magic) { + return false; + } } - if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) && - !inst->sig_magic) { - return false; + if (c->devinfo->ver >= 71) { + /* The thread end instruction must not write to the + * register file via the add/mul ALUs. + */ + if (slot == 0 && + (!inst->alu.add.magic_write || + !inst->alu.mul.magic_write)) { + return false; + } } if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF) return false; - /* RF0-2 might be overwritten during the delay slots by - * fragment shader setup. - */ - if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A)) - return false; + if (c->devinfo->ver <= 42) { + /* RF0-2 might be overwritten during the delay slots by + * fragment shader setup. + */ + if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A)) + return false; - if (inst->raddr_b < 3 && - !inst->sig.small_imm && - v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) { - return false; + if (inst->raddr_b < 3 && + !inst->sig.small_imm_b && + v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) { + return false; + } + } + + if (c->devinfo->ver >= 71) { + /* RF2-3 might be overwritten during the delay slots by + * fragment shader setup. 
+ */ + if (v3d71_qpu_reads_raddr(inst, 2) || + v3d71_qpu_reads_raddr(inst, 3)) { + return false; + } + + if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) || + v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) { + return false; + } } } @@ -1632,6 +2022,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c, */ static bool qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, + struct choose_scoreboard *scoreboard, const struct qinst *qinst, uint32_t slot) { @@ -1642,8 +2033,12 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, if (slot > 0 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu)) return false; - if (slot > 0 && qinst->qpu.sig.ldvary) - return false; + if (qinst->qpu.sig.ldvary) { + if (c->devinfo->ver <= 42 && slot > 0) + return false; + if (c->devinfo->ver >= 71 && slot == 2) + return false; + } /* unifa and the following 3 instructions can't overlap a * thread switch/end. The docs further clarify that this means @@ -1662,6 +2057,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c, if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu)) return false; + /* See comment when we set has_rf0_flops_conflict for details */ + if (c->devinfo->ver >= 71 && + slot == 2 && + v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) && + !qinst->qpu.sig_magic) { + if (scoreboard->has_rf0_flops_conflict) + return false; + if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick) + return false; + } + return true; } @@ -1694,7 +2100,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, * also apply to instructions scheduled after the thrsw that we want * to place in its delay slots. */ - if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) + if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot)) return false; /* TLB access is disallowed until scoreboard wait is executed, which @@ -1767,8 +2173,10 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard bool is_thrend) { for (int slot = 0; slot < instructions_in_sequence; slot++) { - if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot)) + if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, + qinst, slot)) { return false; + } if (is_thrend && !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) { @@ -1969,10 +2377,11 @@ emit_branch(struct v3d_compile *c, assert(scoreboard->last_branch_tick + 3 < branch_tick); assert(scoreboard->last_unifa_write_tick + 3 < branch_tick); - /* Can't place a branch with msfign != 0 and cond != 0,2,3 after + /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after * setmsf. 
*/ bool is_safe_msf_branch = + c->devinfo->ver >= 71 || inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS || inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 || @@ -2056,46 +2465,72 @@ emit_branch(struct v3d_compile *c, } static bool -alu_reads_register(struct v3d_qpu_instr *inst, +alu_reads_register(const struct v3d_device_info *devinfo, + struct v3d_qpu_instr *inst, bool add, bool magic, uint32_t index) { uint32_t num_src; - enum v3d_qpu_mux mux_a, mux_b; - - if (add) { + if (add) num_src = v3d_qpu_add_op_num_src(inst->alu.add.op); - mux_a = inst->alu.add.a; - mux_b = inst->alu.add.b; - } else { + else num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op); - mux_a = inst->alu.mul.a; - mux_b = inst->alu.mul.b; - } - for (int i = 0; i < num_src; i++) { - if (magic) { - if (i == 0 && mux_a == index) - return true; - if (i == 1 && mux_b == index) - return true; + if (devinfo->ver <= 42) { + enum v3d_qpu_mux mux_a, mux_b; + if (add) { + mux_a = inst->alu.add.a.mux; + mux_b = inst->alu.add.b.mux; } else { - if (i == 0 && mux_a == V3D_QPU_MUX_A && - inst->raddr_a == index) { - return true; - } - if (i == 0 && mux_a == V3D_QPU_MUX_B && - inst->raddr_b == index) { - return true; - } - if (i == 1 && mux_b == V3D_QPU_MUX_A && - inst->raddr_a == index) { - return true; - } - if (i == 1 && mux_b == V3D_QPU_MUX_B && - inst->raddr_b == index) { - return true; + mux_a = inst->alu.mul.a.mux; + mux_b = inst->alu.mul.b.mux; + } + + for (int i = 0; i < num_src; i++) { + if (magic) { + if (i == 0 && mux_a == index) + return true; + if (i == 1 && mux_b == index) + return true; + } else { + if (i == 0 && mux_a == V3D_QPU_MUX_A && + inst->raddr_a == index) { + return true; + } + if (i == 0 && mux_a == V3D_QPU_MUX_B && + inst->raddr_b == index) { + return true; + } + if (i == 1 && mux_b == V3D_QPU_MUX_A && + inst->raddr_a == index) { + return true; + } + if (i == 1 && mux_b == V3D_QPU_MUX_B && + inst->raddr_b == index) { + return true; + } } } + + return false; + } + + assert(devinfo->ver >= 71); + assert(!magic); + + uint32_t raddr_a, raddr_b; + if (add) { + raddr_a = inst->alu.add.a.raddr; + raddr_b = inst->alu.add.b.raddr; + } else { + raddr_a = inst->alu.mul.a.raddr; + raddr_b = inst->alu.mul.b.raddr; + } + + for (int i = 0; i < num_src; i++) { + if (i == 0 && raddr_a == index) + return true; + if (i == 1 && raddr_b == index) + return true; } return false; @@ -2130,6 +2565,8 @@ fixup_pipelined_ldvary(struct v3d_compile *c, struct qblock *block, struct v3d_qpu_instr *inst) { + const struct v3d_device_info *devinfo = c->devinfo; + /* We only call this if we have successfully merged an ldvary into a * previous instruction. */ @@ -2142,9 +2579,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c, * the ldvary destination, if it does, then moving the ldvary before * it would overwrite it. 
*/ - if (alu_reads_register(inst, true, ldvary_magic, ldvary_index)) + if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index)) return false; - if (alu_reads_register(inst, false, ldvary_magic, ldvary_index)) + if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index)) return false; /* The implicit ldvary destination may not be written to by a signal @@ -2180,13 +2617,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c, } /* The previous instruction cannot have a conflicting signal */ - if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig)) + if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig)) return false; uint32_t sig; struct v3d_qpu_sig new_sig = prev->qpu.sig; new_sig.ldvary = true; - if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig)) + if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig)) return false; /* The previous instruction cannot use flags since ldvary uses the @@ -2199,9 +2636,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c, /* We can't put an ldvary in the delay slots of a thrsw. We should've * prevented this when pairing up the ldvary with another instruction - * and flagging it for a fixup. + * and flagging it for a fixup. In V3D 7.x this is limited only to the + * second delay slot. */ - assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1); + assert((devinfo->ver <= 42 && + scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) || + (devinfo->ver >= 71 && + scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1)); /* Move the ldvary to the previous instruction and remove it from the * current one. @@ -2215,14 +2656,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c, inst->sig_magic = false; inst->sig_addr = 0; - /* By moving ldvary to the previous instruction we make it update - * r5 in the current one, so nothing else in it should write r5. - * This should've been prevented by our dependency tracking, which + /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */ + if (devinfo->ver >= 71) { + scoreboard->last_implicit_rf0_write_tick = scoreboard->tick; + set_has_rf0_flops_conflict(scoreboard, inst, devinfo); + } + + /* By moving ldvary to the previous instruction we make it update r5 + * (rf0 for ver >= 71) in the current one, so nothing else in it + * should write this register. + * + * This should've been prevented by our dependency tracking, which + * would not allow ldvary to be paired up with an instruction that - * writes r5 (since our dependency tracking doesn't know that the - * ldvary write r5 happens in the next instruction). + * writes r5/rf0 (since our dependency tracking doesn't know that the + * ldvary write to r5/rf0 happens in the next instruction).
*/ - assert(!v3d_qpu_writes_r5(c->devinfo, inst)); + assert(!v3d_qpu_writes_r5(devinfo, inst)); + assert(devinfo->ver <= 42 || + (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) && + !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0))); return true; } @@ -2313,7 +2765,7 @@ schedule_instructions(struct v3d_compile *c, } } } - if (mux_read_stalls(scoreboard, inst)) + if (read_stalls(c->devinfo, scoreboard, inst)) c->qpu_inst_stalled_count++; } @@ -2538,6 +2990,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) scoreboard.last_setmsf_tick = -10; scoreboard.last_stallable_sfu_tick = -10; scoreboard.first_ldtmu_after_thrsw = true; + scoreboard.last_implicit_rf0_write_tick = - 10; if (debug) { fprintf(stderr, "Pre-schedule instructions\n"); diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c index 2cc7a0eb0ae..0466ee5d0b6 100644 --- a/src/broadcom/compiler/qpu_validate.c +++ b/src/broadcom/compiler/qpu_validate.c @@ -41,6 +41,7 @@ struct v3d_qpu_validate_state { int last_sfu_write; int last_branch_ip; int last_thrsw_ip; + int first_tlb_z_write; /* Set when we've found the last-THRSW signal, or if we were started * in single-segment mode. @@ -110,11 +111,58 @@ static void qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) { const struct v3d_device_info *devinfo = state->c->devinfo; + + if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write) + state->first_tlb_z_write = state->ip; + const struct v3d_qpu_instr *inst = &qinst->qpu; + if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && + state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write && + inst->branch.msfign != V3D_QPU_MSFIGN_NONE && + inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS && + inst->branch.cond != V3D_QPU_BRANCH_COND_A0 && + inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) { + fail_instr(state, "Implicit branch MSF read after TLB Z write"); + } + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return; + if (inst->alu.add.op == V3D_QPU_A_SETMSF && + state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write) { + fail_instr(state, "SETMSF after TLB Z write"); + } + + if (state->first_tlb_z_write >= 0 && + state->ip > state->first_tlb_z_write && + inst->alu.add.op == V3D_QPU_A_MSF) { + fail_instr(state, "MSF read after TLB Z write"); + } + + if (devinfo->ver < 71) { + if (inst->sig.small_imm_a || inst->sig.small_imm_c || + inst->sig.small_imm_d) { + fail_instr(state, "small imm a/c/d added after V3D 7.1"); + } + } else { + if ((inst->sig.small_imm_a || inst->sig.small_imm_b) && + !vir_is_add(qinst)) { + fail_instr(state, "small imm a/b used but no ADD inst"); + } + if ((inst->sig.small_imm_c || inst->sig.small_imm_d) && + !vir_is_mul(qinst)) { + fail_instr(state, "small imm c/d used but no MUL inst"); + } + if (inst->sig.small_imm_a + inst->sig.small_imm_b + + inst->sig.small_imm_c + inst->sig.small_imm_d > 1) { + fail_instr(state, "only one small immediate can be " + "enabled per instruction"); + } + } + /* LDVARY writes r5 two instructions later and LDUNIF writes * r5 one instruction later, which is illegal to have * together. 
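For reference, the operand-to-signal mapping these validator rules imply on v71 — small_imm_a/small_imm_b select a small immediate for the add ALU's first and second raddr, small_imm_c/small_imm_d do the same for the mul ALU, and at most one of the four may be set per instruction — could be sketched roughly as below. This is an illustrative helper only, not part of the patch, and the example_* names are made up.

#include <stdbool.h>

/* Illustrative sketch: which small-immediate signal a given source operand
 * would use on v71, mirroring the rules enforced by the validator above
 * (a/b belong to the add ALU, c/d to the mul ALU, at most one signal per
 * instruction). Not part of the patch.
 */
struct example_small_imm_sigs {
        bool small_imm_a, small_imm_b, small_imm_c, small_imm_d;
};

static bool
example_pick_small_imm(bool is_mul_alu, int src_index,
                       struct example_small_imm_sigs *sig)
{
        /* Only one small immediate is allowed per instruction. */
        if (sig->small_imm_a || sig->small_imm_b ||
            sig->small_imm_c || sig->small_imm_d)
                return false;

        if (!is_mul_alu) {
                if (src_index == 0)
                        sig->small_imm_a = true;
                else
                        sig->small_imm_b = true;
        } else {
                if (src_index == 0)
                        sig->small_imm_c = true;
                else
                        sig->small_imm_d = true;
        }
        return true;
}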
@@ -194,8 +242,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) "SFU write started during THRSW delay slots "); } - if (inst->sig.ldvary) - fail_instr(state, "LDVARY during THRSW delay slots"); + if (inst->sig.ldvary) { + if (devinfo->ver <= 42) + fail_instr(state, "LDVARY during THRSW delay slots"); + if (devinfo->ver >= 71 && + state->ip - state->last_thrsw_ip == 2) { + fail_instr(state, "LDVARY in 2nd THRSW delay slot"); + } + } } (void)qpu_magic_waddr_matches; /* XXX */ @@ -222,7 +276,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) vpm_writes + tlb_writes + tsy_writes + - inst->sig.ldtmu + + (devinfo->ver <= 42 ? inst->sig.ldtmu : 0) + inst->sig.ldtlb + inst->sig.ldvpm + inst->sig.ldtlbu > 1) { @@ -262,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) inst->type == V3D_QPU_INSTR_TYPE_ALU) { if ((inst->alu.add.op != V3D_QPU_A_NOP && !inst->alu.add.magic_write)) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver <= 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71) { + if (state->last_thrsw_ip - state->ip == 0) { + fail_instr(state, + "ADD RF write at THREND"); + } + if (inst->alu.add.waddr == 2 || + inst->alu.add.waddr == 3) { + fail_instr(state, + "RF2-3 write after THREND"); + } + } } if ((inst->alu.mul.op != V3D_QPU_M_NOP && !inst->alu.mul.magic_write)) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver <= 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71) { + if (state->last_thrsw_ip - state->ip == 0) { + fail_instr(state, + "MUL RF write at THREND"); + } + + if (inst->alu.mul.waddr == 2 || + inst->alu.mul.waddr == 3) { + fail_instr(state, + "RF2-3 write after THREND"); + } + } } if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && !inst->sig_magic) { - fail_instr(state, "RF write after THREND"); + if (devinfo->ver <= 42) { + fail_instr(state, "RF write after THREND"); + } else if (devinfo->ver >= 71 && + (inst->sig_addr == 2 || + inst->sig_addr == 3)) { + fail_instr(state, "RF2-3 write after THREND"); + } } /* GFXH-1625: No TMUWT in the last instruction */ @@ -321,6 +406,7 @@ qpu_validate(struct v3d_compile *c) .last_sfu_write = -10, .last_thrsw_ip = -10, .last_branch_ip = -10, + .first_tlb_z_write = INT_MAX, .ip = 0, .last_thrsw_found = !c->last_thrsw, diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index 9f4129870e1..b437b5f5168 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -613,6 +613,11 @@ struct v3d_ra_node_info { struct { uint32_t priority; uint8_t class_bits; + bool is_program_end; + bool unused; + + /* V3D 7.x */ + bool is_ldunif_dst; } *info; uint32_t alloc_count; }; @@ -1150,8 +1155,8 @@ bool vir_is_raw_mov(struct qinst *inst); bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst); bool vir_is_add(struct qinst *inst); bool vir_is_mul(struct qinst *inst); -bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst); -bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst); +bool vir_writes_r3_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst); +bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst); struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg); uint8_t vir_channels_written(struct qinst *inst); struct qreg ntq_get_src(struct v3d_compile *c, 
nir_src src, int i); @@ -1184,7 +1189,7 @@ bool v3d_nir_lower_line_smooth(nir_shader *shader); bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c); bool v3d_nir_lower_scratch(nir_shader *s); bool v3d_nir_lower_txf_ms(nir_shader *s); -bool v3d_nir_lower_image_load_store(nir_shader *s); +bool v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c); bool v3d_nir_lower_load_store_bitsize(nir_shader *s); void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components); @@ -1425,6 +1430,20 @@ VIR_SFU(LOG) VIR_SFU(SIN) VIR_SFU(RSQRT2) +VIR_A_ALU2(VPACK) +VIR_A_ALU2(V8PACK) +VIR_A_ALU2(V10PACK) +VIR_A_ALU2(V11FPACK) + +VIR_M_ALU1(FTOUNORM16) +VIR_M_ALU1(FTOSNORM16) + +VIR_M_ALU1(VFTOUNORM8) +VIR_M_ALU1(VFTOSNORM8) + +VIR_M_ALU1(VFTOUNORM10LO) +VIR_M_ALU1(VFTOUNORM10HI) + static inline struct qinst * vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond, struct qreg dest, struct qreg src) diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c index 2900a29817f..bbb55be4a14 100644 --- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c +++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c @@ -40,6 +40,10 @@ * calculations and load/store using the TMU general memory access path. */ +static const unsigned bits_8[4] = {8, 8, 8, 8}; +static const unsigned bits_16[4] = {16, 16, 16, 16}; +static const unsigned bits_1010102[4] = {10, 10, 10, 2}; + bool v3d_gl_format_is_return_32(enum pipe_format format) { @@ -59,6 +63,8 @@ v3d_gl_format_is_return_32(enum pipe_format format) /* Packs a 32-bit vector of colors in the range [0, (1 << bits[i]) - 1] to a * 32-bit SSA value, with as many channels as necessary to store all the bits + * + * This is the generic helper, using only common nir operations. */ static nir_ssa_def * pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits, @@ -91,8 +97,185 @@ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits, return nir_vec(b, results, DIV_ROUND_UP(offset, 32)); } +/* Utility wrapper as half_2x16_split is mapped to vfpack, and sometimes it is + * just easier to read vfpack in the code, especially while using the PRM as + * a reference + */ +static nir_ssa_def * +nir_vfpack(nir_builder *b, nir_ssa_def *p1, nir_ssa_def *p2) +{ + return nir_pack_half_2x16_split(b, p1, p2); +} + +static inline nir_ssa_def * +pack_11f11f10f(nir_builder *b, nir_ssa_def *color) +{ + nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0), + nir_channel(b, color, 1)); + /* FIXME: we noted that we could just use p2 again as the second + * element to pack, and CTS tests still work.
Using undef is just + slightly more correct + */ + nir_ssa_def *undef = nir_ssa_undef(b, 1, color->bit_size); + nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef); + + return nir_v11fpack_v3d(b, p1, p2); +} + +static inline nir_ssa_def * +pack_r10g10b10a2_uint(nir_builder *b, nir_ssa_def *color) +{ + nir_ssa_def *p1 = nir_vpack_v3d(b, nir_channel(b, color, 0), + nir_channel(b, color, 1)); + nir_ssa_def *p2 = nir_vpack_v3d(b, nir_channel(b, color, 2), + nir_channel(b, color, 3)); + + return nir_v10pack_v3d(b, p1, p2); +} + +static inline nir_ssa_def * +pack_r10g10b10a2_unorm(nir_builder *b, nir_ssa_def *color) +{ + nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0), + nir_channel(b, color, 1)); + p1 = nir_vftounorm10lo_v3d(b, p1); + + nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), + nir_channel(b, color, 3)); + p2 = nir_vftounorm10hi_v3d(b, p2); + + return nir_v10pack_v3d(b, p1, p2); +} + +enum hw_conversion { + NONE, + TO_SNORM, + TO_UNORM +}; + +static inline nir_ssa_def * +pack_8bit(nir_builder *b, nir_ssa_def *color, + unsigned num_components, + enum hw_conversion conversion) +{ + /* Note that usually you should not use this method (that relies on + * custom packing) for 1 component if we are not doing any + * conversion. But we also support that case, and let the caller + * decide which method to use. + */ + nir_ssa_def *p1; + nir_ssa_def *p2; + + if (conversion == NONE) { + p1 = nir_vpack_v3d(b, nir_channel(b, color, 0), + nir_channel(b, color, num_components == 1 ? 0 : 1)); + } else { + p1 = nir_vfpack(b, nir_channel(b, color, 0), + nir_channel(b, color, num_components == 1 ? 0 : 1)); + p1 = (conversion == TO_UNORM) ? + nir_vftounorm8_v3d(b, p1) : nir_vftosnorm8_v3d(b, p1); + } + if (num_components == 4) { + if (conversion == NONE) { + p2 = nir_vpack_v3d(b, nir_channel(b, color, 2), + nir_channel(b, color, 3)); + } else { + p2 = nir_vfpack(b, nir_channel(b, color, 2), + nir_channel(b, color, 3)); + p2 = (conversion == TO_UNORM) ? + nir_vftounorm8_v3d(b, p2) : nir_vftosnorm8_v3d(b, p2); + } + } else { + /* As mentioned in the comment above, using an undef here + * would be more correct. But for this case we are getting + * worse values, and in fact even a worse instruction count + * with some CTS tests, so we just reuse the first packing + */ + p2 = p1; + } + + return nir_v8pack_v3d(b, p1, p2); +} + +static inline nir_ssa_def * +pack_16bit(nir_builder *b, nir_ssa_def *color, + unsigned num_components, + enum hw_conversion conversion) +{ + nir_ssa_def *results[2]; + nir_ssa_def *channels[4]; + + /* Note that usually you should not use this method (that relies on + * custom packing) if we are not doing any conversion. But we also + * support that case, and let the caller decide which method to use.
+ */ + + for (unsigned i = 0; i < num_components; i++) { + channels[i] = nir_channel(b, color, i); + switch (conversion) { + case TO_SNORM: + channels[i] = nir_ftosnorm16_v3d(b, channels[i]); + break; + case TO_UNORM: + channels[i] = nir_ftounorm16_v3d(b, channels[i]); + break; + default: + break; + } + } + + switch (num_components) { + case 1: + results[0] = channels[0]; + break; + case 4: + results[1] = nir_vpack_v3d(b, channels[2], channels[3]); + FALLTHROUGH; + case 2: + results[0] = nir_vpack_v3d(b, channels[0], channels[1]); + break; + } + + return nir_vec(b, results, DIV_ROUND_UP(num_components, 2)); +} + +static inline nir_ssa_def * +pack_xbit(nir_builder *b, nir_ssa_def *color, + unsigned num_components, + const struct util_format_channel_description *r_chan) +{ + bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED); + enum hw_conversion conversion = NONE; + if (r_chan->normalized) { + conversion = + (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM; + } + + switch (r_chan->size) { + case 8: + if (conversion == NONE && num_components < 2) + return pack_bits(b, color, bits_8, num_components, pack_mask); + else + return pack_8bit(b, color, num_components, conversion); + break; + case 16: + /* pack_mask implies that the generic packing method would + * need to include extra operations to handle negative values, + * so in that case, even without a conversion, it is better to + * use the packing using custom hw operations. + */ + if (conversion == NONE && !pack_mask) + return pack_bits(b, color, bits_16, num_components, pack_mask); + else + return pack_16bit(b, color, num_components, conversion); + break; + default: + unreachable("unrecognized bits"); + } +} + static bool -v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) +v3d_nir_lower_image_store_v42(nir_builder *b, nir_intrinsic_instr *instr) { enum pipe_format format = nir_intrinsic_format(instr); assert(format != PIPE_FORMAT_NONE); @@ -118,9 +301,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) */ formatted = color; } else { - static const unsigned bits_8[4] = {8, 8, 8, 8}; - static const unsigned bits_16[4] = {16, 16, 16, 16}; - static const unsigned bits_1010102[4] = {10, 10, 10, 2}; const unsigned *bits; switch (r_chan->size) { @@ -171,6 +351,52 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr) return true; } + +static bool +v3d_nir_lower_image_store_v71(nir_builder *b, nir_intrinsic_instr *instr) +{ + enum pipe_format format = nir_intrinsic_format(instr); + assert(format != PIPE_FORMAT_NONE); + const struct util_format_description *desc = + util_format_description(format); + const struct util_format_channel_description *r_chan = &desc->channel[0]; + unsigned num_components = util_format_get_nr_components(format); + b->cursor = nir_before_instr(&instr->instr); + + nir_ssa_def *color = nir_channels(b, + nir_ssa_for_src(b, instr->src[3], 4), + (1 << num_components) - 1); + nir_ssa_def *formatted = NULL; + if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + formatted = nir_format_pack_r9g9b9e5(b, color); + } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { + formatted = pack_11f11f10f(b, color); + } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) { + formatted = pack_r10g10b10a2_uint(b, color); + } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) { + formatted = pack_r10g10b10a2_unorm(b, color); + } else if (r_chan->size == 32) { + /* For 32-bit formats, we just have to move the vector + * across (possibly reducing the number of channels). 
+ */ + formatted = color; + } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) { + assert(r_chan->size == 16); + formatted = nir_format_float_to_half(b, color); + formatted = pack_bits(b, formatted, bits_16, num_components, + false); + } else { + assert(r_chan->size == 8 || r_chan->size == 16); + formatted = pack_xbit(b, color, num_components, r_chan); + } + + nir_instr_rewrite_src(&instr->instr, &instr->src[3], + nir_src_for_ssa(formatted)); + instr->num_components = formatted->num_components; + + return true; +} + static bool v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr) { @@ -215,11 +441,17 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b, nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + struct v3d_compile *c = (struct v3d_compile *) _state; + switch (intr->intrinsic) { case nir_intrinsic_image_load: return v3d_nir_lower_image_load(b, intr); case nir_intrinsic_image_store: - return v3d_nir_lower_image_store(b, intr); + if (c->devinfo->ver >= 71) + return v3d_nir_lower_image_store_v71(b, intr); + else + return v3d_nir_lower_image_store_v42(b, intr); + break; default: return false; } @@ -228,9 +460,9 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b, } bool -v3d_nir_lower_image_load_store(nir_shader *s) +v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c) { return nir_shader_instructions_pass(s, v3d_nir_lower_image_load_store_cb, nir_metadata_block_index | - nir_metadata_dominance, NULL); + nir_metadata_dominance, c); } diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c index 69929a145aa..8a314c8b5a9 100644 --- a/src/broadcom/compiler/v3d_nir_lower_io.c +++ b/src/broadcom/compiler/v3d_nir_lower_io.c @@ -600,9 +600,13 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, * The correct fix for this as recommended by Broadcom * is to convert to .8 fixed-point with ffloor(). 
*/ - pos = nir_f2i32(b, nir_ffloor(b, pos)); - v3d_nir_store_output(b, state->vp_vpm_offset + i, - offset_reg, pos); + if (c->devinfo->ver <= 42) + pos = nir_f2i32(b, nir_ffloor(b, pos)); + else + pos = nir_f2i32(b, nir_fround_even(b, pos)); + + v3d_nir_store_output(b, state->vp_vpm_offset + i, + offset_reg, pos); } } diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 499215454c0..192872f368c 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -113,10 +113,10 @@ vir_is_raw_mov(struct qinst *inst) return false; } - if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || - inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) { + if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) { return false; } @@ -156,8 +156,12 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst) } bool -vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) +vir_writes_r3_implicitly(const struct v3d_device_info *devinfo, + struct qinst *inst) { + if (!devinfo->has_accumulators) + return false; + for (int i = 0; i < vir_get_nsrc(inst); i++) { switch (inst->src[i].file) { case QFILE_VPM: @@ -178,8 +182,12 @@ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) } bool -vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) +vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, + struct qinst *inst) { + if (!devinfo->has_accumulators) + return false; + switch (inst->dst.file) { case QFILE_MAGIC: switch (inst->dst.index) { @@ -209,15 +217,15 @@ vir_set_unpack(struct qinst *inst, int src, if (vir_is_add(inst)) { if (src == 0) - inst->qpu.alu.add.a_unpack = unpack; + inst->qpu.alu.add.a.unpack = unpack; else - inst->qpu.alu.add.b_unpack = unpack; + inst->qpu.alu.add.b.unpack = unpack; } else { assert(vir_is_mul(inst)); if (src == 0) - inst->qpu.alu.mul.a_unpack = unpack; + inst->qpu.alu.mul.a.unpack = unpack; else - inst->qpu.alu.mul.b_unpack = unpack; + inst->qpu.alu.mul.b.unpack = unpack; } } @@ -737,6 +745,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c, /* Set us up for shared input/output segments. This is apparently * necessary for our VCM setup to avoid varying corruption. + * + * FIXME: initial testing on V3D 7.1 seems to work fine when using + * separate segments. So we could try to reevaluate in the future if + * there is any advantage to using separate segments.
*/ prog_data->separate_segments = false; prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size, @@ -1572,7 +1584,7 @@ v3d_attempt_compile(struct v3d_compile *c) NIR_PASS(_, c->s, v3d_nir_lower_io, c); NIR_PASS(_, c->s, v3d_nir_lower_txf_ms); - NIR_PASS(_, c->s, v3d_nir_lower_image_load_store); + NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c); NIR_PASS(_, c->s, nir_opt_idiv_const, 8); nir_lower_idiv_options idiv_options = { diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c index 5c47bbdc1b0..ab5d4043039 100644 --- a/src/broadcom/compiler/vir_dump.c +++ b/src/broadcom/compiler/vir_dump.c @@ -270,8 +270,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) vir_print_reg(c, inst, inst->dst); fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack)); - unpack[0] = instr->alu.add.a_unpack; - unpack[1] = instr->alu.add.b_unpack; + unpack[0] = instr->alu.add.a.unpack; + unpack[1] = instr->alu.add.b.unpack; } else { fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op)); fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc)); @@ -282,8 +282,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) vir_print_reg(c, inst, inst->dst); fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack)); - unpack[0] = instr->alu.mul.a_unpack; - unpack[1] = instr->alu.mul.b_unpack; + unpack[0] = instr->alu.mul.a.unpack; + unpack[1] = instr->alu.mul.b.unpack; } for (int i = 0; i < nsrc; i++) { diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c index 2fd6430a0f4..2907de9049f 100644 --- a/src/broadcom/compiler/vir_live_variables.c +++ b/src/broadcom/compiler/vir_live_variables.c @@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c) flags_inst = NULL; } - /* Payload registers: r0/1/2 contain W, centroid W, - * and Z at program start. Register allocation will - * force their nodes to R0/1/2. + /* Payload registers: for fragment shaders, W, + * centroid W, and Z will be initialized at r0/1/2 + * until v42, or r1/r2/r3 from v71. + * + * For compute shaders, payload would be r0/r2 until + * v42, r3/r2 from v71 + * + * Register allocation will force their nodes to those + * registers. */ if (inst->src[0].file == QFILE_REG) { - switch (inst->src[0].index) { - case 0: - case 1: - case 2: + uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0; + uint32_t max_payload_r = c->devinfo->ver >= 71 ? 3 : 2; + if (inst->src[0].index >= min_payload_r || + inst->src[0].index <= max_payload_r) { c->temp_start[inst->dst.index] = 0; - break; } } diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c index da121c2a5bd..1260838ca05 100644 --- a/src/broadcom/compiler/vir_opt_copy_propagate.c +++ b/src/broadcom/compiler/vir_opt_copy_propagate.c @@ -35,7 +35,7 @@ #include "v3d_compiler.h" static bool -is_copy_mov(struct qinst *inst) +is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst) { if (!inst) return false; @@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst) return false; } - switch (inst->src[0].file) { - case QFILE_MAGIC: - /* No copy propagating from R3/R4/R5 -- the MOVs from those - * are there to register allocate values produced into R3/4/5 - * to other regs (though hopefully r3/4/5). 
- */ - switch (inst->src[0].index) { - case V3D_QPU_WADDR_R3: - case V3D_QPU_WADDR_R4: - case V3D_QPU_WADDR_R5: - return false; + if (devinfo->ver <= 42) { + switch (inst->src[0].file) { + case QFILE_MAGIC: + /* No copy propagating from R3/R4/R5 -- the MOVs from + * those are there to register allocate values produced + * into R3/4/5 to other regs (though hopefully r3/4/5). + */ + switch (inst->src[0].index) { + case V3D_QPU_WADDR_R3: + case V3D_QPU_WADDR_R4: + case V3D_QPU_WADDR_R5: + return false; + default: + break; + } + break; + + case QFILE_REG: + switch (inst->src[0].index) { + case 0: + case 1: + case 2: + /* MOVs from rf0/1/2 are only to track the live + * intervals for W/centroid W/Z. + */ + return false; + } + break; + default: break; } - break; - - case QFILE_REG: - switch (inst->src[0].index) { - case 0: - case 1: - case 2: - /* MOVs from rf0/1/2 are only to track the live + } else { + assert(devinfo->ver >= 71); + switch (inst->src[0].file) { + case QFILE_REG: + switch (inst->src[0].index) { + /* MOVs from rf1/2/3 are only to track the live * intervals for W/centroid W/Z. + * + * Note: rf0 can be implicitly written by ldvary + * (no temp involved), so it is not an SSA value and + * could clash with writes to other temps that are + * also allocated to rf0. In theory, that would mean + * that we can't copy propagate from it, but we handle + * this at register allocation time, preventing temps + * from being allocated to rf0 while the rf0 value from + * ldvary is still live. */ - return false; - } - break; + case 1: + case 2: + case 3: + return false; + } + break; - default: - break; + default: + break; + } } return true; @@ -104,14 +133,14 @@ vir_has_unpack(struct qinst *inst, int chan) if (vir_is_add(inst)) { if (chan == 0) - return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE; else - return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE; } else { if (chan == 0) - return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE; else - return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE; + return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE; } } @@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) */ struct qinst *mov = movs[inst->src[i].index]; if (!mov) { - if (!is_copy_mov(c->defs[inst->src[i].index])) + if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index])) continue; mov = c->defs[inst->src[i].index]; @@ -161,7 +190,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) continue; /* these ops can't represent abs. 
*/ - if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) { + if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) { switch (inst->qpu.alu.add.op) { case V3D_QPU_A_VFPACK: case V3D_QPU_A_FROUND: @@ -189,7 +218,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) inst->src[i] = mov->src[0]; if (vir_has_unpack(mov, 0)) { - enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack; + enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack; vir_set_unpack(inst, i, unpack); } @@ -245,7 +274,7 @@ vir_opt_copy_propagate(struct v3d_compile *c) apply_kills(c, movs, inst); - if (is_copy_mov(inst)) + if (is_copy_mov(c->devinfo, inst)) movs[inst->dst.index] = inst; } } diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c index c7896d57f2b..6b61ed6a39a 100644 --- a/src/broadcom/compiler/vir_opt_redundant_flags.c +++ b/src/broadcom/compiler/vir_opt_redundant_flags.c @@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b) a->qpu.flags.mpf != b->qpu.flags.mpf || a->qpu.alu.add.op != b->qpu.alu.add.op || a->qpu.alu.mul.op != b->qpu.alu.mul.op || - a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack || - a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack || + a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack || + a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack || a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack || - a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack || - a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack || + a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack || + a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack || a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) { return false; } diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c index 47d7722968d..ed5bc011964 100644 --- a/src/broadcom/compiler/vir_opt_small_immediates.c +++ b/src/broadcom/compiler/vir_opt_small_immediates.c @@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c) /* The small immediate value sits in the raddr B field, so we * can't have 2 small immediates in one instruction (unless * they're the same value, but that should be optimized away - * elsewhere). + * elsewhere). Since 7.x we can encode small immediates in + * any raddr field, but each instruction can still only use + * one. 
*/ bool uses_small_imm = false; for (int i = 0; i < vir_get_nsrc(inst); i++) { @@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c) */ struct v3d_qpu_sig new_sig = inst->qpu.sig; uint32_t sig_packed; - new_sig.small_imm = true; + if (c->devinfo->ver <= 42) { + new_sig.small_imm_b = true; + } else { + if (vir_is_add(inst)) { + if (i == 0) + new_sig.small_imm_a = true; + else + new_sig.small_imm_b = true; + } else { + if (i == 0) + new_sig.small_imm_c = true; + else + new_sig.small_imm_d = true; + } + } + if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed)) continue; @@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c) vir_dump_inst(c, inst); fprintf(stderr, "\n"); } - inst->qpu.sig.small_imm = true; + inst->qpu.sig.small_imm_a = new_sig.small_imm_a; + inst->qpu.sig.small_imm_b = new_sig.small_imm_b; + inst->qpu.sig.small_imm_c = new_sig.small_imm_c; + inst->qpu.sig.small_imm_d = new_sig.small_imm_d; inst->qpu.raddr_b = packed; inst->src[i].file = QFILE_SMALL_IMM; diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c index b22f915d1df..8eac2b75bd7 100644 --- a/src/broadcom/compiler/vir_register_allocate.c +++ b/src/broadcom/compiler/vir_register_allocate.c @@ -28,41 +28,73 @@ #define ACC_INDEX 0 #define ACC_COUNT 6 -#define PHYS_INDEX (ACC_INDEX + ACC_COUNT) -#define PHYS_COUNT 64 +/* RA nodes used to track RF registers with implicit writes */ +#define IMPLICIT_RF_COUNT 1 + +#define PHYS_COUNT 64 + +static uint8_t +get_phys_index(const struct v3d_device_info *devinfo) +{ + if (devinfo->has_accumulators) + return ACC_INDEX + ACC_COUNT; + else + return 0; +} + +/* ACC as accumulator */ #define CLASS_BITS_PHYS (1 << 0) #define CLASS_BITS_ACC (1 << 1) #define CLASS_BITS_R5 (1 << 4) -#define CLASS_BITS_ANY (CLASS_BITS_PHYS | \ - CLASS_BITS_ACC | \ - CLASS_BITS_R5) + +static uint8_t +get_class_bit_any(const struct v3d_device_info *devinfo) +{ + if (devinfo->has_accumulators) + return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5); + else + return CLASS_BITS_PHYS; +} + +static uint8_t +filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits) +{ + if (!devinfo->has_accumulators) { + assert(class_bits & CLASS_BITS_PHYS); + class_bits = CLASS_BITS_PHYS; + } + return class_bits; +} static inline uint32_t -temp_to_node(uint32_t temp) +temp_to_node(struct v3d_compile *c, uint32_t temp) { - return temp + ACC_COUNT; + return temp + (c->devinfo->has_accumulators ? ACC_COUNT : + IMPLICIT_RF_COUNT); } static inline uint32_t -node_to_temp(uint32_t node) +node_to_temp(struct v3d_compile *c, uint32_t node) { - assert(node >= ACC_COUNT); - return node - ACC_COUNT; + assert((c->devinfo->has_accumulators && node >= ACC_COUNT) || + (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT)); + return node - (c->devinfo->has_accumulators ? 
ACC_COUNT : + IMPLICIT_RF_COUNT); } static inline uint8_t -get_temp_class_bits(struct v3d_ra_node_info *nodes, +get_temp_class_bits(struct v3d_compile *c, uint32_t temp) { - return nodes->info[temp_to_node(temp)].class_bits; + return c->nodes.info[temp_to_node(c, temp)].class_bits; } static inline void -set_temp_class_bits(struct v3d_ra_node_info *nodes, +set_temp_class_bits(struct v3d_compile *c, uint32_t temp, uint8_t class_bits) { - nodes->info[temp_to_node(temp)].class_bits = class_bits; + c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits; } static struct ra_class * @@ -71,11 +103,13 @@ choose_reg_class(struct v3d_compile *c, uint8_t class_bits) if (class_bits == CLASS_BITS_PHYS) { return c->compiler->reg_class_phys[c->thread_index]; } else if (class_bits == (CLASS_BITS_R5)) { + assert(c->devinfo->has_accumulators); return c->compiler->reg_class_r5[c->thread_index]; } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) { + assert(c->devinfo->has_accumulators); return c->compiler->reg_class_phys_or_acc[c->thread_index]; } else { - assert(class_bits == CLASS_BITS_ANY); + assert(class_bits == get_class_bit_any(c->devinfo)); return c->compiler->reg_class_any[c->thread_index]; } } @@ -84,7 +118,7 @@ static inline struct ra_class * choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp) { assert(temp < c->num_temps && temp < c->nodes.alloc_count); - return choose_reg_class(c, get_temp_class_bits(&c->nodes, temp)); + return choose_reg_class(c, get_temp_class_bits(c, temp)); } static inline bool @@ -313,7 +347,7 @@ v3d_choose_spill_node(struct v3d_compile *c) for (unsigned i = 0; i < c->num_temps; i++) { if (BITSET_TEST(c->spillable, i)) { - ra_set_node_spill_cost(c->g, temp_to_node(i), + ra_set_node_spill_cost(c->g, temp_to_node(c, i), spill_costs[i]); } } @@ -331,7 +365,8 @@ ensure_nodes(struct v3d_compile *c) c->nodes.info = reralloc_array_size(c, c->nodes.info, sizeof(c->nodes.info[0]), - c->nodes.alloc_count + ACC_COUNT); + c->nodes.alloc_count + + MAX2(ACC_COUNT, IMPLICIT_RF_COUNT)); } /* Creates the interference node for a new temp. We use this to keep the node @@ -343,11 +378,15 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits) ensure_nodes(c); int node = ra_add_node(c->g, choose_reg_class(c, class_bits)); - assert(node == temp + ACC_COUNT); + assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT : + node == temp + IMPLICIT_RF_COUNT); /* We fill the node priority after we are done inserting spills */ c->nodes.info[node].class_bits = class_bits; c->nodes.info[node].priority = 0; + c->nodes.info[node].is_ldunif_dst = false; + c->nodes.info[node].is_program_end = false; + c->nodes.info[node].unused = false; } /* The spill offset for this thread takes a bit of setup, so do it once at @@ -395,8 +434,10 @@ v3d_setup_spill_base(struct v3d_compile *c) */ if (c->spilling) { int temp_class = CLASS_BITS_PHYS; - if (i != c->spill_base.index) + if (c->devinfo->has_accumulators && + i != c->spill_base.index) { temp_class |= CLASS_BITS_ACC; + } add_node(c, i, temp_class); } } @@ -436,7 +477,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c, */ assert(c->disable_ldunif_opt); struct qreg offset = vir_uniform_ui(c, spill_offset); - add_node(c, offset.index, CLASS_BITS_ANY); + add_node(c, offset.index, get_class_bit_any(c->devinfo)); /* We always enable per-quad on spills/fills to ensure we spill * any channels involved with helper invocations. 
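As a rough illustration of the node numbering that the temp_to_node()/node_to_temp() helpers above imply (a sketch assuming ACC_COUNT == 6 and IMPLICIT_RF_COUNT == 1 as defined earlier; the example_* names are made up and not part of the patch): the reserved nodes come first, so temps start at node 6 on pre-7.1 hardware and at node 1 on 7.1.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative sketch of the RA node layout, not part of the patch:
 * the reserved nodes (fixed accumulators on <= 4.2, the implicit-rf0
 * tracking node on 7.1) come first, followed by one node per temp.
 */
static inline uint32_t
example_temp_to_node(bool has_accumulators, uint32_t temp)
{
        const uint32_t reserved = has_accumulators ? 6 /* ACC_COUNT */
                                                   : 1 /* IMPLICIT_RF_COUNT */;
        return temp + reserved;
}

static inline uint32_t
example_node_to_temp(bool has_accumulators, uint32_t node)
{
        const uint32_t reserved = has_accumulators ? 6 : 1;
        return node - reserved;
}

/* e.g. temp 0 maps to node 6 on V3D 4.2 and to node 1 on V3D 7.1 */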
@@ -455,14 +496,16 @@ v3d_emit_spill_tmua(struct v3d_compile *c, * temp will be used immediately so just like the uniform above we * can allow accumulators. */ + int temp_class = + filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC); if (!fill_dst) { struct qreg dst = vir_TMUWT(c); assert(dst.file == QFILE_TEMP); - add_node(c, dst.index, CLASS_BITS_PHYS | CLASS_BITS_ACC); + add_node(c, dst.index, temp_class); } else { *fill_dst = vir_LDTMU(c); assert(fill_dst->file == QFILE_TEMP); - add_node(c, fill_dst->index, CLASS_BITS_PHYS | CLASS_BITS_ACC); + add_node(c, fill_dst->index, temp_class); } /* Temps across the thread switch we injected can't be assigned to @@ -482,7 +525,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c, c->temp_start[i] < ip && c->temp_end[i] >= ip : c->temp_start[i] <= ip && c->temp_end[i] > ip; if (thrsw_cross) { - ra_set_node_class(c->g, temp_to_node(i), + ra_set_node_class(c->g, temp_to_node(c, i), choose_reg_class(c, CLASS_BITS_PHYS)); } } @@ -509,8 +552,7 @@ v3d_emit_tmu_spill(struct v3d_compile *c, * same register class bits as the original. */ if (inst == position) { - uint8_t class_bits = get_temp_class_bits(&c->nodes, - inst->dst.index); + uint8_t class_bits = get_temp_class_bits(c, inst->dst.index); inst->dst = vir_get_temp(c); add_node(c, inst->dst.index, class_bits); } else { @@ -542,7 +584,8 @@ interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end) } static void -v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) +v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes, + int spill_temp) { c->spill_start_num_temps = c->num_temps; c->spilling = true; @@ -554,8 +597,20 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) spill_offset = c->spill_size; c->spill_size += V3D_CHANNELS * sizeof(uint32_t); - if (spill_offset == 0) + if (spill_offset == 0) { v3d_setup_spill_base(c); + + /* Don't allocate our spill base to rf0 to avoid + * conflicts with instructions doing implicit writes + * to that register. + */ + if (!c->devinfo->has_accumulators) { + ra_add_node_interference( + c->g, + temp_to_node(c, c->spill_base.index), + implicit_rf_nodes[0]); + } + } } struct qinst *last_thrsw = c->last_thrsw; @@ -574,7 +629,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) reconstruct_op = orig_def->qpu.alu.add.op; } - uint32_t spill_node = temp_to_node(spill_temp); + uint32_t spill_node = temp_to_node(c, spill_temp); /* We must disable the ldunif optimization if we are spilling uniforms */ bool had_disable_ldunif_opt = c->disable_ldunif_opt; @@ -635,7 +690,8 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) * instruction immediately after, so * we can use any register class for it. */ - add_node(c, unif.index, CLASS_BITS_ANY); + add_node(c, unif.index, + get_class_bit_any(c->devinfo)); } else if (spill_type == SPILL_TYPE_RECONSTRUCT) { struct qreg temp = reconstruct_temp(c, reconstruct_op); @@ -644,8 +700,10 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) * instruction immediately after so we * can use ACC. 
- add_node(c, temp.index, CLASS_BITS_PHYS | - CLASS_BITS_ACC); + int temp_class = + filter_class_bits(c->devinfo, CLASS_BITS_PHYS | + CLASS_BITS_ACC); + add_node(c, temp.index, temp_class); } else { /* If we have a postponed spill, we * don't need a fill as the temp would @@ -739,12 +797,12 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp) * update node priorities based on new liveness data. */ uint32_t sb_temp = c->spill_base.index; - uint32_t sb_node = temp_to_node(sb_temp); + uint32_t sb_node = temp_to_node(c, sb_temp); for (uint32_t i = 0; i < c->num_temps; i++) { if (c->temp_end[i] == -1) continue; - uint32_t node_i = temp_to_node(i); + uint32_t node_i = temp_to_node(c, i); c->nodes.info[node_i].priority = c->temp_end[i] - c->temp_start[i]; @@ -752,7 +810,7 @@ j < c->num_temps; j++) { if (interferes(c->temp_start[i], c->temp_end[i], c->temp_start[j], c->temp_end[j])) { - uint32_t node_j = temp_to_node(j); + uint32_t node_j = temp_to_node(c, j); ra_add_node_interference(c->g, node_i, node_j); } } @@ -771,9 +829,11 @@ } struct v3d_ra_select_callback_data { + uint32_t phys_index; uint32_t next_acc; uint32_t next_phys; struct v3d_ra_node_info *nodes; + const struct v3d_device_info *devinfo; }; /* Choosing accumulators improves chances of merging QPU instructions @@ -785,6 +845,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra, BITSET_WORD *regs, int priority) { + if (!v3d_ra->devinfo->has_accumulators) + return false; + /* Favor accumulators if we have less than this number of physical * registers. Accumulators have more restrictions (like being * invalidated through thrsw), so running out of physical registers @@ -794,7 +857,7 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra, static const int available_rf_threshold = 5; int available_rf = 0; for (int i = 0; i < PHYS_COUNT; i++) { - if (BITSET_TEST(regs, PHYS_INDEX + i)) + if (BITSET_TEST(regs, v3d_ra->phys_index + i)) available_rf++; if (available_rf >= available_rf_threshold) break; @@ -820,6 +883,9 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra, BITSET_WORD *regs, unsigned int *out) { + if (!v3d_ra->devinfo->has_accumulators) + return false; + /* Choose r5 for our ldunifs if possible (nobody else can load to that * reg, and it keeps the QPU cond field free from being occupied by * ldunifrf). @@ -849,12 +915,47 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra, static bool v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, + unsigned int node, BITSET_WORD *regs, unsigned int *out) { + /* If this node is for an unused temp, ignore. */ + if (v3d_ra->nodes->info[node].unused) { + *out = 0; + return true; + } + + /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst + * so we can avoid turning them into ldunifrf (which uses the + * cond field to encode the dst and would prevent merging with + * instructions that use cond flags). + */ + if (v3d_ra->nodes->info[node].is_ldunif_dst && + BITSET_TEST(regs, v3d_ra->phys_index)) { + assert(v3d_ra->devinfo->ver >= 71); + *out = v3d_ra->phys_index; + return true; + } + + /* The last 3 instructions in a shader can't use some specific registers + * (usually early rf registers, depending on the v3d version) so try to + * avoid assigning them to temps used by the last instructions + * in the shader.
+ */ + const uint32_t safe_rf_start = v3d_ra->devinfo->ver <= 42 ? 3 : 4; + if (v3d_ra->nodes->info[node].is_program_end && + v3d_ra->next_phys < safe_rf_start) { + v3d_ra->next_phys = safe_rf_start; + } + for (int i = 0; i < PHYS_COUNT; i++) { int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT; - int phys = PHYS_INDEX + phys_off; + + /* Try to keep rf0 available for ldunif in 7.x (see above). */ + if (v3d_ra->devinfo->ver >= 71 && phys_off == 0) + continue; + + int phys = v3d_ra->phys_index + phys_off; if (BITSET_TEST(regs, phys)) { v3d_ra->next_phys = phys_off + 1; @@ -863,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra, } } + /* If we couldn't allocate, do try to assign rf0 if it is available. */ + if (v3d_ra->devinfo->ver >= 71 && + BITSET_TEST(regs, v3d_ra->phys_index)) { + v3d_ra->next_phys = 1; + *out = v3d_ra->phys_index; + return true; + } + return false; } @@ -877,7 +986,7 @@ v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data) return reg; } - if (v3d_ra_select_rf(v3d_ra, regs, ®)) + if (v3d_ra_select_rf(v3d_ra, n, regs, ®)) return reg; /* If we ran out of physical registers try to assign an accumulator @@ -896,8 +1005,9 @@ vir_init_reg_sets(struct v3d_compiler *compiler) * register file can be divided up for fragment shader threading. */ int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3); + uint8_t phys_index = get_phys_index(compiler->devinfo); - compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT, + compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT, false); if (!compiler->regs) return false; @@ -905,31 +1015,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler) for (int threads = 0; threads < max_thread_index; threads++) { compiler->reg_class_any[threads] = ra_alloc_contig_reg_class(compiler->regs, 1); - compiler->reg_class_r5[threads] = - ra_alloc_contig_reg_class(compiler->regs, 1); - compiler->reg_class_phys_or_acc[threads] = - ra_alloc_contig_reg_class(compiler->regs, 1); + if (compiler->devinfo->has_accumulators) { + compiler->reg_class_r5[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + compiler->reg_class_phys_or_acc[threads] = + ra_alloc_contig_reg_class(compiler->regs, 1); + } compiler->reg_class_phys[threads] = ra_alloc_contig_reg_class(compiler->regs, 1); - for (int i = PHYS_INDEX; - i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) { - ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + /* Init physical regs */ + for (int i = phys_index; + i < phys_index + (PHYS_COUNT >> threads); i++) { + if (compiler->devinfo->has_accumulators) + ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); ra_class_add_reg(compiler->reg_class_phys[threads], i); ra_class_add_reg(compiler->reg_class_any[threads], i); } - for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { - ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); - ra_class_add_reg(compiler->reg_class_any[threads], i); + /* Init accumulator regs */ + if (compiler->devinfo->has_accumulators) { + for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { + ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i); + ra_class_add_reg(compiler->reg_class_any[threads], i); + } + /* r5 can only store a single 32-bit value, so not much can + * use it. + */ + ra_class_add_reg(compiler->reg_class_r5[threads], + ACC_INDEX + 5); + ra_class_add_reg(compiler->reg_class_any[threads], + ACC_INDEX + 5); } - /* r5 can only store a single 32-bit value, so not much can - * use it. 
- */ - ra_class_add_reg(compiler->reg_class_r5[threads], - ACC_INDEX + 5); - ra_class_add_reg(compiler->reg_class_any[threads], - ACC_INDEX + 5); } ra_set_finalize(compiler->regs, NULL); @@ -944,7 +1061,10 @@ tmu_spilling_allowed(struct v3d_compile *c) } static void -update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, +update_graph_and_reg_classes_for_inst(struct v3d_compile *c, + int *acc_nodes, + int *implicit_rf_nodes, + int last_ldvary_ip, struct qinst *inst) { int32_t ip = inst->ip; @@ -954,26 +1074,39 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, * result to a temp), nothing else can be stored in r3/r4 across * it. */ - if (vir_writes_r3(c->devinfo, inst)) { + if (vir_writes_r3_implicitly(c->devinfo, inst)) { for (int i = 0; i < c->num_temps; i++) { if (c->temp_start[i] < ip && c->temp_end[i] > ip) { ra_add_node_interference(c->g, - temp_to_node(i), + temp_to_node(c, i), acc_nodes[3]); } } } - if (vir_writes_r4(c->devinfo, inst)) { + if (vir_writes_r4_implicitly(c->devinfo, inst)) { for (int i = 0; i < c->num_temps; i++) { if (c->temp_start[i] < ip && c->temp_end[i] > ip) { ra_add_node_interference(c->g, - temp_to_node(i), + temp_to_node(c, i), acc_nodes[4]); } } } + /* If any instruction writes to a physical register implicitly + * nothing else can write the same register across it. + */ + if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + implicit_rf_nodes[0]); + } + } + } + if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { switch (inst->qpu.alu.add.op) { case V3D_QPU_A_LDVPMV_IN: @@ -987,7 +1120,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, * decides whether the LDVPM is in or out) */ assert(inst->dst.file == QFILE_TEMP); - set_temp_class_bits(&c->nodes, inst->dst.index, + set_temp_class_bits(c, inst->dst.index, CLASS_BITS_PHYS); break; } @@ -1002,7 +1135,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, * phys regfile. */ assert(inst->dst.file == QFILE_TEMP); - set_temp_class_bits(&c->nodes, inst->dst.index, + set_temp_class_bits(c, inst->dst.index, CLASS_BITS_PHYS); break; } @@ -1015,6 +1148,11 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, if (inst->src[0].file == QFILE_REG) { switch (inst->src[0].index) { case 0: + /* V3D 7.x doesn't use rf0 for thread payload */ + if (c->devinfo->ver >= 71) + break; + else + FALLTHROUGH; case 1: case 2: case 3: { @@ -1024,14 +1162,34 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, */ assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV); assert(inst->dst.file == QFILE_TEMP); - uint32_t node = temp_to_node(inst->dst.index); + uint32_t node = temp_to_node(c, inst->dst.index); ra_set_node_reg(c->g, node, - PHYS_INDEX + inst->src[0].index); + get_phys_index(c->devinfo) + + inst->src[0].index); break; } } } + /* Don't allocate rf0 to temps that cross ranges where we have + * live implicit rf0 writes from ldvary. We can identify these + * by tracking the last ldvary instruction and explicit reads + * of rf0. 
+ */ + if (c->devinfo->ver >= 71 && + ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) || + (vir_get_nsrc(inst) > 1 && + inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && + c->temp_end[i] > last_ldvary_ip) { + ra_add_node_interference(c->g, + temp_to_node(c, i), + implicit_rf_nodes[0]); + } + } + } + if (inst->dst.file == QFILE_TEMP) { /* Only a ldunif gets to write to R5, which only has a * single 32-bit channel of storage. @@ -1041,36 +1199,95 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes, * because ldunif has usually a shorter lifespan, allowing for * more accumulator reuse and QPU merges. */ - if (!inst->qpu.sig.ldunif) { - uint8_t class_bits = - get_temp_class_bits(&c->nodes, inst->dst.index) & - ~CLASS_BITS_R5; - set_temp_class_bits(&c->nodes, inst->dst.index, - class_bits); - + if (c->devinfo->has_accumulators) { + if (!inst->qpu.sig.ldunif) { + uint8_t class_bits = + get_temp_class_bits(c, inst->dst.index) & + ~CLASS_BITS_R5; + set_temp_class_bits(c, inst->dst.index, + class_bits); + + } else { + /* Until V3D 4.x, we could only load a uniform + * to r5, so we'll need to spill if uniform + * loads interfere with each other. + */ + if (c->devinfo->ver < 40) { + set_temp_class_bits(c, inst->dst.index, + CLASS_BITS_R5); + } + } } else { - /* Until V3D 4.x, we could only load a uniform - * to r5, so we'll need to spill if uniform - * loads interfere with each other. + /* Make sure we don't allocate the ldvary's + * destination to rf0, since it would clash + * with its implicit write to that register. + */ + if (inst->qpu.sig.ldvary) { + ra_add_node_interference(c->g, + temp_to_node(c, inst->dst.index), + implicit_rf_nodes[0]); + } + /* Flag dst temps from ldunif(a) instructions + * so we can try to assign rf0 to them and avoid + * converting these to ldunif(a)rf. */ - if (c->devinfo->ver < 40) { - set_temp_class_bits(&c->nodes, inst->dst.index, - CLASS_BITS_R5); + if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) { + const uint32_t dst_n = + temp_to_node(c, inst->dst.index); + c->nodes.info[dst_n].is_ldunif_dst = true; } } } /* All accumulators are invalidated across a thread switch. 
*/ - if (inst->qpu.sig.thrsw) { + if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) { for (int i = 0; i < c->num_temps; i++) { if (c->temp_start[i] < ip && c->temp_end[i] > ip) { - set_temp_class_bits(&c->nodes, i, + set_temp_class_bits(c, i, CLASS_BITS_PHYS); } } } } +static void +flag_program_end_nodes(struct v3d_compile *c) +{ + /* Only look for registers used in this many instructions */ + uint32_t last_set_count = 6; + + struct qblock *last_block = vir_exit_block(c); + list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) { + if (!inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) + continue; + + int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op); + for (int i = 0; i < num_src; i++) { + if (inst->src[i].file == QFILE_TEMP) { + int node = temp_to_node(c, inst->src[i].index); + c->nodes.info[node].is_program_end = true; + } + } + + num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op); + for (int i = 0; i < num_src; i++) { + if (inst->src[i].file == QFILE_TEMP) { + int node = temp_to_node(c, inst->src[i].index); + c->nodes.info[node].is_program_end = true; + + } + } + + if (inst->dst.file == QFILE_TEMP) { + int node = temp_to_node(c, inst->dst.index); + c->nodes.info[node].is_program_end = true; + } + + if (--last_set_count == 0) + break; + } +} + /** * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. * @@ -1080,19 +1297,32 @@ struct qpu_reg * v3d_register_allocate(struct v3d_compile *c) { int acc_nodes[ACC_COUNT]; + int implicit_rf_nodes[IMPLICIT_RF_COUNT]; + + unsigned num_ra_nodes = c->num_temps; + if (c->devinfo->has_accumulators) + num_ra_nodes += ARRAY_SIZE(acc_nodes); + else + num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes); + c->nodes = (struct v3d_ra_node_info) { .alloc_count = c->num_temps, .info = ralloc_array_size(c, sizeof(c->nodes.info[0]), - c->num_temps + ACC_COUNT), + num_ra_nodes), }; + uint32_t phys_index = get_phys_index(c->devinfo); + struct v3d_ra_select_callback_data callback_data = { + .phys_index = phys_index, .next_acc = 0, /* Start at RF3, to try to keep the TLB writes from using - * RF0-2. + * RF0-2. Start at RF4 in 7.x to prevent TLB writes from + * using RF2-3. */ - .next_phys = 3, + .next_phys = c->devinfo->ver <= 42 ? 3 : 4, .nodes = &c->nodes, + .devinfo = c->devinfo, }; vir_calculate_live_intervals(c); @@ -1108,27 +1338,35 @@ v3d_register_allocate(struct v3d_compile *c) c->thread_index--; } - c->g = ra_alloc_interference_graph(c->compiler->regs, - c->num_temps + ARRAY_SIZE(acc_nodes)); + c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes); ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data); /* Make some fixed nodes for the accumulators, which we will need to * interfere with when ops have implied r3/r4 writes or for the thread * switches. We could represent these as classes for the nodes to * live in, but the classes take up a lot of memory to set up, so we - * don't want to make too many. + * don't want to make too many. We use the same mechanism on platforms + * without accumulators that can have implicit writes to phys regs. 
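+ * The implicit_rf_nodes below are pinned to the first physical
+ * registers, starting at rf0.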
*/ - for (uint32_t i = 0; i < ACC_COUNT + c->num_temps; i++) { - if (i < ACC_COUNT) { + for (uint32_t i = 0; i < num_ra_nodes; i++) { + c->nodes.info[i].is_ldunif_dst = false; + c->nodes.info[i].is_program_end = false; + c->nodes.info[i].unused = false; + c->nodes.info[i].priority = 0; + c->nodes.info[i].class_bits = 0; + if (c->devinfo->has_accumulators && i < ACC_COUNT) { acc_nodes[i] = i; ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i); - c->nodes.info[i].priority = 0; - c->nodes.info[i].class_bits = 0; + } else if (!c->devinfo->has_accumulators && + i < ARRAY_SIZE(implicit_rf_nodes)) { + implicit_rf_nodes[i] = i; + ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i); } else { - uint32_t t = node_to_temp(i); + uint32_t t = node_to_temp(c, i); c->nodes.info[i].priority = c->temp_end[t] - c->temp_start[t]; - c->nodes.info[i].class_bits = CLASS_BITS_ANY; + c->nodes.info[i].class_bits = + get_class_bit_any(c->devinfo); } } @@ -1136,25 +1374,61 @@ v3d_register_allocate(struct v3d_compile *c) * interferences. */ int ip = 0; + int last_ldvary_ip = -1; vir_for_each_inst_inorder(inst, c) { inst->ip = ip++; - update_graph_and_reg_classes_for_inst(c, acc_nodes, inst); + + /* ldunif(a) always write to a temporary, so we have + * liveness info available to decide if rf0 is + * available for them, however, ldvary is different: + * it always writes to rf0 directly so we don't have + * liveness information for its implicit rf0 write. + * + * That means the allocator may assign rf0 to a temp + * that is defined while an implicit rf0 write from + * ldvary is still live. We fix that by manually + * tracking rf0 live ranges from ldvary instructions. + */ + if (inst->qpu.sig.ldvary) + last_ldvary_ip = ip; + + update_graph_and_reg_classes_for_inst(c, acc_nodes, + implicit_rf_nodes, + last_ldvary_ip, inst); } + /* Flag the nodes that are used in the last instructions of the program + * (there are some registers that cannot be used in the last 3 + * instructions). We only do this for fragment shaders, because the idea + * is that by avoiding this conflict we may be able to emit the last + * thread switch earlier in some cases, however, in non-fragment shaders + * this won't happen because the last instructions are always VPM stores + * with a small immediate, which conflicts with other signals, + * preventing us from ever moving the thrsw earlier. + */ + if (c->s->info.stage == MESA_SHADER_FRAGMENT) + flag_program_end_nodes(c); + /* Set the register classes for all our temporaries in the graph */ for (uint32_t i = 0; i < c->num_temps; i++) { - ra_set_node_class(c->g, temp_to_node(i), + ra_set_node_class(c->g, temp_to_node(c, i), choose_reg_class_for_temp(c, i)); } /* Add register interferences based on liveness data */ for (uint32_t i = 0; i < c->num_temps; i++) { + /* And while we are here, let's also flag nodes for + * unused temps. 
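+ * A temp with an empty live range (start > end) gets its node
+ * flagged as unused.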
+ */ + if (c->temp_start[i] > c->temp_end[i]) + c->nodes.info[temp_to_node(c, i)].unused = true; + for (uint32_t j = i + 1; j < c->num_temps; j++) { if (interferes(c->temp_start[i], c->temp_end[i], c->temp_start[j], c->temp_end[j])) { ra_add_node_interference(c->g, - temp_to_node(i), - temp_to_node(j)); + temp_to_node(c, i), + temp_to_node(c, j)); } } } @@ -1171,9 +1445,9 @@ v3d_register_allocate(struct v3d_compile *c) if (c->spill_size < V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) { int node = v3d_choose_spill_node(c); - uint32_t temp = node_to_temp(node); + uint32_t temp = node_to_temp(c, node); if (node != -1) { - v3d_spill_reg(c, acc_nodes, temp); + v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); continue; } } @@ -1186,11 +1460,11 @@ v3d_register_allocate(struct v3d_compile *c) if (node == -1) goto spill_fail; - uint32_t temp = node_to_temp(node); + uint32_t temp = node_to_temp(c, node); enum temp_spill_type spill_type = get_spill_type_for_temp(c, temp); if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) { - v3d_spill_reg(c, acc_nodes, temp); + v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp); if (c->spills + c->fills > c->max_tmu_spills) goto spill_fail; } else { @@ -1201,14 +1475,14 @@ v3d_register_allocate(struct v3d_compile *c) /* Allocation was successful, build the 'temp -> reg' map */ temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); for (uint32_t i = 0; i < c->num_temps; i++) { - int ra_reg = ra_get_node_reg(c->g, temp_to_node(i)); - if (ra_reg < PHYS_INDEX) { + int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i)); + if (ra_reg < phys_index) { temp_registers[i].magic = true; temp_registers[i].index = (V3D_QPU_WADDR_R0 + ra_reg - ACC_INDEX); } else { temp_registers[i].magic = false; - temp_registers[i].index = ra_reg - PHYS_INDEX; + temp_registers[i].index = ra_reg - phys_index; } } diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c index 45e6bfa1470..4ed184cbbcb 100644 --- a/src/broadcom/compiler/vir_to_qpu.c +++ b/src/broadcom/compiler/vir_to_qpu.c @@ -86,15 +86,32 @@ new_qpu_nop_before(struct qinst *inst) return q; } +static void +v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src) +{ + /* If we have a small immediate move it from inst->raddr_b to the + * corresponding raddr. + */ + if (src.smimm) { + assert(instr->sig.small_imm_a || instr->sig.small_imm_b || + instr->sig.small_imm_c || instr->sig.small_imm_d); + *raddr = instr->raddr_b; + return; + } + + assert(!src.magic); + *raddr = src.index; +} + /** * Allocates the src register (accumulator or register file) into the RADDR * fields of the instruction. 
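+ * (v3d71_set_src above stores the register index straight into the given
+ * raddr field; v3d33_set_src below still has to pick a mux and fill
+ * raddr_a/raddr_b.)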
*/ static void -set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) +v3d33_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) { if (src.smimm) { - assert(instr->sig.small_imm); + assert(instr->sig.small_imm_b); *mux = V3D_QPU_MUX_B; return; } @@ -106,20 +123,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) return; } - if (instr->alu.add.a != V3D_QPU_MUX_A && - instr->alu.add.b != V3D_QPU_MUX_A && - instr->alu.mul.a != V3D_QPU_MUX_A && - instr->alu.mul.b != V3D_QPU_MUX_A) { + if (instr->alu.add.a.mux != V3D_QPU_MUX_A && + instr->alu.add.b.mux != V3D_QPU_MUX_A && + instr->alu.mul.a.mux != V3D_QPU_MUX_A && + instr->alu.mul.b.mux != V3D_QPU_MUX_A) { instr->raddr_a = src.index; *mux = V3D_QPU_MUX_A; } else { if (instr->raddr_a == src.index) { *mux = V3D_QPU_MUX_A; } else { - assert(!(instr->alu.add.a == V3D_QPU_MUX_B && - instr->alu.add.b == V3D_QPU_MUX_B && - instr->alu.mul.a == V3D_QPU_MUX_B && - instr->alu.mul.b == V3D_QPU_MUX_B) || + assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B && + instr->alu.add.b.mux == V3D_QPU_MUX_B && + instr->alu.mul.a.mux == V3D_QPU_MUX_B && + instr->alu.mul.b.mux == V3D_QPU_MUX_B) || src.index == instr->raddr_b); instr->raddr_b = src.index; @@ -128,33 +145,40 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src) } } -static bool -is_no_op_mov(struct qinst *qinst) +/* + * The main purpose of the following wrapper is to make calling set_src + * cleaner. This is the reason it receives both mux and raddr pointers. Those + * will be filled or not based on the device version. + */ +static void +set_src(struct v3d_qpu_instr *instr, + enum v3d_qpu_mux *mux, + uint8_t *raddr, + struct qpu_reg src, + const struct v3d_device_info *devinfo) { - static const struct v3d_qpu_sig no_sig = {0}; - - /* Make sure it's just a lone MOV. */ - if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || - qinst->qpu.alu.mul.op != V3D_QPU_M_MOV || - qinst->qpu.alu.add.op != V3D_QPU_A_NOP || - memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) { - return false; - } + if (devinfo->ver < 71) + return v3d33_set_src(instr, mux, src); + else + return v3d71_set_src(instr, raddr, src); +} - /* Check if it's a MOV from a register to itself. 
 */
+static bool
+v3d33_mov_src_and_dst_equal(struct qinst *qinst)
+{
         enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
         if (qinst->qpu.alu.mul.magic_write) {
                 if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
                         return false;
-                if (qinst->qpu.alu.mul.a !=
+                if (qinst->qpu.alu.mul.a.mux !=
                     V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) {
                         return false;
                 }
         } else {
                 int raddr;
-                switch (qinst->qpu.alu.mul.a) {
+                switch (qinst->qpu.alu.mul.a.mux) {
                 case V3D_QPU_MUX_A:
                         raddr = qinst->qpu.raddr_a;
                         break;
@@ -168,10 +192,61 @@ is_no_op_mov(struct qinst *qinst)
                         return false;
         }
+        return true;
+}
+
+static bool
+v3d71_mov_src_and_dst_equal(struct qinst *qinst)
+{
+        if (qinst->qpu.alu.mul.magic_write)
+                return false;
+
+        enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
+        int raddr;
+
+        raddr = qinst->qpu.alu.mul.a.raddr;
+        if (raddr != waddr)
+                return false;
+
+        return true;
+}
+
+static bool
+mov_src_and_dst_equal(struct qinst *qinst,
+                      const struct v3d_device_info *devinfo)
+{
+        if (devinfo->ver < 71)
+                return v3d33_mov_src_and_dst_equal(qinst);
+        else
+                return v3d71_mov_src_and_dst_equal(qinst);
+}
+
+
+static bool
+is_no_op_mov(struct qinst *qinst,
+             const struct v3d_device_info *devinfo)
+{
+        static const struct v3d_qpu_sig no_sig = {0};
+
+        /* Make sure it's just a lone MOV. We only check for M_MOV. Although
+         * for V3D 7.x there is also A_MOV, we don't need to check for it as
+         * we always emit using M_MOV. We could use A_MOV later in the
+         * schedule to improve performance.
+         */
+        if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
+            qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
+            qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
+            memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
+                return false;
+        }
+
+        if (!mov_src_and_dst_equal(qinst, devinfo))
+                return false;
+
         /* No packing or flags updates, or we need to execute the
          * instruction.
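+         * (any unpack mode, output pack, or mul condition/flag update
+         * checked below means the MOV still has a visible effect.)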
*/ - if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || + if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE || qinst->qpu.flags.mc != V3D_QPU_COND_NONE || qinst->qpu.flags.mpf != V3D_QPU_PF_NONE || @@ -277,8 +352,15 @@ v3d_generate_code_block(struct v3d_compile *c, assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP); assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); - if (!dst.magic || - dst.index != V3D_QPU_WADDR_R5) { + bool use_rf; + if (c->devinfo->has_accumulators) { + use_rf = !dst.magic || + dst.index != V3D_QPU_WADDR_R5; + } else { + use_rf = dst.magic || dst.index != 0; + } + + if (use_rf) { assert(c->devinfo->ver >= 40); if (qinst->qpu.sig.ldunif) { @@ -300,13 +382,18 @@ v3d_generate_code_block(struct v3d_compile *c, qinst->qpu.sig_magic = dst.magic; } else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) { assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); + if (nsrc >= 1) { set_src(&qinst->qpu, - &qinst->qpu.alu.add.a, src[0]); + &qinst->qpu.alu.add.a.mux, + &qinst->qpu.alu.add.a.raddr, + src[0], c->devinfo); } if (nsrc >= 2) { set_src(&qinst->qpu, - &qinst->qpu.alu.add.b, src[1]); + &qinst->qpu.alu.add.b.mux, + &qinst->qpu.alu.add.b.raddr, + src[1], c->devinfo); } qinst->qpu.alu.add.waddr = dst.index; @@ -314,17 +401,21 @@ v3d_generate_code_block(struct v3d_compile *c, } else { if (nsrc >= 1) { set_src(&qinst->qpu, - &qinst->qpu.alu.mul.a, src[0]); + &qinst->qpu.alu.mul.a.mux, + &qinst->qpu.alu.mul.a.raddr, + src[0], c->devinfo); } if (nsrc >= 2) { set_src(&qinst->qpu, - &qinst->qpu.alu.mul.b, src[1]); + &qinst->qpu.alu.mul.b.mux, + &qinst->qpu.alu.mul.b.raddr, + src[1], c->devinfo); } qinst->qpu.alu.mul.waddr = dst.index; qinst->qpu.alu.mul.magic_write = dst.magic; - if (is_no_op_mov(qinst)) { + if (is_no_op_mov(qinst, c->devinfo)) { vir_remove_instruction(c, qinst); continue; } diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build index 2c10e46b188..73cb7aa0575 100644 --- a/src/broadcom/meson.build +++ b/src/broadcom/meson.build @@ -22,7 +22,7 @@ inc_broadcom = include_directories('.', 'cle') subdir('cle') -v3d_versions = ['33', '41', '42'] +v3d_versions = ['33', '41', '42', '71'] v3d_libs = [] if with_gallium_v3d or with_broadcom_vk diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c index 28fb2357b97..c1590a760de 100644 --- a/src/broadcom/qpu/qpu_disasm.c +++ b/src/broadcom/qpu/qpu_disasm.c @@ -56,13 +56,14 @@ pad_to(struct disasm_state *disasm, int n) static void -v3d_qpu_disasm_raddr(struct disasm_state *disasm, - const struct v3d_qpu_instr *instr, uint8_t mux) +v3d33_qpu_disasm_raddr(struct disasm_state *disasm, + const struct v3d_qpu_instr *instr, + enum v3d_qpu_mux mux) { if (mux == V3D_QPU_MUX_A) { append(disasm, "rf%d", instr->raddr_a); } else if (mux == V3D_QPU_MUX_B) { - if (instr->sig.small_imm) { + if (instr->sig.small_imm_b) { uint32_t val; ASSERTED bool ok = v3d_qpu_small_imm_unpack(disasm->devinfo, @@ -82,6 +83,64 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm, } } +enum v3d_qpu_input_class { + V3D_QPU_ADD_A, + V3D_QPU_ADD_B, + V3D_QPU_MUL_A, + V3D_QPU_MUL_B +}; + +static void +v3d71_qpu_disasm_raddr(struct disasm_state *disasm, + const struct v3d_qpu_instr *instr, + uint8_t raddr, + enum v3d_qpu_input_class input_class) +{ + bool is_small_imm = false; + switch(input_class) { + case V3D_QPU_ADD_A: + is_small_imm = instr->sig.small_imm_a; + break; + case V3D_QPU_ADD_B: + is_small_imm = instr->sig.small_imm_b; + break; + case V3D_QPU_MUL_A: + is_small_imm = 
instr->sig.small_imm_c; + break; + case V3D_QPU_MUL_B: + is_small_imm = instr->sig.small_imm_d; + break; + } + + if (is_small_imm) { + uint32_t val; + ASSERTED bool ok = + v3d_qpu_small_imm_unpack(disasm->devinfo, + raddr, + &val); + + if ((int)val >= -16 && (int)val <= 15) + append(disasm, "%d", val); + else + append(disasm, "0x%08x", val); + assert(ok); + } else { + append(disasm, "rf%d", raddr); + } +} + +static void +v3d_qpu_disasm_raddr(struct disasm_state *disasm, + const struct v3d_qpu_instr *instr, + const struct v3d_qpu_input *input, + enum v3d_qpu_input_class input_class) +{ + if (disasm->devinfo->ver < 71) + v3d33_qpu_disasm_raddr(disasm, instr, input->mux); + else + v3d71_qpu_disasm_raddr(disasm, instr, input->raddr, input_class); +} + static void v3d_qpu_disasm_waddr(struct disasm_state *disasm, uint32_t waddr, bool magic) { @@ -121,16 +180,16 @@ v3d_qpu_disasm_add(struct disasm_state *disasm, if (num_src >= 1) { if (has_dst) append(disasm, ", "); - v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a); + v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.a, V3D_QPU_ADD_A); append(disasm, "%s", - v3d_qpu_unpack_name(instr->alu.add.a_unpack)); + v3d_qpu_unpack_name(instr->alu.add.a.unpack)); } if (num_src >= 2) { append(disasm, ", "); - v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b); + v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.b, V3D_QPU_ADD_B); append(disasm, "%s", - v3d_qpu_unpack_name(instr->alu.add.b_unpack)); + v3d_qpu_unpack_name(instr->alu.add.b.unpack)); } } @@ -164,16 +223,16 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm, if (num_src >= 1) { if (has_dst) append(disasm, ", "); - v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a); + v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.a, V3D_QPU_MUL_A); append(disasm, "%s", - v3d_qpu_unpack_name(instr->alu.mul.a_unpack)); + v3d_qpu_unpack_name(instr->alu.mul.a.unpack)); } if (num_src >= 2) { append(disasm, ", "); - v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b); + v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.b, V3D_QPU_MUL_B); append(disasm, "%s", - v3d_qpu_unpack_name(instr->alu.mul.b_unpack)); + v3d_qpu_unpack_name(instr->alu.mul.b.unpack)); } } diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c index 60dabf74e8e..44f20618a5a 100644 --- a/src/broadcom/qpu/qpu_instr.c +++ b/src/broadcom/qpu/qpu_instr.c @@ -35,6 +35,14 @@ v3d_qpu_magic_waddr_name(const struct v3d_device_info *devinfo, if (devinfo->ver < 40 && waddr == V3D_QPU_WADDR_TMU) return "tmu"; + /* V3D 7.x QUAD and REP aliases R5 and R5REPT in the table below + */ + if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_QUAD) + return "quad"; + + if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_REP) + return "rep"; + static const char *waddr_magic[] = { [V3D_QPU_WADDR_R0] = "r0", [V3D_QPU_WADDR_R1] = "r1", @@ -169,6 +177,12 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op) [V3D_QPU_A_ITOF] = "itof", [V3D_QPU_A_CLZ] = "clz", [V3D_QPU_A_UTOF] = "utof", + [V3D_QPU_A_MOV] = "mov", + [V3D_QPU_A_FMOV] = "fmov", + [V3D_QPU_A_VPACK] = "vpack", + [V3D_QPU_A_V8PACK] = "v8pack", + [V3D_QPU_A_V10PACK] = "v10pack", + [V3D_QPU_A_V11FPACK] = "v11fpack", }; if (op >= ARRAY_SIZE(op_names)) @@ -191,6 +205,12 @@ v3d_qpu_mul_op_name(enum v3d_qpu_mul_op op) [V3D_QPU_M_MOV] = "mov", [V3D_QPU_M_NOP] = "nop", [V3D_QPU_M_FMUL] = "fmul", + [V3D_QPU_M_FTOUNORM16] = "ftounorm16", + [V3D_QPU_M_FTOSNORM16] = "ftosnorm16", + [V3D_QPU_M_VFTOUNORM8] = "vftounorm8", + [V3D_QPU_M_VFTOSNORM8] = "vftosnorm8", + [V3D_QPU_M_VFTOUNORM10LO] = 
"vftounorm10lo", + [V3D_QPU_M_VFTOUNORM10HI] = "vftounorm10hi", }; if (op >= ARRAY_SIZE(op_names)) @@ -450,6 +470,13 @@ static const uint8_t add_op_args[] = { [V3D_QPU_A_ITOF] = D | A, [V3D_QPU_A_CLZ] = D | A, [V3D_QPU_A_UTOF] = D | A, + + [V3D_QPU_A_MOV] = D | A, + [V3D_QPU_A_FMOV] = D | A, + [V3D_QPU_A_VPACK] = D | A | B, + [V3D_QPU_A_V8PACK] = D | A | B, + [V3D_QPU_A_V10PACK] = D | A | B, + [V3D_QPU_A_V11FPACK] = D | A | B, }; static const uint8_t mul_op_args[] = { @@ -463,6 +490,12 @@ static const uint8_t mul_op_args[] = { [V3D_QPU_M_NOP] = 0, [V3D_QPU_M_MOV] = D | A, [V3D_QPU_M_FMUL] = D | A | B, + [V3D_QPU_M_FTOUNORM16] = D | A, + [V3D_QPU_M_FTOSNORM16] = D | A, + [V3D_QPU_M_VFTOUNORM8] = D | A, + [V3D_QPU_M_VFTOSNORM8] = D | A, + [V3D_QPU_M_VFTOUNORM10LO] = D | A, + [V3D_QPU_M_VFTOUNORM10HI] = D | A, }; bool @@ -636,12 +669,14 @@ v3d_qpu_add_op_writes_vpm(enum v3d_qpu_add_op op) } bool -v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) +v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) { - if (inst->sig.ldtlb || - inst->sig.ldtlbu) - return true; + return inst->sig.ldtlb || inst->sig.ldtlbu; +} +bool +v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) +{ if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.op != V3D_QPU_A_NOP && inst->alu.add.magic_write && @@ -659,6 +694,12 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) return false; } +bool +v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) +{ + return v3d_qpu_writes_tlb(inst) || v3d_qpu_reads_tlb(inst); +} + bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) { @@ -846,6 +887,9 @@ bool v3d_qpu_writes_r3(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { + if(!devinfo->has_accumulators) + return false; + if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R3)) return true; @@ -856,6 +900,9 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { + if (!devinfo->has_accumulators) + return false; + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.op != V3D_QPU_A_NOP && inst->alu.add.magic_write && @@ -886,6 +933,9 @@ bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { + if (!devinfo->has_accumulators) + return false; + if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R5)) return true; @@ -896,6 +946,9 @@ bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { + if (!devinfo->has_accumulators) + return false; + if (v3d_qpu_writes_r5(devinfo, inst)) return true; if (v3d_qpu_writes_r4(devinfo, inst)) @@ -912,16 +965,68 @@ v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, return false; } +bool +v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst) +{ + if (devinfo->ver >= 71 && + (inst->sig.ldvary || inst->sig.ldunif || inst->sig.ldunifa)) { + return true; + } + + return false; +} + bool v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux) { int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op); int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op); - return ((add_nsrc > 0 && inst->alu.add.a == mux) || - (add_nsrc > 1 && inst->alu.add.b == mux) || - (mul_nsrc > 0 && inst->alu.mul.a == mux) || - (mul_nsrc > 1 && inst->alu.mul.b == mux)); + return ((add_nsrc > 0 && inst->alu.add.a.mux == mux) || + (add_nsrc > 1 && inst->alu.add.b.mux == mux) || + (mul_nsrc > 0 && inst->alu.mul.a.mux == mux) || + (mul_nsrc > 1 && 
inst->alu.mul.b.mux == mux)); +} + +bool +v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr) +{ + int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op); + int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op); + + return (add_nsrc > 0 && !inst->sig.small_imm_a && inst->alu.add.a.raddr == raddr) || + (add_nsrc > 1 && !inst->sig.small_imm_b && inst->alu.add.b.raddr == raddr) || + (mul_nsrc > 0 && !inst->sig.small_imm_c && inst->alu.mul.a.raddr == raddr) || + (mul_nsrc > 1 && !inst->sig.small_imm_d && inst->alu.mul.b.raddr == raddr); +} + +bool +v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst, + uint8_t waddr) +{ + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return false; + + if (v3d_qpu_add_op_has_dst(inst->alu.add.op) && + !inst->alu.add.magic_write && + inst->alu.add.waddr == waddr) { + return true; + } + + if (v3d_qpu_mul_op_has_dst(inst->alu.mul.op) && + !inst->alu.mul.magic_write && + inst->alu.mul.waddr == waddr) { + return true; + } + + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && + !inst->sig_magic && inst->sig_addr == waddr) { + return true; + } + + return false; } bool diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h index 2e133472698..56eee9f9cac 100644 --- a/src/broadcom/qpu/qpu_instr.h +++ b/src/broadcom/qpu/qpu_instr.h @@ -50,10 +50,13 @@ struct v3d_qpu_sig { bool ldvpm:1; bool ldtlb:1; bool ldtlbu:1; - bool small_imm:1; bool ucb:1; bool rotate:1; bool wrtmuc:1; + bool small_imm_a:1; /* raddr_a (add a), since V3D 7.x */ + bool small_imm_b:1; /* raddr_b (add b) */ + bool small_imm_c:1; /* raddr_c (mul a), since V3D 7.x */ + bool small_imm_d:1; /* raddr_d (mul b), since V3D 7.x */ }; enum v3d_qpu_cond { @@ -88,12 +91,13 @@ enum v3d_qpu_uf { }; enum v3d_qpu_waddr { - V3D_QPU_WADDR_R0 = 0, - V3D_QPU_WADDR_R1 = 1, - V3D_QPU_WADDR_R2 = 2, - V3D_QPU_WADDR_R3 = 3, - V3D_QPU_WADDR_R4 = 4, - V3D_QPU_WADDR_R5 = 5, + V3D_QPU_WADDR_R0 = 0, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R1 = 1, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_R5 = 5, /* V3D 4.x */ + V3D_QPU_WADDR_QUAD = 5, /* V3D 7.x */ V3D_QPU_WADDR_NOP = 6, V3D_QPU_WADDR_TLB = 7, V3D_QPU_WADDR_TLBU = 8, @@ -108,12 +112,12 @@ enum v3d_qpu_waddr { V3D_QPU_WADDR_SYNC = 16, V3D_QPU_WADDR_SYNCU = 17, V3D_QPU_WADDR_SYNCB = 18, - V3D_QPU_WADDR_RECIP = 19, - V3D_QPU_WADDR_RSQRT = 20, - V3D_QPU_WADDR_EXP = 21, - V3D_QPU_WADDR_LOG = 22, - V3D_QPU_WADDR_SIN = 23, - V3D_QPU_WADDR_RSQRT2 = 24, + V3D_QPU_WADDR_RECIP = 19, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_RSQRT = 20, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_EXP = 21, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_LOG = 22, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_SIN = 23, /* Reserved on V3D 7.x */ + V3D_QPU_WADDR_RSQRT2 = 24, /* Reserved on V3D 7.x */ V3D_QPU_WADDR_TMUC = 32, V3D_QPU_WADDR_TMUS = 33, V3D_QPU_WADDR_TMUT = 34, @@ -129,7 +133,8 @@ enum v3d_qpu_waddr { V3D_QPU_WADDR_TMUHSCM = 44, V3D_QPU_WADDR_TMUHSF = 45, V3D_QPU_WADDR_TMUHSLOD = 46, - V3D_QPU_WADDR_R5REP = 55, + V3D_QPU_WADDR_R5REP = 55, /* V3D 4.x */ + V3D_QPU_WADDR_REP = 55, /* V3D 7.x */ }; struct v3d_qpu_flags { @@ -222,6 +227,14 @@ enum v3d_qpu_add_op { V3D_QPU_A_ITOF, V3D_QPU_A_CLZ, V3D_QPU_A_UTOF, + + /* V3D 7.x */ + V3D_QPU_A_FMOV, + V3D_QPU_A_MOV, + V3D_QPU_A_VPACK, + V3D_QPU_A_V8PACK, + V3D_QPU_A_V10PACK, + V3D_QPU_A_V11FPACK, }; enum v3d_qpu_mul_op { @@ 
-235,6 +248,14 @@ enum v3d_qpu_mul_op { V3D_QPU_M_MOV, V3D_QPU_M_NOP, V3D_QPU_M_FMUL, + + /* V3D 7.x */ + V3D_QPU_M_FTOUNORM16, + V3D_QPU_M_FTOSNORM16, + V3D_QPU_M_VFTOUNORM8, + V3D_QPU_M_VFTOSNORM8, + V3D_QPU_M_VFTOUNORM10LO, + V3D_QPU_M_VFTOUNORM10HI, }; enum v3d_qpu_output_pack { @@ -276,6 +297,15 @@ enum v3d_qpu_input_unpack { /** Swap high and low 16 bits */ V3D_QPU_UNPACK_SWAP_16, + + /** Convert low 16 bits from 16-bit integer to unsigned 32-bit int */ + V3D_QPU_UNPACK_UL, + /** Convert high 16 bits from 16-bit integer to unsigned 32-bit int */ + V3D_QPU_UNPACK_UH, + /** Convert low 16 bits from 16-bit integer to signed 32-bit int */ + V3D_QPU_UNPACK_IL, + /** Convert high 16 bits from 16-bit integer to signed 32-bit int */ + V3D_QPU_UNPACK_IH, }; enum v3d_qpu_mux { @@ -289,25 +319,29 @@ enum v3d_qpu_mux { V3D_QPU_MUX_B, }; +struct v3d_qpu_input { + union { + enum v3d_qpu_mux mux; /* V3D 4.x */ + uint8_t raddr; /* V3D 7.x */ + }; + enum v3d_qpu_input_unpack unpack; +}; + struct v3d_qpu_alu_instr { struct { enum v3d_qpu_add_op op; - enum v3d_qpu_mux a, b; + struct v3d_qpu_input a, b; uint8_t waddr; bool magic_write; enum v3d_qpu_output_pack output_pack; - enum v3d_qpu_input_unpack a_unpack; - enum v3d_qpu_input_unpack b_unpack; } add; struct { enum v3d_qpu_mul_op op; - enum v3d_qpu_mux a, b; + struct v3d_qpu_input a, b; uint8_t waddr; bool magic_write; enum v3d_qpu_output_pack output_pack; - enum v3d_qpu_input_unpack a_unpack; - enum v3d_qpu_input_unpack b_unpack; } mul; }; @@ -379,8 +413,8 @@ struct v3d_qpu_instr { struct v3d_qpu_sig sig; uint8_t sig_addr; bool sig_magic; /* If the signal writes to a magic address */ - uint8_t raddr_a; - uint8_t raddr_b; + uint8_t raddr_a; /* V3D 4.x */ + uint8_t raddr_b; /* V3D 4.x (holds packed small immediate in 7.x too) */ struct v3d_qpu_flags flags; union { @@ -450,6 +484,8 @@ bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; +bool v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +bool v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; @@ -464,6 +500,8 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; +bool v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; @@ -483,4 +521,9 @@ bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; + +bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr); +bool v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *inst, + uint8_t 
waddr); #endif diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c index 94629aff4fc..4e3c3da8866 100644 --- a/src/broadcom/qpu/qpu_pack.c +++ b/src/broadcom/qpu/qpu_pack.c @@ -84,6 +84,9 @@ #define V3D_QPU_MUL_A_SHIFT 18 #define V3D_QPU_MUL_A_MASK QPU_MASK(20, 18) +#define V3D_QPU_RADDR_C_SHIFT 18 +#define V3D_QPU_RADDR_C_MASK QPU_MASK(23, 18) + #define V3D_QPU_ADD_B_SHIFT 15 #define V3D_QPU_ADD_B_MASK QPU_MASK(17, 15) @@ -98,6 +101,9 @@ #define V3D_QPU_BRANCH_BDI_SHIFT 12 #define V3D_QPU_BRANCH_BDI_MASK QPU_MASK(13, 12) +#define V3D_QPU_RADDR_D_SHIFT 12 +#define V3D_QPU_RADDR_D_MASK QPU_MASK(17, 12) + #define V3D_QPU_RADDR_A_SHIFT 6 #define V3D_QPU_RADDR_A_MASK QPU_MASK(11, 6) @@ -112,12 +118,15 @@ #define LDTMU .ldtmu = true #define LDVARY .ldvary = true #define LDVPM .ldvpm = true -#define SMIMM .small_imm = true #define LDTLB .ldtlb = true #define LDTLBU .ldtlbu = true #define UCB .ucb = true #define ROT .rotate = true #define WRTMUC .wrtmuc = true +#define SMIMM_A .small_imm_a = true +#define SMIMM_B .small_imm_b = true +#define SMIMM_C .small_imm_c = true +#define SMIMM_D .small_imm_d = true static const struct v3d_qpu_sig v33_sig_map[] = { /* MISC R3 R4 R5 */ @@ -135,8 +144,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = { [11] = { THRSW, LDVARY, LDUNIF }, [12] = { LDVARY, LDTMU, }, [13] = { THRSW, LDVARY, LDTMU, }, - [14] = { SMIMM, LDVARY, }, - [15] = { SMIMM, }, + [14] = { SMIMM_B, LDVARY, }, + [15] = { SMIMM_B, }, [16] = { LDTLB, }, [17] = { LDTLBU, }, /* 18-21 reserved */ @@ -148,8 +157,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = { [27] = { THRSW, LDVPM, LDUNIF }, [28] = { LDVPM, LDTMU, }, [29] = { THRSW, LDVPM, LDTMU, }, - [30] = { SMIMM, LDVPM, }, - [31] = { SMIMM, }, + [30] = { SMIMM_B, LDVPM, }, + [31] = { SMIMM_B, }, }; static const struct v3d_qpu_sig v40_sig_map[] = { @@ -167,8 +176,8 @@ static const struct v3d_qpu_sig v40_sig_map[] = { [10] = { LDVARY, LDUNIF }, [11] = { THRSW, LDVARY, LDUNIF }, /* 12-13 reserved */ - [14] = { SMIMM, LDVARY, }, - [15] = { SMIMM, }, + [14] = { SMIMM_B, LDVARY, }, + [15] = { SMIMM_B, }, [16] = { LDTLB, }, [17] = { LDTLBU, }, [18] = { WRTMUC }, @@ -178,7 +187,7 @@ static const struct v3d_qpu_sig v40_sig_map[] = { [22] = { UCB, }, [23] = { ROT, }, /* 24-30 reserved */ - [31] = { SMIMM, LDTMU, }, + [31] = { SMIMM_B, LDTMU, }, }; static const struct v3d_qpu_sig v41_sig_map[] = { @@ -197,8 +206,8 @@ static const struct v3d_qpu_sig v41_sig_map[] = { [11] = { THRSW, LDVARY, LDUNIF }, [12] = { LDUNIFRF }, [13] = { THRSW, LDUNIFRF }, - [14] = { SMIMM, LDVARY, }, - [15] = { SMIMM, }, + [14] = { SMIMM_B, LDVARY }, + [15] = { SMIMM_B, }, [16] = { LDTLB, }, [17] = { LDTLBU, }, [18] = { WRTMUC }, @@ -210,7 +219,41 @@ static const struct v3d_qpu_sig v41_sig_map[] = { [24] = { LDUNIFA}, [25] = { LDUNIFARF }, /* 26-30 reserved */ - [31] = { SMIMM, LDTMU, }, + [31] = { SMIMM_B, LDTMU, }, +}; + + +static const struct v3d_qpu_sig v71_sig_map[] = { + /* MISC phys RF0 */ + [0] = { }, + [1] = { THRSW, }, + [2] = { LDUNIF }, + [3] = { THRSW, LDUNIF }, + [4] = { LDTMU, }, + [5] = { THRSW, LDTMU, }, + [6] = { LDTMU, LDUNIF }, + [7] = { THRSW, LDTMU, LDUNIF }, + [8] = { LDVARY, }, + [9] = { THRSW, LDVARY, }, + [10] = { LDVARY, LDUNIF }, + [11] = { THRSW, LDVARY, LDUNIF }, + [12] = { LDUNIFRF }, + [13] = { THRSW, LDUNIFRF }, + [14] = { SMIMM_A, }, + [15] = { SMIMM_B, }, + [16] = { LDTLB, }, + [17] = { LDTLBU, }, + [18] = { WRTMUC }, + [19] = { THRSW, WRTMUC }, + [20] = { LDVARY, WRTMUC }, + [21] = { THRSW, LDVARY, WRTMUC }, + [22] = 
{ UCB, }, + /* 23 reserved */ + [24] = { LDUNIFA}, + [25] = { LDUNIFARF }, + /* 26-29 reserved */ + [30] = { SMIMM_C, }, + [31] = { SMIMM_D, }, }; bool @@ -221,7 +264,9 @@ v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo, if (packed_sig >= ARRAY_SIZE(v33_sig_map)) return false; - if (devinfo->ver >= 41) + if (devinfo->ver >= 71) + *sig = v71_sig_map[packed_sig]; + else if (devinfo->ver >= 41) *sig = v41_sig_map[packed_sig]; else if (devinfo->ver == 40) *sig = v40_sig_map[packed_sig]; @@ -240,7 +285,9 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo, { static const struct v3d_qpu_sig *map; - if (devinfo->ver >= 41) + if (devinfo->ver >= 71) + map = v71_sig_map; + else if (devinfo->ver >= 41) map = v41_sig_map; else if (devinfo->ver == 40) map = v40_sig_map; @@ -443,16 +490,26 @@ v3d_qpu_flags_pack(const struct v3d_device_info *devinfo, /* Make a mapping of the table of opcodes in the spec. The opcode is * determined by a combination of the opcode field, and in the case of 0 or - * 1-arg opcodes, the mux_b field as well. + * 1-arg opcodes, the mux (version <= 42) or raddr (version >= 71) field as + * well. */ -#define MUX_MASK(bot, top) (((1 << (top + 1)) - 1) - ((1 << (bot)) - 1)) -#define ANYMUX MUX_MASK(0, 7) +#define OP_MASK(val) BITFIELD64_BIT(val) +#define OP_RANGE(bot, top) BITFIELD64_RANGE(bot, top - bot + 1) +#define ANYMUX OP_RANGE(0, 7) +#define ANYOPMASK OP_RANGE(0, 63) struct opcode_desc { uint8_t opcode_first; uint8_t opcode_last; - uint8_t mux_b_mask; - uint8_t mux_a_mask; + + union { + struct { + uint8_t b_mask; + uint8_t a_mask; + } mux; + uint64_t raddr_mask; + }; + uint8_t op; /* first_ver == 0 if it's the same across all V3D versions. @@ -465,122 +522,321 @@ struct opcode_desc { uint8_t last_ver; }; -static const struct opcode_desc add_ops[] = { +static const struct opcode_desc add_ops_v33[] = { /* FADD is FADDNF depending on the order of the mux_a/mux_b. 
*/ - { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADD }, - { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADDNF }, - { 53, 55, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, - { 56, 56, ANYMUX, ANYMUX, V3D_QPU_A_ADD }, - { 57, 59, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, - { 60, 60, ANYMUX, ANYMUX, V3D_QPU_A_SUB }, - { 61, 63, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK }, - { 64, 111, ANYMUX, ANYMUX, V3D_QPU_A_FSUB }, - { 120, 120, ANYMUX, ANYMUX, V3D_QPU_A_MIN }, - { 121, 121, ANYMUX, ANYMUX, V3D_QPU_A_MAX }, - { 122, 122, ANYMUX, ANYMUX, V3D_QPU_A_UMIN }, - { 123, 123, ANYMUX, ANYMUX, V3D_QPU_A_UMAX }, - { 124, 124, ANYMUX, ANYMUX, V3D_QPU_A_SHL }, - { 125, 125, ANYMUX, ANYMUX, V3D_QPU_A_SHR }, - { 126, 126, ANYMUX, ANYMUX, V3D_QPU_A_ASR }, - { 127, 127, ANYMUX, ANYMUX, V3D_QPU_A_ROR }, + { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADD }, + { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADDNF }, + { 53, 55, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, + { 56, 56, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ADD }, + { 57, 59, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, + { 60, 60, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SUB }, + { 61, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK }, + { 64, 111, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FSUB }, + { 120, 120, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MIN }, + { 121, 121, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MAX }, + { 122, 122, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMIN }, + { 123, 123, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMAX }, + { 124, 124, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHL }, + { 125, 125, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHR }, + { 126, 126, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ASR }, + { 127, 127, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ROR }, /* FMIN is instead FMAX depending on the order of the mux_a/mux_b. 
*/ - { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMIN }, - { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMAX }, - { 176, 180, ANYMUX, ANYMUX, V3D_QPU_A_VFMIN }, - - { 181, 181, ANYMUX, ANYMUX, V3D_QPU_A_AND }, - { 182, 182, ANYMUX, ANYMUX, V3D_QPU_A_OR }, - { 183, 183, ANYMUX, ANYMUX, V3D_QPU_A_XOR }, - - { 184, 184, ANYMUX, ANYMUX, V3D_QPU_A_VADD }, - { 185, 185, ANYMUX, ANYMUX, V3D_QPU_A_VSUB }, - { 186, 186, 1 << 0, ANYMUX, V3D_QPU_A_NOT }, - { 186, 186, 1 << 1, ANYMUX, V3D_QPU_A_NEG }, - { 186, 186, 1 << 2, ANYMUX, V3D_QPU_A_FLAPUSH }, - { 186, 186, 1 << 3, ANYMUX, V3D_QPU_A_FLBPUSH }, - { 186, 186, 1 << 4, ANYMUX, V3D_QPU_A_FLPOP }, - { 186, 186, 1 << 5, ANYMUX, V3D_QPU_A_RECIP }, - { 186, 186, 1 << 6, ANYMUX, V3D_QPU_A_SETMSF }, - { 186, 186, 1 << 7, ANYMUX, V3D_QPU_A_SETREVF }, - { 187, 187, 1 << 0, 1 << 0, V3D_QPU_A_NOP, 0 }, - { 187, 187, 1 << 0, 1 << 1, V3D_QPU_A_TIDX }, - { 187, 187, 1 << 0, 1 << 2, V3D_QPU_A_EIDX }, - { 187, 187, 1 << 0, 1 << 3, V3D_QPU_A_LR }, - { 187, 187, 1 << 0, 1 << 4, V3D_QPU_A_VFLA }, - { 187, 187, 1 << 0, 1 << 5, V3D_QPU_A_VFLNA }, - { 187, 187, 1 << 0, 1 << 6, V3D_QPU_A_VFLB }, - { 187, 187, 1 << 0, 1 << 7, V3D_QPU_A_VFLNB }, - - { 187, 187, 1 << 1, MUX_MASK(0, 2), V3D_QPU_A_FXCD }, - { 187, 187, 1 << 1, 1 << 3, V3D_QPU_A_XCD }, - { 187, 187, 1 << 1, MUX_MASK(4, 6), V3D_QPU_A_FYCD }, - { 187, 187, 1 << 1, 1 << 7, V3D_QPU_A_YCD }, - - { 187, 187, 1 << 2, 1 << 0, V3D_QPU_A_MSF }, - { 187, 187, 1 << 2, 1 << 1, V3D_QPU_A_REVF }, - { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_VDWWT, 33 }, - { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_IID, 40 }, - { 187, 187, 1 << 2, 1 << 3, V3D_QPU_A_SAMPID, 40 }, - { 187, 187, 1 << 2, 1 << 4, V3D_QPU_A_BARRIERID, 40 }, - { 187, 187, 1 << 2, 1 << 5, V3D_QPU_A_TMUWT }, - { 187, 187, 1 << 2, 1 << 6, V3D_QPU_A_VPMWT }, - { 187, 187, 1 << 2, 1 << 7, V3D_QPU_A_FLAFIRST, 41 }, - { 187, 187, 1 << 3, 1 << 0, V3D_QPU_A_FLNAFIRST, 41 }, - { 187, 187, 1 << 3, ANYMUX, V3D_QPU_A_VPMSETUP, 33 }, - - { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 }, - { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 }, - { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 }, - { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 }, - { 188, 188, 1 << 2, ANYMUX, V3D_QPU_A_LDVPMP, 40 }, - { 188, 188, 1 << 3, ANYMUX, V3D_QPU_A_RSQRT, 41 }, - { 188, 188, 1 << 4, ANYMUX, V3D_QPU_A_EXP, 41 }, - { 188, 188, 1 << 5, ANYMUX, V3D_QPU_A_LOG, 41 }, - { 188, 188, 1 << 6, ANYMUX, V3D_QPU_A_SIN, 41 }, - { 188, 188, 1 << 7, ANYMUX, V3D_QPU_A_RSQRT2, 41 }, - { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 }, - { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 }, + { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMIN }, + { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMAX }, + { 176, 180, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMIN }, + + { 181, 181, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_AND }, + { 182, 182, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_OR }, + { 183, 183, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_XOR }, + + { 184, 184, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VADD }, + { 185, 185, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VSUB }, + { 186, 186, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_NOT }, + { 186, 186, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_NEG }, + { 186, 186, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_FLAPUSH }, + { 186, 186, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FLBPUSH }, + { 
186, 186, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_FLPOP }, + { 186, 186, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_RECIP }, + { 186, 186, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SETMSF }, + { 186, 186, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_SETREVF }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(1), V3D_QPU_A_TIDX }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(2), V3D_QPU_A_EIDX }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(3), V3D_QPU_A_LR }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(4), V3D_QPU_A_VFLA }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(5), V3D_QPU_A_VFLNA }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VFLB }, + { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(7), V3D_QPU_A_VFLNB }, + + { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(0, 2), V3D_QPU_A_FXCD }, + { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(3), V3D_QPU_A_XCD }, + { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(4, 6), V3D_QPU_A_FYCD }, + { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(7), V3D_QPU_A_YCD }, + + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(0), V3D_QPU_A_MSF }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(1), V3D_QPU_A_REVF }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_VDWWT, 33 }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_IID, 40 }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(3), V3D_QPU_A_SAMPID, 40 }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(4), V3D_QPU_A_BARRIERID, 40 }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(5), V3D_QPU_A_TMUWT }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VPMWT }, + { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(7), V3D_QPU_A_FLAFIRST, 41 }, + { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = OP_MASK(0), V3D_QPU_A_FLNAFIRST, 41 }, + { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_VPMSETUP, 33 }, + + { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMP, 40 }, + { 188, 188, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT, 41 }, + { 188, 188, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_EXP, 41 }, + { 188, 188, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_LOG, 41 }, + { 188, 188, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SIN, 41 }, + { 188, 188, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT2, 41 }, + { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 }, + { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 }, /* FIXME: MORE COMPLICATED */ - /* { 190, 191, ANYMUX, ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */ + /* { 190, 191, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */ - { 192, 239, 
ANYMUX, ANYMUX, V3D_QPU_A_FCMP }, - { 240, 244, ANYMUX, ANYMUX, V3D_QPU_A_VFMAX }, + { 192, 239, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FCMP }, + { 240, 244, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMAX }, - { 245, 245, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FROUND }, - { 245, 245, 1 << 3, ANYMUX, V3D_QPU_A_FTOIN }, - { 245, 245, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FTRUNC }, - { 245, 245, 1 << 7, ANYMUX, V3D_QPU_A_FTOIZ }, - { 246, 246, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FFLOOR }, - { 246, 246, 1 << 3, ANYMUX, V3D_QPU_A_FTOUZ }, - { 246, 246, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FCEIL }, - { 246, 246, 1 << 7, ANYMUX, V3D_QPU_A_FTOC }, + { 245, 245, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FROUND }, + { 245, 245, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIN }, + { 245, 245, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FTRUNC }, + { 245, 245, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIZ }, + { 246, 246, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FFLOOR }, + { 246, 246, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOUZ }, + { 246, 246, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FCEIL }, + { 246, 246, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOC }, - { 247, 247, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FDX }, - { 247, 247, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FDY }, + { 247, 247, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FDX }, + { 247, 247, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FDY }, /* The stvpms are distinguished by the waddr field. */ - { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMV }, - { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMD }, - { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMP }, + { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMV }, + { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMD }, + { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMP }, + + { 252, 252, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_ITOF }, + { 252, 252, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_CLZ }, + { 252, 252, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_UTOF }, +}; + +static const struct opcode_desc mul_ops_v33[] = { + { 1, 1, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_ADD }, + { 2, 2, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SUB }, + { 3, 3, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_UMUL24 }, + { 4, 8, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_VFMUL }, + { 9, 9, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SMUL24 }, + { 10, 10, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_MULTOP }, + { 14, 14, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 }, + { 15, 15, .mux.b_mask = OP_RANGE(0, 3), ANYMUX, V3D_QPU_M_FMOV, 33, 42}, + { 15, 15, .mux.b_mask = OP_MASK(4), .mux.a_mask = OP_MASK(0), V3D_QPU_M_NOP, 33, 42 }, + { 15, 15, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_M_MOV, 33, 42 }, + + { 16, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMUL }, +}; + +/* Note that it would have been possible to define all the add/mul opcodes in + * just one table, using the first_ver/last_ver. But taking into account that + * for v71 there were a lot of changes, it was more tidy this way. Also right + * now we are doing a linear search on those tables, so this maintains the + * tables smaller. 
+ * + * Just in case we merge the tables, we define the first_ver as 71 for those + * opcodes that changed on v71 + */ +static const struct opcode_desc add_ops_v71[] = { + /* FADD is FADDNF depending on the order of the raddr_a/raddr_b. */ + { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD }, + { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADDNF }, + { 53, 55, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, + { 56, 56, .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD }, + { 57, 59, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, + { 60, 60, .raddr_mask = ANYOPMASK, V3D_QPU_A_SUB }, + { 61, 63, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK }, + { 64, 111, .raddr_mask = ANYOPMASK, V3D_QPU_A_FSUB }, + { 120, 120, .raddr_mask = ANYOPMASK, V3D_QPU_A_MIN }, + { 121, 121, .raddr_mask = ANYOPMASK, V3D_QPU_A_MAX }, + { 122, 122, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMIN }, + { 123, 123, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMAX }, + { 124, 124, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHL }, + { 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR }, + { 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR }, + { 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR }, + /* FMIN is instead FMAX depending on the raddr_a/b order. */ + { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMIN }, + { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMAX }, + { 176, 180, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFMIN }, + + { 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND }, + { 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR }, + { 183, 183, .raddr_mask = ANYOPMASK, V3D_QPU_A_XOR }, + { 184, 184, .raddr_mask = ANYOPMASK, V3D_QPU_A_VADD }, + { 185, 185, .raddr_mask = ANYOPMASK, V3D_QPU_A_VSUB }, + + { 186, 186, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOT }, + { 186, 186, .raddr_mask = OP_MASK(1), V3D_QPU_A_NEG }, + { 186, 186, .raddr_mask = OP_MASK(2), V3D_QPU_A_FLAPUSH }, + { 186, 186, .raddr_mask = OP_MASK(3), V3D_QPU_A_FLBPUSH }, + { 186, 186, .raddr_mask = OP_MASK(4), V3D_QPU_A_FLPOP }, + { 186, 186, .raddr_mask = OP_MASK(5), V3D_QPU_A_CLZ }, + { 186, 186, .raddr_mask = OP_MASK(6), V3D_QPU_A_SETMSF }, + { 186, 186, .raddr_mask = OP_MASK(7), V3D_QPU_A_SETREVF }, + + { 187, 187, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 }, + { 187, 187, .raddr_mask = OP_MASK(1), V3D_QPU_A_TIDX }, + { 187, 187, .raddr_mask = OP_MASK(2), V3D_QPU_A_EIDX }, + { 187, 187, .raddr_mask = OP_MASK(3), V3D_QPU_A_LR }, + { 187, 187, .raddr_mask = OP_MASK(4), V3D_QPU_A_VFLA }, + { 187, 187, .raddr_mask = OP_MASK(5), V3D_QPU_A_VFLNA }, + { 187, 187, .raddr_mask = OP_MASK(6), V3D_QPU_A_VFLB }, + { 187, 187, .raddr_mask = OP_MASK(7), V3D_QPU_A_VFLNB }, + { 187, 187, .raddr_mask = OP_MASK(8), V3D_QPU_A_XCD }, + { 187, 187, .raddr_mask = OP_MASK(9), V3D_QPU_A_YCD }, + { 187, 187, .raddr_mask = OP_MASK(10), V3D_QPU_A_MSF }, + { 187, 187, .raddr_mask = OP_MASK(11), V3D_QPU_A_REVF }, + { 187, 187, .raddr_mask = OP_MASK(12), V3D_QPU_A_IID }, + { 187, 187, .raddr_mask = OP_MASK(13), V3D_QPU_A_SAMPID }, + { 187, 187, .raddr_mask = OP_MASK(14), V3D_QPU_A_BARRIERID }, + { 187, 187, .raddr_mask = OP_MASK(15), V3D_QPU_A_TMUWT }, + { 187, 187, .raddr_mask = OP_MASK(16), V3D_QPU_A_VPMWT }, + { 187, 187, .raddr_mask = OP_MASK(17), V3D_QPU_A_FLAFIRST }, + { 187, 187, .raddr_mask = OP_MASK(18), V3D_QPU_A_FLNAFIRST }, + + { 187, 187, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FXCD }, + { 187, 187, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FYCD }, + + { 188, 188, .raddr_mask = OP_MASK(0), V3D_QPU_A_LDVPMV_IN, 71 }, + { 188, 188, .raddr_mask = OP_MASK(1), V3D_QPU_A_LDVPMD_IN, 71 }, + { 188, 188, .raddr_mask = 
OP_MASK(2), V3D_QPU_A_LDVPMP, 71 }, + + { 188, 188, .raddr_mask = OP_MASK(32), V3D_QPU_A_RECIP, 71 }, + { 188, 188, .raddr_mask = OP_MASK(33), V3D_QPU_A_RSQRT, 71 }, + { 188, 188, .raddr_mask = OP_MASK(34), V3D_QPU_A_EXP, 71 }, + { 188, 188, .raddr_mask = OP_MASK(35), V3D_QPU_A_LOG, 71 }, + { 188, 188, .raddr_mask = OP_MASK(36), V3D_QPU_A_SIN, 71 }, + { 188, 188, .raddr_mask = OP_MASK(37), V3D_QPU_A_RSQRT2, 71 }, + + { 189, 189, .raddr_mask = ANYOPMASK, V3D_QPU_A_LDVPMG_IN, 71 }, - { 252, 252, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_ITOF }, - { 252, 252, 1 << 3, ANYMUX, V3D_QPU_A_CLZ }, - { 252, 252, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_UTOF }, + /* The stvpms are distinguished by the waddr field. */ + { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMV, 71}, + { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMD, 71}, + { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMP, 71}, + + { 192, 207, .raddr_mask = ANYOPMASK, V3D_QPU_A_FCMP, 71 }, + + { 245, 245, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FROUND, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FROUND, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FROUND, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FROUND, 71 }, + + { 245, 245, .raddr_mask = OP_MASK(3), V3D_QPU_A_FTOIN, 71 }, + { 245, 245, .raddr_mask = OP_MASK(7), V3D_QPU_A_FTOIN, 71 }, + { 245, 245, .raddr_mask = OP_MASK(11), V3D_QPU_A_FTOIN, 71 }, + { 245, 245, .raddr_mask = OP_MASK(15), V3D_QPU_A_FTOIN, 71 }, + + { 245, 245, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FTRUNC, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FTRUNC, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FTRUNC, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FTRUNC, 71 }, + + { 245, 245, .raddr_mask = OP_MASK(19), V3D_QPU_A_FTOIZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(23), V3D_QPU_A_FTOIZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(27), V3D_QPU_A_FTOIZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(31), V3D_QPU_A_FTOIZ, 71 }, + + { 245, 245, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FFLOOR, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FFLOOR, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(40, 42), V3D_QPU_A_FFLOOR, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(44, 46), V3D_QPU_A_FFLOOR, 71 }, + + { 245, 245, .raddr_mask = OP_MASK(35), V3D_QPU_A_FTOUZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(39), V3D_QPU_A_FTOUZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(43), V3D_QPU_A_FTOUZ, 71 }, + { 245, 245, .raddr_mask = OP_MASK(47), V3D_QPU_A_FTOUZ, 71 }, + + { 245, 245, .raddr_mask = OP_RANGE(48, 50), V3D_QPU_A_FCEIL, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(52, 54), V3D_QPU_A_FCEIL, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(56, 58), V3D_QPU_A_FCEIL, 71 }, + { 245, 245, .raddr_mask = OP_RANGE(60, 62), V3D_QPU_A_FCEIL, 71 }, + + { 245, 245, .raddr_mask = OP_MASK(51), V3D_QPU_A_FTOC }, + { 245, 245, .raddr_mask = OP_MASK(55), V3D_QPU_A_FTOC }, + { 245, 245, .raddr_mask = OP_MASK(59), V3D_QPU_A_FTOC }, + { 245, 245, .raddr_mask = OP_MASK(63), V3D_QPU_A_FTOC }, + + { 246, 246, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FDX, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FDX, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FDX, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FDX, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FDY, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FDY, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FDY, 71 }, + { 246, 
246, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FDY, 71 }, + + { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 }, + { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 }, + + { 247, 247, .raddr_mask = ANYOPMASK, V3D_QPU_A_VPACK, 71 }, + { 248, 248, .raddr_mask = ANYOPMASK, V3D_QPU_A_V8PACK, 71 }, + + { 249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FMOV, 71 }, + { 249, 249, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FMOV, 71 }, + + { 249, 249, .raddr_mask = OP_MASK(3), V3D_QPU_A_MOV, 71 }, + { 249, 249, .raddr_mask = OP_MASK(7), V3D_QPU_A_MOV, 71 }, + { 249, 249, .raddr_mask = OP_MASK(11), V3D_QPU_A_MOV, 71 }, + { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 }, + { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 }, + + { 250, 250, .raddr_mask = ANYOPMASK, V3D_QPU_A_V10PACK, 71 }, + { 251, 251, .raddr_mask = ANYOPMASK, V3D_QPU_A_V11FPACK, 71 }, }; -static const struct opcode_desc mul_ops[] = { - { 1, 1, ANYMUX, ANYMUX, V3D_QPU_M_ADD }, - { 2, 2, ANYMUX, ANYMUX, V3D_QPU_M_SUB }, - { 3, 3, ANYMUX, ANYMUX, V3D_QPU_M_UMUL24 }, - { 4, 8, ANYMUX, ANYMUX, V3D_QPU_M_VFMUL }, - { 9, 9, ANYMUX, ANYMUX, V3D_QPU_M_SMUL24 }, - { 10, 10, ANYMUX, ANYMUX, V3D_QPU_M_MULTOP }, - { 14, 14, ANYMUX, ANYMUX, V3D_QPU_M_FMOV }, - { 15, 15, MUX_MASK(0, 3), ANYMUX, V3D_QPU_M_FMOV }, - { 15, 15, 1 << 4, 1 << 0, V3D_QPU_M_NOP, 0 }, - { 15, 15, 1 << 7, ANYMUX, V3D_QPU_M_MOV }, - { 16, 63, ANYMUX, ANYMUX, V3D_QPU_M_FMUL }, +static const struct opcode_desc mul_ops_v71[] = { + /* For V3D 7.1, second mask field would be ignored */ + { 1, 1, .raddr_mask = ANYOPMASK, V3D_QPU_M_ADD, 71 }, + { 2, 2, .raddr_mask = ANYOPMASK, V3D_QPU_M_SUB, 71 }, + { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 }, + { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 }, + { 4, 8, .raddr_mask = ANYOPMASK, V3D_QPU_M_VFMUL, 71 }, + { 9, 9, .raddr_mask = ANYOPMASK, V3D_QPU_M_SMUL24, 71 }, + { 10, 10, .raddr_mask = ANYOPMASK, V3D_QPU_M_MULTOP, 71 }, + + { 14, 14, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_M_FMOV, 71 }, + { 14, 14, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_M_FMOV, 71 }, + + { 14, 14, .raddr_mask = OP_MASK(3), V3D_QPU_M_MOV, 71 }, + { 14, 14, .raddr_mask = OP_MASK(7), V3D_QPU_M_MOV, 71 }, + { 14, 14, .raddr_mask = OP_MASK(11), V3D_QPU_M_MOV, 71 }, + { 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 }, + { 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 }, + + { 14, 14, .raddr_mask = OP_MASK(32), V3D_QPU_M_FTOUNORM16, 71 }, + { 14, 14, .raddr_mask = OP_MASK(33), V3D_QPU_M_FTOSNORM16, 71 }, + { 14, 14, .raddr_mask = OP_MASK(34), V3D_QPU_M_VFTOUNORM8, 71 }, + { 14, 14, .raddr_mask = OP_MASK(35), V3D_QPU_M_VFTOSNORM8, 71 }, + { 14, 14, .raddr_mask = OP_MASK(48), V3D_QPU_M_VFTOUNORM10LO, 71 }, + { 14, 14, .raddr_mask = OP_MASK(49), V3D_QPU_M_VFTOUNORM10HI, 71 }, + + { 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 }, + + { 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL }, }; /* Returns true 
if op_desc should be filtered out based on devinfo->ver @@ -589,17 +845,23 @@ static const struct opcode_desc mul_ops[] = { */ static bool opcode_invalid_in_version(const struct v3d_device_info *devinfo, - const struct opcode_desc *op_desc) + const uint8_t first_ver, + const uint8_t last_ver) { - return (op_desc->first_ver != 0 && devinfo->ver < op_desc->first_ver) || - (op_desc->last_ver != 0 && devinfo->ver > op_desc->last_ver); + return (first_ver != 0 && devinfo->ver < first_ver) || + (last_ver != 0 && devinfo->ver > last_ver); } +/* Note that we pass as parameters mux_a, mux_b and raddr, even if depending + * on the devinfo->ver some would be ignored. We do this way just to avoid + * having two really similar lookup_opcode methods + */ static const struct opcode_desc * lookup_opcode_from_packed(const struct v3d_device_info *devinfo, const struct opcode_desc *opcodes, size_t num_opcodes, uint32_t opcode, - uint32_t mux_a, uint32_t mux_b) + uint32_t mux_a, uint32_t mux_b, + uint32_t raddr) { for (int i = 0; i < num_opcodes; i++) { const struct opcode_desc *op_desc = &opcodes[i]; @@ -608,14 +870,19 @@ lookup_opcode_from_packed(const struct v3d_device_info *devinfo, opcode > op_desc->opcode_last) continue; - if (opcode_invalid_in_version(devinfo, op_desc)) + if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver)) continue; - if (!(op_desc->mux_b_mask & (1 << mux_b))) - continue; + if (devinfo->ver < 71) { + if (!(op_desc->mux.b_mask & (1 << mux_b))) + continue; - if (!(op_desc->mux_a_mask & (1 << mux_a))) - continue; + if (!(op_desc->mux.a_mask & (1 << mux_a))) + continue; + } else { + if (!(op_desc->raddr_mask & ((uint64_t) 1 << raddr))) + continue; + } return op_desc; } @@ -667,6 +934,56 @@ v3d_qpu_float32_unpack_pack(enum v3d_qpu_input_unpack unpacked, } } +static bool +v3d_qpu_int32_unpack_unpack(uint32_t packed, + enum v3d_qpu_input_unpack *unpacked) +{ + switch (packed) { + case 0: + *unpacked = V3D_QPU_UNPACK_NONE; + return true; + case 1: + *unpacked = V3D_QPU_UNPACK_UL; + return true; + case 2: + *unpacked = V3D_QPU_UNPACK_UH; + return true; + case 3: + *unpacked = V3D_QPU_UNPACK_IL; + return true; + case 4: + *unpacked = V3D_QPU_UNPACK_IH; + return true; + default: + return false; + } +} + +static bool +v3d_qpu_int32_unpack_pack(enum v3d_qpu_input_unpack unpacked, + uint32_t *packed) +{ + switch (unpacked) { + case V3D_QPU_UNPACK_NONE: + *packed = 0; + return true; + case V3D_QPU_UNPACK_UL: + *packed = 1; + return true; + case V3D_QPU_UNPACK_UH: + *packed = 2; + return true; + case V3D_QPU_UNPACK_IL: + *packed = 3; + return true; + case V3D_QPU_UNPACK_IH: + *packed = 4; + return true; + default: + return false; + } +} + static bool v3d_qpu_float16_unpack_unpack(uint32_t packed, enum v3d_qpu_input_unpack *unpacked) @@ -737,8 +1054,8 @@ v3d_qpu_float32_pack_pack(enum v3d_qpu_input_unpack unpacked, } static bool -v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, - struct v3d_qpu_instr *instr) +v3d33_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) { uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD); uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_ADD_A); @@ -755,8 +1072,9 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, map_op = (map_op - 253 + 245); const struct opcode_desc *desc = - lookup_opcode_from_packed(devinfo, add_ops, ARRAY_SIZE(add_ops), - map_op, mux_a, mux_b); + lookup_opcode_from_packed(devinfo, add_ops_v33, + 
ARRAY_SIZE(add_ops_v33), + map_op, mux_a, mux_b, 0); if (!desc) return false; @@ -812,12 +1130,12 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.add.output_pack = V3D_QPU_PACK_NONE; if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, - &instr->alu.add.a_unpack)) { + &instr->alu.add.a.unpack)) { return false; } if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, - &instr->alu.add.b_unpack)) { + &instr->alu.add.b.unpack)) { return false; } break; @@ -831,7 +1149,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.add.output_pack = mux_b & 0x3; if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, - &instr->alu.add.a_unpack)) { + &instr->alu.add.a.unpack)) { return false; } break; @@ -843,7 +1161,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.add.output_pack = V3D_QPU_PACK_NONE; if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, - &instr->alu.add.a_unpack)) { + &instr->alu.add.a.unpack)) { return false; } break; @@ -851,23 +1169,23 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, case V3D_QPU_A_VFMIN: case V3D_QPU_A_VFMAX: if (!v3d_qpu_float16_unpack_unpack(op & 0x7, - &instr->alu.add.a_unpack)) { + &instr->alu.add.a.unpack)) { return false; } instr->alu.add.output_pack = V3D_QPU_PACK_NONE; - instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; + instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; break; default: instr->alu.add.output_pack = V3D_QPU_PACK_NONE; - instr->alu.add.a_unpack = V3D_QPU_UNPACK_NONE; - instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE; + instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; + instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; break; } - instr->alu.add.a = mux_a; - instr->alu.add.b = mux_b; + instr->alu.add.a.mux = mux_a; + instr->alu.add.b.mux = mux_b; instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); instr->alu.add.magic_write = false; @@ -892,8 +1210,194 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, } static bool -v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, +v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) +{ + uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD); + uint32_t raddr_a = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_A); + uint32_t raddr_b = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_B); + uint32_t waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); + uint32_t map_op = op; + + const struct opcode_desc *desc = + lookup_opcode_from_packed(devinfo, + add_ops_v71, + ARRAY_SIZE(add_ops_v71), + map_op, 0, 0, + raddr_b); + if (!desc) + return false; + + instr->alu.add.op = desc->op; + + /* FADD/FADDNF and FMIN/FMAX are determined by the order of the + * operands. + */ + if (instr->sig.small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a > + instr->sig.small_imm_b * 256 + (op & 3) * 64 + raddr_b) { + if (instr->alu.add.op == V3D_QPU_A_FMIN) + instr->alu.add.op = V3D_QPU_A_FMAX; + if (instr->alu.add.op == V3D_QPU_A_FADD) + instr->alu.add.op = V3D_QPU_A_FADDNF; + } + + /* Some QPU ops require a bit more than just basic opcode and mux a/b + * comparisons to distinguish them. 
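+ * The stvpm group is one example: STVPMV, STVPMD and STVPMP all share
+ * opcode 190 in add_ops_v71, so it is the waddr field (0, 1 or 2) that
+ * selects the actual operation, as the switch below does.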
+ */ + switch (instr->alu.add.op) { + case V3D_QPU_A_STVPMV: + case V3D_QPU_A_STVPMD: + case V3D_QPU_A_STVPMP: + switch (waddr) { + case 0: + instr->alu.add.op = V3D_QPU_A_STVPMV; + break; + case 1: + instr->alu.add.op = V3D_QPU_A_STVPMD; + break; + case 2: + instr->alu.add.op = V3D_QPU_A_STVPMP; + break; + default: + return false; + } + break; + default: + break; + } + + switch (instr->alu.add.op) { + case V3D_QPU_A_FADD: + case V3D_QPU_A_FADDNF: + case V3D_QPU_A_FSUB: + case V3D_QPU_A_FMIN: + case V3D_QPU_A_FMAX: + case V3D_QPU_A_FCMP: + case V3D_QPU_A_VFPACK: + if (instr->alu.add.op != V3D_QPU_A_VFPACK && + instr->alu.add.op != V3D_QPU_A_FCMP) { + instr->alu.add.output_pack = (op >> 4) & 0x3; + } else { + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, + &instr->alu.add.a.unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, + &instr->alu.add.b.unpack)) { + return false; + } + break; + + case V3D_QPU_A_FFLOOR: + case V3D_QPU_A_FROUND: + case V3D_QPU_A_FTRUNC: + case V3D_QPU_A_FCEIL: + case V3D_QPU_A_FDX: + case V3D_QPU_A_FDY: + instr->alu.add.output_pack = raddr_b & 0x3; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, + &instr->alu.add.a.unpack)) { + return false; + } + break; + + case V3D_QPU_A_FTOIN: + case V3D_QPU_A_FTOIZ: + case V3D_QPU_A_FTOUZ: + case V3D_QPU_A_FTOC: + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float32_unpack_unpack((raddr_b >> 2) & 0x3, + &instr->alu.add.a.unpack)) { + return false; + } + break; + + case V3D_QPU_A_VFMIN: + case V3D_QPU_A_VFMAX: + unreachable("pending v71 update"); + if (!v3d_qpu_float16_unpack_unpack(op & 0x7, + &instr->alu.add.a.unpack)) { + return false; + } + + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + break; + + case V3D_QPU_A_MOV: + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_int32_unpack_unpack((raddr_b >> 2) & 0x7, + &instr->alu.add.a.unpack)) { + return false; + } + break; + + case V3D_QPU_A_FMOV: + instr->alu.add.output_pack = raddr_b & 0x3; + + /* Mul alu FMOV has one additional variant */ + int32_t unpack = (raddr_b >> 2) & 0x7; + if (unpack == 7) + return false; + + if (!v3d_qpu_float32_unpack_unpack(unpack, + &instr->alu.add.a.unpack)) { + return false; + } + break; + + default: + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; + instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE; + instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE; + break; + } + + instr->alu.add.a.raddr = raddr_a; + instr->alu.add.b.raddr = raddr_b; + instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A); + + instr->alu.add.magic_write = false; + if (packed_inst & V3D_QPU_MA) { + switch (instr->alu.add.op) { + case V3D_QPU_A_LDVPMV_IN: + instr->alu.add.op = V3D_QPU_A_LDVPMV_OUT; + break; + case V3D_QPU_A_LDVPMD_IN: + instr->alu.add.op = V3D_QPU_A_LDVPMD_OUT; + break; + case V3D_QPU_A_LDVPMG_IN: + instr->alu.add.op = V3D_QPU_A_LDVPMG_OUT; + break; + default: + instr->alu.add.magic_write = true; + break; + } + } + + return true; +} + +static bool +v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, struct v3d_qpu_instr *instr) +{ + if (devinfo->ver < 71) + return v3d33_qpu_add_unpack(devinfo, packed_inst, instr); + else + return v3d71_qpu_add_unpack(devinfo, packed_inst, instr); +} + +static bool +v3d33_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) { uint32_t op = 
QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL); uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_A); @@ -901,9 +1405,10 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, { const struct opcode_desc *desc = - lookup_opcode_from_packed(devinfo, mul_ops, - ARRAY_SIZE(mul_ops), - op, mux_a, mux_b); + lookup_opcode_from_packed(devinfo, + mul_ops_v33, + ARRAY_SIZE(mul_ops_v33), + op, mux_a, mux_b, 0); if (!desc) return false; @@ -915,12 +1420,12 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1; if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, - &instr->alu.mul.a_unpack)) { + &instr->alu.mul.a.unpack)) { return false; } if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, - &instr->alu.mul.b_unpack)) { + &instr->alu.mul.b.unpack)) { return false; } @@ -931,7 +1436,7 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, ((mux_b >> 2) & 1)); if (!v3d_qpu_float32_unpack_unpack(mux_b & 0x3, - &instr->alu.mul.a_unpack)) { + &instr->alu.mul.a.unpack)) { return false; } @@ -941,74 +1446,169 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7, - &instr->alu.mul.a_unpack)) { + &instr->alu.mul.a.unpack)) { return false; } - instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE; + instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; break; default: instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; - instr->alu.mul.a_unpack = V3D_QPU_UNPACK_NONE; - instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE; + instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; + instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; break; } - instr->alu.mul.a = mux_a; - instr->alu.mul.b = mux_b; + instr->alu.mul.a.mux = mux_a; + instr->alu.mul.b.mux = mux_b; instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M); instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM; return true; } -static const struct opcode_desc * -lookup_opcode_from_instr(const struct v3d_device_info *devinfo, - const struct opcode_desc *opcodes, size_t num_opcodes, - uint8_t op) +static bool +v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) { - for (int i = 0; i < num_opcodes; i++) { - const struct opcode_desc *op_desc = &opcodes[i]; - - if (op_desc->op != op) - continue; + uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL); + uint32_t raddr_c = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_C); + uint32_t raddr_d = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_D); - if (opcode_invalid_in_version(devinfo, op_desc)) - continue; + { + const struct opcode_desc *desc = + lookup_opcode_from_packed(devinfo, + mul_ops_v71, + ARRAY_SIZE(mul_ops_v71), + op, 0, 0, + raddr_d); + if (!desc) + return false; - return op_desc; + instr->alu.mul.op = desc->op; } - return NULL; -} - + switch (instr->alu.mul.op) { + case V3D_QPU_M_FMUL: + instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1; + + if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, + &instr->alu.mul.a.unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3, + &instr->alu.mul.b.unpack)) { + return false; + } + + break; + + case V3D_QPU_M_FMOV: + instr->alu.mul.output_pack = raddr_d & 0x3; + + if (!v3d_qpu_float32_unpack_unpack((raddr_d >> 2) & 0x7, + &instr->alu.mul.a.unpack)) { + return false; + } + + break; + + case V3D_QPU_M_VFMUL: + unreachable("pending v71 update"); 
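+ /* The lines below are still the pre-7.1 float16 unpack handling
+  * inherited from the 3.3/4.x path; they sit behind the unreachable()
+  * above until VFMUL is reworked for 7.1.
+  */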
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7, + &instr->alu.mul.a.unpack)) { + return false; + } + + instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + + break; + + case V3D_QPU_M_MOV: + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; + + if (!v3d_qpu_int32_unpack_unpack((raddr_d >> 2) & 0x7, + &instr->alu.mul.a.unpack)) { + return false; + } + break; + + default: + instr->alu.mul.output_pack = V3D_QPU_PACK_NONE; + instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE; + instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE; + break; + } + + instr->alu.mul.a.raddr = raddr_c; + instr->alu.mul.b.raddr = raddr_d; + instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M); + instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM; + + return true; +} + static bool -v3d_qpu_add_pack(const struct v3d_device_info *devinfo, - const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, + struct v3d_qpu_instr *instr) +{ + if (devinfo->ver < 71) + return v3d33_qpu_mul_unpack(devinfo, packed_inst, instr); + else + return v3d71_qpu_mul_unpack(devinfo, packed_inst, instr); +} + +static const struct opcode_desc * +lookup_opcode_from_instr(const struct v3d_device_info *devinfo, + const struct opcode_desc *opcodes, size_t num_opcodes, + uint8_t op) +{ + for (int i = 0; i < num_opcodes; i++) { + const struct opcode_desc *op_desc = &opcodes[i]; + + if (op_desc->op != op) + continue; + + if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver)) + continue; + + return op_desc; + } + + return NULL; +} + +static bool +v3d33_qpu_add_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) { uint32_t waddr = instr->alu.add.waddr; - uint32_t mux_a = instr->alu.add.a; - uint32_t mux_b = instr->alu.add.b; + uint32_t mux_a = instr->alu.add.a.mux; + uint32_t mux_b = instr->alu.add.b.mux; int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op); const struct opcode_desc *desc = - lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops), + lookup_opcode_from_instr(devinfo, add_ops_v33, + ARRAY_SIZE(add_ops_v33), instr->alu.add.op); if (!desc) return false; - uint32_t opcode = desc->opcode_first; + uint32_t opcode = opcode = desc->opcode_first; /* If an operation doesn't use an arg, its mux values may be used to * identify the operation type. 
*/ if (nsrc < 2) - mux_b = ffs(desc->mux_b_mask) - 1; + mux_b = ffs(desc->mux.b_mask) - 1; if (nsrc < 1) - mux_a = ffs(desc->mux_a_mask) - 1; + mux_a = ffs(desc->mux.a_mask) - 1; bool no_magic_write = false; @@ -1061,12 +1661,12 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, } opcode |= output_pack << 4; - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, &a_unpack)) { return false; } - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, &b_unpack)) { return false; } @@ -1100,23 +1700,23 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, uint32_t a_unpack; uint32_t b_unpack; - if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS || - instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) { + if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS || + instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) { return false; } - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, &a_unpack)) { return false; } - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, &b_unpack)) { return false; } - opcode = (opcode & ~(1 << 2)) | (a_unpack << 2); - opcode = (opcode & ~(1 << 0)) | (b_unpack << 0); + opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2); + opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0); break; } @@ -1135,13 +1735,13 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, } mux_b |= packed; - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, &packed)) { return false; } if (packed == 0) return false; - opcode = (opcode & ~(1 << 2)) | packed << 2; + opcode = (opcode & ~(0x3 << 2)) | packed << 2; break; } @@ -1153,7 +1753,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, return false; uint32_t packed; - if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, &packed)) { return false; } @@ -1166,11 +1766,11 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, case V3D_QPU_A_VFMIN: case V3D_QPU_A_VFMAX: if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || - instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE) { + instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) { return false; } - if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a_unpack, + if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack, &packed)) { return false; } @@ -1180,8 +1780,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, default: if (instr->alu.add.op != V3D_QPU_A_NOP && (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || - instr->alu.add.a_unpack != V3D_QPU_UNPACK_NONE || - instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE)) { + instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE || + instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) { return false; } break; @@ -1198,15 +1798,280 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, } static bool -v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, - const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +v3d71_qpu_add_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) { - uint32_t mux_a = instr->alu.mul.a; - uint32_t mux_b = instr->alu.mul.b; + uint32_t waddr = instr->alu.add.waddr; + uint32_t raddr_a = instr->alu.add.a.raddr; + uint32_t raddr_b = instr->alu.add.b.raddr; + + int nsrc = 
v3d_qpu_add_op_num_src(instr->alu.add.op); + const struct opcode_desc *desc = + lookup_opcode_from_instr(devinfo, add_ops_v71, + ARRAY_SIZE(add_ops_v71), + instr->alu.add.op); + if (!desc) + return false; + + uint32_t opcode = opcode = desc->opcode_first; + + /* If an operation doesn't use an arg, its raddr values may be used to + * identify the operation type. + */ + if (nsrc < 2) + raddr_b = ffsll(desc->raddr_mask) - 1; + + bool no_magic_write = false; + + switch (instr->alu.add.op) { + case V3D_QPU_A_STVPMV: + waddr = 0; + no_magic_write = true; + break; + case V3D_QPU_A_STVPMD: + waddr = 1; + no_magic_write = true; + break; + case V3D_QPU_A_STVPMP: + waddr = 2; + no_magic_write = true; + break; + + case V3D_QPU_A_LDVPMV_IN: + case V3D_QPU_A_LDVPMD_IN: + case V3D_QPU_A_LDVPMP: + case V3D_QPU_A_LDVPMG_IN: + assert(!instr->alu.add.magic_write); + break; + + case V3D_QPU_A_LDVPMV_OUT: + case V3D_QPU_A_LDVPMD_OUT: + case V3D_QPU_A_LDVPMG_OUT: + assert(!instr->alu.add.magic_write); + *packed_instr |= V3D_QPU_MA; + break; + + default: + break; + } + + switch (instr->alu.add.op) { + case V3D_QPU_A_FADD: + case V3D_QPU_A_FADDNF: + case V3D_QPU_A_FSUB: + case V3D_QPU_A_FMIN: + case V3D_QPU_A_FMAX: + case V3D_QPU_A_FCMP: { + uint32_t output_pack; + uint32_t a_unpack; + uint32_t b_unpack; + + if (instr->alu.add.op != V3D_QPU_A_FCMP) { + if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, + &output_pack)) { + return false; + } + opcode |= output_pack << 4; + } + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &a_unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, + &b_unpack)) { + return false; + } + + /* These operations with commutative operands are + * distinguished by which order their operands come in. + */ + bool ordering = + instr->sig.small_imm_a * 256 + a_unpack * 64 + raddr_a > + instr->sig.small_imm_b * 256 + b_unpack * 64 + raddr_b; + if (((instr->alu.add.op == V3D_QPU_A_FMIN || + instr->alu.add.op == V3D_QPU_A_FADD) && ordering) || + ((instr->alu.add.op == V3D_QPU_A_FMAX || + instr->alu.add.op == V3D_QPU_A_FADDNF) && !ordering)) { + uint32_t temp; + + temp = a_unpack; + a_unpack = b_unpack; + b_unpack = temp; + + temp = raddr_a; + raddr_a = raddr_b; + raddr_b = temp; + + /* If we are swapping raddr_a/b we also need to swap + * small_imm_a/b. 
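+ * For instance, if the immediate was originally encoded through raddr_b
+ * (small_imm_b set), after the raddr_a/raddr_b swap it lives in raddr_a,
+ * so the signal must become small_imm_a instead. At most one of the two
+ * signals can be set, which is what the assert below relies on.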
+ */ + if (instr->sig.small_imm_a || instr->sig.small_imm_b) { + assert(instr->sig.small_imm_a != + instr->sig.small_imm_b); + struct v3d_qpu_sig new_sig = instr->sig; + new_sig.small_imm_a = !instr->sig.small_imm_a; + new_sig.small_imm_b = !instr->sig.small_imm_b; + uint32_t sig; + if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig)) + return false; + *packed_instr &= ~V3D_QPU_SIG_MASK; + *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG); + } + } + + opcode |= a_unpack << 2; + opcode |= b_unpack << 0; + + break; + } + + case V3D_QPU_A_VFPACK: { + uint32_t a_unpack; + uint32_t b_unpack; + + if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS || + instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) { + return false; + } + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &a_unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack, + &b_unpack)) { + return false; + } + + opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2); + opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0); + + break; + } + + case V3D_QPU_A_FFLOOR: + case V3D_QPU_A_FROUND: + case V3D_QPU_A_FTRUNC: + case V3D_QPU_A_FCEIL: + case V3D_QPU_A_FDX: + case V3D_QPU_A_FDY: { + uint32_t packed; + + if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, + &packed)) { + return false; + } + raddr_b |= packed; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + if (packed == 0) + return false; + raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2; + break; + } + + case V3D_QPU_A_FTOIN: + case V3D_QPU_A_FTOIZ: + case V3D_QPU_A_FTOUZ: + case V3D_QPU_A_FTOC: + if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE) + return false; + + uint32_t packed; + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + if (packed == 0) + return false; + + raddr_b |= (raddr_b & ~(0x3 << 2)) | packed << 2; + + break; + + case V3D_QPU_A_VFMIN: + case V3D_QPU_A_VFMAX: + if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || + instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) { + return false; + } + + if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + opcode |= packed; + break; + + case V3D_QPU_A_MOV: { + uint32_t packed; + + if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE) + return false; + + if (!v3d_qpu_int32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + + raddr_b |= packed << 2; + break; + } + + case V3D_QPU_A_FMOV: { + uint32_t packed; + + if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack, + &packed)) { + return false; + } + raddr_b = packed; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack, + &packed)) { + return false; + } + raddr_b |= packed << 2; + break; + } + + default: + if (instr->alu.add.op != V3D_QPU_A_NOP && + (instr->alu.add.output_pack != V3D_QPU_PACK_NONE || + instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE || + instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) { + return false; + } + break; + } + + *packed_instr |= QPU_SET_FIELD(raddr_a, V3D_QPU_RADDR_A); + *packed_instr |= QPU_SET_FIELD(raddr_b, V3D_QPU_RADDR_B); + *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_ADD); + *packed_instr |= QPU_SET_FIELD(waddr, V3D_QPU_WADDR_A); + if (instr->alu.add.magic_write && !no_magic_write) + *packed_instr |= V3D_QPU_MA; + + return true; +} + +static bool +v3d33_qpu_mul_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +{ + uint32_t mux_a = instr->alu.mul.a.mux; + uint32_t mux_b = 
instr->alu.mul.b.mux; int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op); const struct opcode_desc *desc = - lookup_opcode_from_instr(devinfo, mul_ops, ARRAY_SIZE(mul_ops), + lookup_opcode_from_instr(devinfo, mul_ops_v33, + ARRAY_SIZE(mul_ops_v33), instr->alu.mul.op); if (!desc) @@ -1218,10 +2083,10 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, * that here. If mux a/b determine packing, it will be set below. */ if (nsrc < 2) - mux_b = ffs(desc->mux_b_mask) - 1; + mux_b = ffs(desc->mux.b_mask) - 1; if (nsrc < 1) - mux_a = ffs(desc->mux_a_mask) - 1; + mux_a = ffs(desc->mux.a_mask) - 1; switch (instr->alu.mul.op) { case V3D_QPU_M_FMUL: { @@ -1236,13 +2101,13 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, */ opcode += packed << 4; - if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, &packed)) { return false; } opcode |= packed << 2; - if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack, &packed)) { return false; } @@ -1260,7 +2125,7 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, opcode |= (packed >> 1) & 1; mux_b = (packed & 1) << 2; - if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack, + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, &packed)) { return false; } @@ -1274,22 +2139,28 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) return false; - if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a_unpack, + if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack, &packed)) { return false; } - if (instr->alu.mul.a_unpack == V3D_QPU_UNPACK_SWAP_16) + if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16) opcode = 8; else opcode |= (packed + 4) & 7; - if (instr->alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) + if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) return false; break; } default: + if (instr->alu.mul.op != V3D_QPU_M_NOP && + (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE || + instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || + instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) { + return false; + } break; } @@ -1304,6 +2175,150 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, return true; } +static bool +v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +{ + uint32_t raddr_c = instr->alu.mul.a.raddr; + uint32_t raddr_d = instr->alu.mul.b.raddr; + int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op); + + const struct opcode_desc *desc = + lookup_opcode_from_instr(devinfo, mul_ops_v71, + ARRAY_SIZE(mul_ops_v71), + instr->alu.mul.op); + if (!desc) + return false; + + uint32_t opcode = desc->opcode_first; + + /* Some opcodes have a single valid value for their raddr_d, so set + * that here. If raddr_b determine packing, it will be set below. + */ + if (nsrc < 2) + raddr_d = ffsll(desc->raddr_mask) - 1; + + switch (instr->alu.mul.op) { + case V3D_QPU_M_FMUL: { + uint32_t packed; + + if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack, + &packed)) { + return false; + } + /* No need for a +1 because desc->opcode_first has a 1 in this + * field. 
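+ * (FMUL spans opcodes 16-63, so desc->opcode_first is 16, meaning bit 4
+ * is already set and opcode += packed << 4 produces the pack + 1 value
+ * that the unpack side decodes with ((op >> 4) & 0x3) - 1.)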
+ */ + opcode += packed << 4; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + opcode |= packed << 2; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack, + &packed)) { + return false; + } + opcode |= packed << 0; + break; + } + + case V3D_QPU_M_FMOV: { + uint32_t packed; + + if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack, + &packed)) { + return false; + } + raddr_d |= packed; + + if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + raddr_d |= packed << 2; + break; + } + + case V3D_QPU_M_VFMUL: { + unreachable("pending v71 update"); + uint32_t packed; + + if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) + return false; + + if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16) + opcode = 8; + else + opcode |= (packed + 4) & 7; + + if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) + return false; + + break; + } + + case V3D_QPU_M_MOV: { + uint32_t packed; + + if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE) + return false; + + if (!v3d_qpu_int32_unpack_pack(instr->alu.mul.a.unpack, + &packed)) { + return false; + } + + raddr_d |= packed << 2; + break; + } + + default: + if (instr->alu.mul.op != V3D_QPU_M_NOP && + (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE || + instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE || + instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) { + return false; + } + break; + } + + *packed_instr |= QPU_SET_FIELD(raddr_c, V3D_QPU_RADDR_C); + *packed_instr |= QPU_SET_FIELD(raddr_d, V3D_QPU_RADDR_D); + *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_MUL); + *packed_instr |= QPU_SET_FIELD(instr->alu.mul.waddr, V3D_QPU_WADDR_M); + if (instr->alu.mul.magic_write) + *packed_instr |= V3D_QPU_MM; + + return true; +} + +static bool +v3d_qpu_add_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +{ + if (devinfo->ver < 71) + return v3d33_qpu_add_pack(devinfo, instr, packed_instr); + else + return v3d71_qpu_add_pack(devinfo, instr, packed_instr); +} + +static bool +v3d_qpu_mul_pack(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *instr, uint64_t *packed_instr) +{ + if (devinfo->ver < 71) + return v3d33_qpu_mul_pack(devinfo, instr, packed_instr); + else + return v3d71_qpu_mul_pack(devinfo, instr, packed_instr); +} + static bool v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo, uint64_t packed_instr, @@ -1332,8 +2347,14 @@ v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo, return false; } - instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A); - instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B); + if (devinfo->ver <= 71) { + /* + * For v71 this will be set on add/mul unpack, as raddr are now + * part of v3d_qpu_input + */ + instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A); + instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B); + } if (!v3d_qpu_add_unpack(devinfo, packed_instr, instr)) return false; @@ -1419,8 +2440,14 @@ v3d_qpu_instr_pack_alu(const struct v3d_device_info *devinfo, *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG); if (instr->type == V3D_QPU_INSTR_TYPE_ALU) { - *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A); - *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B); + if (devinfo->ver < 71) { + /* + * For v71 this will be set on add/mul unpack, as raddr are now + * part of v3d_qpu_input + 
*/ + *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A); + *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B); + } if (!v3d_qpu_add_pack(devinfo, instr, packed_instr)) return false; diff --git a/src/broadcom/qpu/tests/qpu_disasm.c b/src/broadcom/qpu/tests/qpu_disasm.c index 2f8e19c73fe..be7b78d5ef0 100644 --- a/src/broadcom/qpu/tests/qpu_disasm.c +++ b/src/broadcom/qpu/tests/qpu_disasm.c @@ -160,10 +160,10 @@ main(int argc, char **argv) /* Swap the operands to be sure that we test * how the QPUs distinguish between these ops. */ - swap_mux(&instr.alu.add.a, - &instr.alu.add.b); - swap_pack(&instr.alu.add.a_unpack, - &instr.alu.add.b_unpack); + swap_mux(&instr.alu.add.a.mux, + &instr.alu.add.b.mux); + swap_pack(&instr.alu.add.a.unpack, + &instr.alu.add.b.unpack); break; default: break; diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c index eea5d3f050e..c4bbd61abc2 100644 --- a/src/broadcom/simulator/v3d_simulator.c +++ b/src/broadcom/simulator/v3d_simulator.c @@ -92,6 +92,9 @@ static struct v3d_simulator_state { /** Last performance monitor ID. */ uint32_t last_perfid; + /** Total performance counters */ + uint32_t perfcnt_total; + struct util_dynarray bin_oom; int refcount; } sim_state = { @@ -436,15 +439,15 @@ v3d_simulator_perfmon_switch(int fd, uint32_t perfid) perfmon = v3d_get_simulator_perfmon(fd, file->active_perfid); if (perfmon) - v3d41_simulator_perfmon_stop(sim_state.v3d, - perfmon->ncounters, - perfmon->values); + v3d_X_simulator(perfmon_stop)(sim_state.v3d, + perfmon->ncounters, + perfmon->values); perfmon = v3d_get_simulator_perfmon(fd, perfid); if (perfmon) - v3d41_simulator_perfmon_start(sim_state.v3d, - perfmon->ncounters, - perfmon->counters); + v3d_X_simulator(perfmon_start)(sim_state.v3d, + perfmon->ncounters, + perfmon->counters); file->active_perfid = perfid; } @@ -489,11 +492,7 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit) bin_fd = fd; v3d_simulator_perfmon_switch(fd, submit->perfmon_id); - - if (sim_state.ver >= 41) - v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); - else - v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs); + v3d_X_simulator(submit_cl_ioctl)(sim_state.v3d, submit, file->gmp->ofs); util_dynarray_foreach(&sim_state.bin_oom, struct v3d_simulator_bo *, sim_bo) { @@ -632,15 +631,6 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args) return drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, args); } -static int -v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args) -{ - if (sim_state.ver >= 41) - return v3d41_simulator_get_param_ioctl(sim_state.v3d, args); - else - return v3d33_simulator_get_param_ioctl(sim_state.v3d, args); -} - static int v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args) { @@ -652,10 +642,7 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args) v3d_simulator_copy_in_handle(file, args->bo_handles[2]); v3d_simulator_copy_in_handle(file, args->bo_handles[3]); - if (sim_state.ver >= 41) - ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args); - else - ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args); + ret = v3d_X_simulator(submit_tfu_ioctl)(sim_state.v3d, args); v3d_simulator_copy_out_handle(file, args->bo_handles[0]); @@ -682,11 +669,8 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args) v3d_simulator_perfmon_switch(fd, args->perfmon_id); - if (sim_state.ver >= 41) - ret = 
v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args, - file->gmp->ofs); - else - ret = -1; + ret = v3d_X_simulator(submit_csd_ioctl)(sim_state.v3d, args, + file->gmp->ofs); for (int i = 0; i < args->bo_handle_count; i++) v3d_simulator_copy_out_handle(file, bo_handles[i]); @@ -716,7 +700,7 @@ v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args) perfmon->ncounters = args->ncounters; for (int i = 0; i < args->ncounters; i++) { - if (args->counters[i] >= V3D_PERFCNT_NUM) { + if (args->counters[i] >= sim_state.perfcnt_total) { ralloc_free(perfmon); return -EINVAL; } else { @@ -797,7 +781,7 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args) return 0; case DRM_IOCTL_V3D_GET_PARAM: - return v3d_simulator_get_param_ioctl(fd, args); + return v3d_X_simulator(get_param_ioctl)(sim_state.v3d, args); case DRM_IOCTL_GEM_CLOSE: return v3d_simulator_gem_close_ioctl(fd, args); @@ -880,10 +864,19 @@ v3d_simulator_init_global() util_dynarray_init(&sim_state.bin_oom, NULL); - if (sim_state.ver >= 41) - v3d41_simulator_init_regs(sim_state.v3d); - else - v3d33_simulator_init_regs(sim_state.v3d); + v3d_X_simulator(init_regs)(sim_state.v3d); + + switch(sim_state.ver) { + case 41: + case 42: + sim_state.perfcnt_total = 87; + break; + case 71: + sim_state.perfcnt_total = 93; + break; + default: + sim_state.perfcnt_total = 0; + } } struct v3d_simulator_file * diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h index ddb079c1455..92305634468 100644 --- a/src/broadcom/simulator/v3d_simulator.h +++ b/src/broadcom/simulator/v3d_simulator.h @@ -52,6 +52,32 @@ uint32_t v3d_simulator_get_mem_free(void); # define v3dX(x) v3d41_##x # include "v3dx_simulator.h" # undef v3dX + +# define v3dX(x) v3d71_##x +# include "v3dx_simulator.h" +# undef v3dX + #endif +/* Helper to call simulator ver specific functions */ +#define v3d_X_simulator(thing) ({ \ + __typeof(&v3d33_simulator_##thing) v3d_X_sim_thing;\ + switch (sim_state.ver) { \ + case 33: \ + case 40: \ + v3d_X_sim_thing = &v3d33_simulator_##thing; \ + break; \ + case 41: \ + case 42: \ + v3d_X_sim_thing = &v3d41_simulator_##thing; \ + break; \ + case 71: \ + v3d_X_sim_thing = &v3d71_simulator_##thing; \ + break; \ + default: \ + unreachable("Unsupported hardware generation"); \ + } \ + v3d_X_sim_thing; \ +}) + #endif diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c index c9322f0397b..01cf6b22663 100644 --- a/src/broadcom/simulator/v3dx_simulator.c +++ b/src/broadcom/simulator/v3dx_simulator.c @@ -46,11 +46,15 @@ #define HW_REGISTER_RO(x) (x) #define HW_REGISTER_RW(x) (x) -#if V3D_VERSION >= 41 -#include "libs/core/v3d/registers/4.1.35.0/v3d.h" +#if V3D_VERSION == 71 +#include "libs/core/v3d/registers/7.1.5.1/v3d.h" +#else +#if V3D_VERSION == 41 || V3D_VERSION == 42 +#include "libs/core/v3d/registers/4.2.14.0/v3d.h" #else #include "libs/core/v3d/registers/3.3.0.0/v3d.h" #endif +#endif #define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val) #define V3D_READ(reg) v3d_hw_read_reg(v3d, reg) @@ -178,38 +182,48 @@ v3d_flush_caches(struct v3d_hw *v3d) v3d_flush_l2t(v3d); } +#if V3D_VERSION < 71 +#define TFU_REG(NAME) V3D_TFU_ ## NAME +#else +#define TFU_REG(NAME) V3D_IFC_ ## NAME +#endif + + int v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d, struct drm_v3d_submit_tfu *args) { - int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET; - - V3D_WRITE(V3D_TFU_IIA, args->iia); - V3D_WRITE(V3D_TFU_IIS, args->iis); - V3D_WRITE(V3D_TFU_ICA, 
args->ica); - V3D_WRITE(V3D_TFU_IUA, args->iua); - V3D_WRITE(V3D_TFU_IOA, args->ioa); - V3D_WRITE(V3D_TFU_IOS, args->ios); - V3D_WRITE(V3D_TFU_COEF0, args->coef[0]); - V3D_WRITE(V3D_TFU_COEF1, args->coef[1]); - V3D_WRITE(V3D_TFU_COEF2, args->coef[2]); - V3D_WRITE(V3D_TFU_COEF3, args->coef[3]); - - V3D_WRITE(V3D_TFU_ICFG, args->icfg); - - while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) { + int last_vtct = V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET; + + V3D_WRITE(TFU_REG(IIA), args->iia); + V3D_WRITE(TFU_REG(IIS), args->iis); + V3D_WRITE(TFU_REG(ICA), args->ica); + V3D_WRITE(TFU_REG(IUA), args->iua); + V3D_WRITE(TFU_REG(IOA), args->ioa); +#if V3D_VERSION >= 71 + V3D_WRITE(TFU_REG(IOC), args->v71.ioc); +#endif + V3D_WRITE(TFU_REG(IOS), args->ios); + V3D_WRITE(TFU_REG(COEF0), args->coef[0]); + V3D_WRITE(TFU_REG(COEF1), args->coef[1]); + V3D_WRITE(TFU_REG(COEF2), args->coef[2]); + V3D_WRITE(TFU_REG(COEF3), args->coef[3]); + + V3D_WRITE(TFU_REG(ICFG), args->icfg); + + while ((V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET) == last_vtct) { v3d_hw_tick(v3d); } return 0; } -#if V3D_VERSION >= 41 int v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, struct drm_v3d_submit_csd *args, uint32_t gmp_ofs) { +#if V3D_VERSION >= 41 int last_completed_jobs = (V3D_READ(V3D_CSD_0_STATUS) & V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET); g_gmp_ofs = gmp_ofs; @@ -223,6 +237,9 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]); V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]); V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]); +#if V3D_VERSION >= 71 + V3D_WRITE(V3D_CSD_0_QUEUED_CFG7, 0); +#endif /* CFG0 kicks off the job */ V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]); @@ -239,8 +256,10 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d, v3d_flush_caches(v3d); return 0; -} +#else + return -1; #endif +} int v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d, @@ -310,16 +329,17 @@ v3d_isr_core(struct v3d_hw *v3d, return; } +#if V3D_VERSION <= 42 if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) { fprintf(stderr, "GMP violation at 0x%08x\n", V3D_READ(V3D_GMP_VIO_ADDR)); - abort(); } else { fprintf(stderr, "Unexpected ISR with core status 0x%08x\n", core_status); } abort(); +#endif } static void @@ -396,6 +416,18 @@ v3d_isr_hub(struct v3d_hw *v3d) } handle_mmu_interruptions(v3d, hub_status); + +#if V3D_VERSION == 71 + if (hub_status & V3D_HUB_CTL_INT_STS_INT_GMPV_SET) { + fprintf(stderr, "GMP violation at 0x%08x\n", + V3D_READ(V3D_GMP_VIO_ADDR)); + } else { + fprintf(stderr, + "Unexpected ISR with status 0x%08x\n", + hub_status); + } + abort(); +#endif } static void @@ -436,8 +468,11 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d) * for tracing. Perhaps we should evaluate to do the same here and add * some debug options. 
*/ - uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET | - V3D_CTL_0_INT_STS_INT_OUTOMEM_SET); + uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_OUTOMEM_SET; +#if V3D_VERSION <= 42 + core_interrupts |= V3D_CTL_0_INT_STS_INT_GMPV_SET; +#endif + V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts); V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts); @@ -447,6 +482,9 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d) V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */ V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */ +#if V3D_VERSION == 71 + hub_interrupts |= V3D_HUB_CTL_INT_STS_INT_GMPV_SET; +#endif V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts); V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts); @@ -509,7 +547,8 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d, #define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x)) #define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8) #define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \ - V3D_PCTR_0_SRC_N_SHIFT(x) + 6)) + V3D_PCTR_0_SRC_N_SHIFT(x) + \ + V3D_PCTR_0_SRC_0_3_PCTRS0_MSB)) #endif void diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build index ad032d832ad..182388a35b4 100644 --- a/src/broadcom/vulkan/meson.build +++ b/src/broadcom/vulkan/meson.build @@ -27,6 +27,7 @@ v3dv_entrypoints = custom_target( '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv', '--beta', with_vulkan_beta.to_string(), '--device-prefix', 'ver42', + '--device-prefix', 'ver71', ], depend_files : vk_entrypoints_gen_depend_files, ) @@ -64,13 +65,11 @@ files_per_version = files( 'v3dvx_pipeline.c', 'v3dvx_meta_common.c', 'v3dvx_pipeline.c', + 'v3dvx_query.c', 'v3dvx_queue.c', ) -# The vulkan driver only supports version >= 42, which is the version present in -# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d -# driver. 
-v3d_versions = ['42'] +v3d_versions = ['42', '71'] v3dv_flags = [] diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c index a14db073b4f..c6462735fe4 100644 --- a/src/broadcom/vulkan/v3dv_cmd_buffer.c +++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c @@ -348,6 +348,7 @@ job_compute_frame_tiling(struct v3dv_job *job, uint32_t layers, uint32_t render_target_count, uint8_t max_internal_bpp, + uint8_t total_color_bpp, bool msaa, bool double_buffer) { @@ -360,13 +361,16 @@ job_compute_frame_tiling(struct v3dv_job *job, tiling->render_target_count = render_target_count; tiling->msaa = msaa; tiling->internal_bpp = max_internal_bpp; + tiling->total_color_bpp = total_color_bpp; tiling->double_buffer = double_buffer; /* Double-buffer is incompatible with MSAA */ assert(!tiling->msaa || !tiling->double_buffer); - v3d_choose_tile_size(render_target_count, max_internal_bpp, - tiling->msaa, tiling->double_buffer, + v3d_choose_tile_size(&job->device->devinfo, + render_target_count, + max_internal_bpp, total_color_bpp, msaa, + tiling->double_buffer, &tiling->tile_width, &tiling->tile_height); tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width); @@ -457,6 +461,7 @@ v3dv_job_start_frame(struct v3dv_job *job, bool allocate_tile_state_now, uint32_t render_target_count, uint8_t max_internal_bpp, + uint8_t total_color_bpp, bool msaa) { assert(job); @@ -467,7 +472,7 @@ v3dv_job_start_frame(struct v3dv_job *job, const struct v3dv_frame_tiling *tiling = job_compute_frame_tiling(job, width, height, layers, render_target_count, max_internal_bpp, - msaa, false); + total_color_bpp, msaa, false); v3dv_cl_ensure_space_with_branch(&job->bcl, 256); v3dv_return_if_oom(NULL, job); @@ -528,6 +533,7 @@ cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer) job->frame_tiling.layers, job->frame_tiling.render_target_count, job->frame_tiling.internal_bpp, + job->frame_tiling.total_color_bpp, job->frame_tiling.msaa, true); @@ -1374,7 +1380,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer) } uint32_t att_count = 0; - VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */ + VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */ /* We only need to emit subpass clears as draw calls for color attachments * if the render area is not aligned to tile boundaries. 
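The next hunk threads the new total_color_bpp value from framebuffer_compute_internal_bpp_msaa into the frame setup. As a rough sketch (not the driver's actual helper), assuming v3d_internal_bpp_words() returns the number of 32-bit words per pixel for an internal bpp value, the aggregate could be derived per subpass like this, mirroring the 4 * v3d_internal_bpp_words(internal_bpp) passed by the single-attachment meta jobs later in this patch:

/* Hypothetical helper, for illustration only. */
static uint8_t
subpass_total_color_bpp(uint32_t color_count, const uint8_t *internal_bpp)
{
   uint8_t total = 0;
   for (uint32_t i = 0; i < color_count; i++)
      total += 4 * v3d_internal_bpp_words(internal_bpp[i]); /* bytes per pixel */
   return total;
}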
@@ -1672,10 +1678,11 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, const struct v3dv_framebuffer *framebuffer = state->framebuffer; - uint8_t internal_bpp; + uint8_t max_internal_bpp, total_color_bpp; bool msaa; v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa) - (framebuffer, state->attachments, subpass, &internal_bpp, &msaa); + (framebuffer, state->attachments, subpass, + &max_internal_bpp, &total_color_bpp, &msaa); /* From the Vulkan spec: * @@ -1699,7 +1706,8 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer, layers, true, false, subpass->color_count, - internal_bpp, + max_internal_bpp, + total_color_bpp, msaa); } @@ -2062,6 +2070,14 @@ cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer, } } + if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BOUNDS)) { + if (memcmp(&dest->depth_bounds, &src->depth_bounds, + sizeof(src->depth_bounds))) { + memcpy(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds)); + dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS; + } + } + if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) { if (dest->line_width != src->line_width) { dest->line_width = src->line_width; @@ -2131,39 +2147,6 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer, } } -/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */ -void -v3dv_viewport_compute_xform(const VkViewport *viewport, - float scale[3], - float translate[3]) -{ - float x = viewport->x; - float y = viewport->y; - float half_width = 0.5f * viewport->width; - float half_height = 0.5f * viewport->height; - double n = viewport->minDepth; - double f = viewport->maxDepth; - - scale[0] = half_width; - translate[0] = half_width + x; - scale[1] = half_height; - translate[1] = half_height + y; - - scale[2] = (f - n); - translate[2] = n; - - /* It seems that if the scale is small enough the hardware won't clip - * correctly so we work around this my choosing the smallest scale that - * seems to work. - * - * This case is exercised by CTS: - * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero - */ - const float min_abs_scale = 0.000009f; - if (fabs(scale[2]) < min_abs_scale) - scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale; -} - /* Considers the pipeline's negative_one_to_one state and applies it to the * current viewport transform if needed to produce the resulting Z translate * and scale parameters. 
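For reference, the viewport transform removed above (now provided per version through v3dv_X(viewport_compute_xform)) computes half-extent scales and translates plus an (f - n, n) depth mapping; e.g. for a viewport { x = 0, y = 0, width = 1920, height = 1080, minDepth = 0.0, maxDepth = 1.0 } it would produce:

/* scale     = { 960.0f, 540.0f, 1.0f }
 * translate = { 960.0f, 540.0f, 0.0f }
 * with |scale[2]| clamped to at least 0.000009f so near-zero depth ranges
 * still clip correctly (dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero).
 */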
@@ -2216,9 +2199,10 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer, viewportCount * sizeof(*pViewports)); for (uint32_t i = firstViewport; i < total_count; i++) { - v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i], - state->dynamic.viewport.scale[i], - state->dynamic.viewport.translate[i]); + v3dv_X(cmd_buffer->device, viewport_compute_xform) + (&state->dynamic.viewport.viewports[i], + state->dynamic.viewport.scale[i], + state->dynamic.viewport.translate[i]); } cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT; @@ -2699,6 +2683,7 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer) true, false, old_job->frame_tiling.render_target_count, old_job->frame_tiling.internal_bpp, + old_job->frame_tiling.total_color_bpp, true /* msaa */); v3dv_job_destroy(old_job); @@ -2963,6 +2948,9 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer, if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS)) v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer); + if (*dirty & V3DV_CMD_DIRTY_DEPTH_BOUNDS) + v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer); + if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS)) v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer); @@ -3410,9 +3398,11 @@ v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds) { - /* We do not support depth bounds testing so we just ignore this. We are - * already asserting that pipelines don't enable the feature anyway. - */ + V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds; + cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds; + cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS; } VKAPI_ATTR void VKAPI_CALL @@ -3844,6 +3834,7 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer) void v3dv_cmd_buffer_rewrite_indirect_csd_job( + struct v3dv_device *device, struct v3dv_csd_indirect_cpu_job_info *info, const uint32_t *wg_counts) { @@ -3863,8 +3854,15 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job( submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT; submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT; - submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) * - (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1; + uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) * + (wg_counts[0] * wg_counts[1] * wg_counts[2]); + /* V3D 7.1.6 and later don't subtract 1 from the number of batches */ + if (device->devinfo.ver < 71 || + (device->devinfo.ver == 71 && device->devinfo.rev < 6)) { + submit->cfg[4] = num_batches - 1; + } else { + submit->cfg[4] = num_batches; + } assert(submit->cfg[4] != ~0); if (info->needs_wg_uniform_rewrite) { @@ -3897,6 +3895,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, uint32_t **wg_uniform_offsets_out, uint32_t *wg_size_out) { + struct v3dv_device *device = cmd_buffer->device; struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline; assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); struct v3dv_shader_variant *cs_variant = @@ -3955,18 +3954,26 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, if (wg_size_out) *wg_size_out = wg_size; - submit->cfg[4] = num_batches - 1; + /* V3D 7.1.6 and later don't subtract 1 from the number of batches */ + if (device->devinfo.ver < 71 || + (device->devinfo.ver == 71 && device->devinfo.rev < 6)) { + submit->cfg[4] = num_batches - 1; + } else { + submit->cfg[4] = 
num_batches; + } assert(submit->cfg[4] != ~0); assert(pipeline->shared_data->assembly_bo); struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo; submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset; - submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; if (cs_variant->prog_data.base->single_seg) submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; if (cs_variant->prog_data.base->threads == 4) submit->cfg[5] |= V3D_CSD_CFG5_THREADING; + /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */ + if (device->devinfo.ver < 71) + submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; if (cs_variant->prog_data.cs->shared_size > 0) { job->csd.shared_memory = diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c index 3bad290e8c5..d013edaa63d 100644 --- a/src/broadcom/vulkan/v3dv_device.c +++ b/src/broadcom/vulkan/v3dv_device.c @@ -91,7 +91,7 @@ static const struct vk_instance_extension_table instance_extensions = { .KHR_display = true, .KHR_get_display_properties2 = true, .EXT_direct_mode_display = true, - .EXT_acquire_drm_display = true, + .EXT_acquire_drm_display = false, #endif .KHR_external_fence_capabilities = true, .KHR_external_memory_capabilities = true, @@ -214,7 +214,7 @@ get_features(const struct v3dv_physical_device *physical_device, *features = (struct vk_features) { /* Vulkan 1.0 */ .robustBufferAccess = true, /* This feature is mandatory */ - .fullDrawIndexUint32 = false, /* Only available since V3D 4.4.9.1 */ + .fullDrawIndexUint32 = physical_device->devinfo.ver >= 71, .imageCubeArray = true, .independentBlend = true, .geometryShader = true, @@ -224,10 +224,10 @@ get_features(const struct v3dv_physical_device *physical_device, .logicOp = true, .multiDrawIndirect = false, .drawIndirectFirstInstance = true, - .depthClamp = false, /* Only available since V3D 4.5.1.1 */ + .depthClamp = physical_device->devinfo.ver >= 71, .depthBiasClamp = true, .fillModeNonSolid = true, - .depthBounds = false, /* Only available since V3D 4.3.16.2 */ + .depthBounds = physical_device->devinfo.ver >= 71, .wideLines = true, .largePoints = true, .alphaToOne = true, @@ -304,7 +304,7 @@ get_features(const struct v3dv_physical_device *physical_device, * problematic, we would always have to scalarize. Overall, this would * not lead to best performance so let's just not support it. */ - .scalarBlockLayout = false, + .scalarBlockLayout = physical_device->devinfo.ver >= 71, /* This tells applications 2 things: * * 1. If they can select just one aspect for barriers. For us barriers @@ -1123,8 +1123,10 @@ create_physical_device(struct v3dv_instance *instance, device->next_program_id = 0; ASSERTED int len = - asprintf(&device->name, "V3D %d.%d", - device->devinfo.ver / 10, device->devinfo.ver % 10); + asprintf(&device->name, "V3D %d.%d.%d", + device->devinfo.ver / 10, + device->devinfo.ver % 10, + device->devinfo.rev); assert(len != -1); v3dv_physical_device_init_disk_cache(device); @@ -1212,6 +1214,12 @@ create_physical_device(struct v3dv_instance *instance, list_addtail(&device->vk.link, &instance->vk.physical_devices.list); + if (device->devinfo.ver != 42) { + fprintf(stderr, "WARNING: v3dv support for hw version %i is neither " + "a complete nor a conformant Vulkan implementation. 
Testing " + "use only.\n", device->devinfo.ver); + } + return VK_SUCCESS; fail: @@ -1279,7 +1287,8 @@ enumerate_devices(struct vk_instance *vk_instance) if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) { char **compat = devices[i]->deviceinfo.platform->compatible; while (*compat) { - if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) { + if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 || + strncmp(*compat, "brcm,2712-v3d", 13) == 0) { v3d_idx = i; break; } @@ -1288,8 +1297,9 @@ enumerate_devices(struct vk_instance *vk_instance) } else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) { char **compat = devices[i]->deviceinfo.platform->compatible; while (*compat) { - if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 || - strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) { + if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 || + strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 || + strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) { vc4_idx = i; break; } @@ -1326,6 +1336,8 @@ v3dv_physical_device_device_id(struct v3dv_physical_device *dev) switch (dev->devinfo.ver) { case 42: return 0xBE485FD3; /* Broadcom deviceID for 2711 */ + case 71: + return 0x55701C33; /* Broadcom deviceID for 2712 */ default: unreachable("Unsupported V3D version"); } @@ -1354,6 +1366,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, const VkSampleCountFlags supported_sample_counts = VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT; + const uint8_t max_rts = V3D_MAX_RENDER_TARGETS(pdevice->devinfo.ver); + struct timespec clock_res; clock_getres(CLOCK_MONOTONIC, &clock_res); const float timestamp_period = @@ -1424,7 +1438,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxFragmentInputComponents = max_varying_components, .maxFragmentOutputAttachments = 4, .maxFragmentDualSrcAttachments = 0, - .maxFragmentCombinedOutputResources = MAX_RENDER_TARGETS + + .maxFragmentCombinedOutputResources = max_rts + MAX_STORAGE_BUFFERS + MAX_STORAGE_IMAGES, @@ -1437,7 +1451,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .subPixelPrecisionBits = V3D_COORD_SHIFT, .subTexelPrecisionBits = 8, .mipmapPrecisionBits = 8, - .maxDrawIndexedIndexValue = 0x00ffffff, + .maxDrawIndexedIndexValue = pdevice->devinfo.ver >= 71 ? 
+ 0xffffffff : 0x00ffffff, .maxDrawIndirectCount = 0x7fffffff, .maxSamplerLodBias = 14.0f, .maxSamplerAnisotropy = 16.0f, @@ -1464,7 +1479,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .framebufferDepthSampleCounts = supported_sample_counts, .framebufferStencilSampleCounts = supported_sample_counts, .framebufferNoAttachmentsSampleCounts = supported_sample_counts, - .maxColorAttachments = MAX_RENDER_TARGETS, + .maxColorAttachments = max_rts, .sampledImageColorSampleCounts = supported_sample_counts, .sampledImageIntegerSampleCounts = supported_sample_counts, .sampledImageDepthSampleCounts = supported_sample_counts, @@ -2031,7 +2046,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0, device->instance->default_pipeline_cache_enabled); device->default_attribute_float = - v3dv_pipeline_create_default_attribute_values(device, NULL); + v3dv_X(device, create_default_attribute_values)(device, NULL); device->device_address_mem_ctx = ralloc_context(NULL); util_dynarray_init(&device->device_address_bo_list, @@ -2975,7 +2990,7 @@ v3dv_CreateSampler(VkDevice _device, } } - v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo, bc_info); + v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info); *pSampler = v3dv_sampler_to_handle(sampler); diff --git a/src/broadcom/vulkan/v3dv_image.c b/src/broadcom/vulkan/v3dv_image.c index ebbd60e4c03..e01e2e1bd19 100644 --- a/src/broadcom/vulkan/v3dv_image.c +++ b/src/broadcom/vulkan/v3dv_image.c @@ -671,7 +671,6 @@ create_image_view(struct v3dv_device *device, * makes sense to implement swizzle composition using VkSwizzle directly. */ VkFormat format; - uint8_t image_view_swizzle[4]; if (pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT && range->aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { format = VK_FORMAT_R8G8B8A8_UINT; @@ -682,11 +681,11 @@ create_image_view(struct v3dv_device *device, vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, view_swizzle); util_format_compose_swizzles(stencil_aspect_swizzle, view_swizzle, - image_view_swizzle); + iview->view_swizzle); } else { format = pCreateInfo->format; vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, - image_view_swizzle); + iview->view_swizzle); } iview->vk.view_format = format; @@ -711,7 +710,7 @@ create_image_view(struct v3dv_device *device, const uint8_t *format_swizzle = v3dv_get_format_swizzle(device, format, plane); - util_format_compose_swizzles(format_swizzle, image_view_swizzle, + util_format_compose_swizzles(format_swizzle, iview->view_swizzle, iview->planes[plane].swizzle); iview->planes[plane].swap_rb = v3dv_format_swizzle_needs_rb_swap(format_swizzle); diff --git a/src/broadcom/vulkan/v3dv_limits.h b/src/broadcom/vulkan/v3dv_limits.h index 9cda9f0d6d2..8ac99724105 100644 --- a/src/broadcom/vulkan/v3dv_limits.h +++ b/src/broadcom/vulkan/v3dv_limits.h @@ -50,8 +50,6 @@ #define MAX_DYNAMIC_BUFFERS (MAX_DYNAMIC_UNIFORM_BUFFERS + \ MAX_DYNAMIC_STORAGE_BUFFERS) -#define MAX_RENDER_TARGETS 4 - #define MAX_MULTIVIEW_VIEW_COUNT 16 /* These are tunable parameters in the HW design, but all the V3D diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c index a200298a898..0b64653000d 100644 --- a/src/broadcom/vulkan/v3dv_meta_clear.c +++ b/src/broadcom/vulkan/v3dv_meta_clear.c @@ -127,6 +127,7 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, v3dv_job_start_frame(job, width, height, max_layer, false, true, 1, internal_bpp, + 4 * 
v3d_internal_bpp_words(internal_bpp), image->vk.samples > VK_SAMPLE_COUNT_1_BIT); struct v3dv_meta_framebuffer framebuffer; @@ -747,7 +748,7 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx, uint32_t bit_offset = 0; key |= rt_idx; - bit_offset += 2; + bit_offset += 3; key |= ((uint64_t) format) << bit_offset; bit_offset += 32; @@ -1189,9 +1190,11 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer, { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - /* We can only clear attachments in the current subpass */ - assert(attachmentCount <= 5); /* 4 color + D/S */ + /* We can have at most max_color_RTs + 1 D/S attachments */ + assert(attachmentCount <= + V3D_MAX_RENDER_TARGETS(cmd_buffer->device->devinfo.ver) + 1); + /* We can only clear attachments in the current subpass */ struct v3dv_render_pass *pass = cmd_buffer->state.pass; assert(cmd_buffer->state.subpass_idx < pass->subpass_count); diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c index c0ec888b8c7..2d30c611e17 100644 --- a/src/broadcom/vulkan/v3dv_meta_copy.c +++ b/src/broadcom/vulkan/v3dv_meta_copy.c @@ -453,8 +453,9 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer, const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, false, true, - 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, @@ -1323,8 +1324,8 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, - false, true, 1, internal_bpp, + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), src->vk.samples > VK_SAMPLE_COUNT_1_BIT); struct v3dv_meta_framebuffer framebuffer; @@ -1978,8 +1979,9 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, false, true, - 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, @@ -4884,8 +4886,9 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, (fb_format, region->srcSubresource.aspectMask, &internal_type, &internal_bpp); - v3dv_job_start_frame(job, width, height, num_layers, false, true, - 1, internal_bpp, true); + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + true); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c index 20f5014268d..0583faf6f9a 100644 --- a/src/broadcom/vulkan/v3dv_pass.c +++ b/src/broadcom/vulkan/v3dv_pass.c @@ -236,11 +236,13 @@ v3dv_CreateRenderPass2(VkDevice _device, /* 
GFXH-1461: if depth is cleared but stencil is loaded (or vice versa), * the clear might get lost. If a subpass has this then we can't emit - * the clear using the TLB and we have to do it as a draw call. + * the clear using the TLB and we have to do it as a draw call. This + * issue is fixed since V3D 4.3.18. * * FIXME: separate stencil. */ - if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { + if (device->devinfo.ver == 42 && + subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { struct v3dv_render_pass_attachment *att = &pass->attachments[subpass->ds_attachment.attachment]; if (att->desc.format == VK_FORMAT_D24_UNORM_S8_UINT) { @@ -320,11 +322,12 @@ subpass_get_granularity(struct v3dv_device *device, /* Granularity is defined by the tile size */ assert(subpass_idx < pass->subpass_count); struct v3dv_subpass *subpass = &pass->subpasses[subpass_idx]; - const uint32_t color_attachment_count = subpass->color_count; + const uint32_t color_count = subpass->color_count; bool msaa = false; - uint32_t max_bpp = 0; - for (uint32_t i = 0; i < color_attachment_count; i++) { + uint32_t max_internal_bpp = 0; + uint32_t total_color_bpp = 0; + for (uint32_t i = 0; i < color_count; i++) { uint32_t attachment_idx = subpass->color_attachments[i].attachment; if (attachment_idx == VK_ATTACHMENT_UNUSED) continue; @@ -337,7 +340,8 @@ subpass_get_granularity(struct v3dv_device *device, v3dv_X(device, get_internal_type_bpp_for_output_format) (format->planes[0].rt_type, &internal_type, &internal_bpp); - max_bpp = MAX2(max_bpp, internal_bpp); + max_internal_bpp = MAX2(max_internal_bpp, internal_bpp); + total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); if (desc->samples > VK_SAMPLE_COUNT_1_BIT) msaa = true; @@ -347,7 +351,8 @@ subpass_get_granularity(struct v3dv_device *device, * heuristics so we choose a conservative granularity here, with it disabled. */ uint32_t width, height; - v3d_choose_tile_size(color_attachment_count, max_bpp, msaa, + v3d_choose_tile_size(&device->devinfo, color_count, + max_internal_bpp, total_color_bpp, msaa, false /* double-buffer */, &width, &height); *granularity = (VkExtent2D) { .width = width, diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c index 99fe8c16bfa..d6629c9a4a0 100644 --- a/src/broadcom/vulkan/v3dv_pipeline.c +++ b/src/broadcom/vulkan/v3dv_pipeline.c @@ -2608,13 +2608,8 @@ v3dv_dynamic_state_mask(VkDynamicState state) return V3DV_DYNAMIC_LINE_WIDTH; case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT: return V3DV_DYNAMIC_COLOR_WRITE_ENABLE; - - /* Depth bounds testing is not available in in V3D 4.2 so here we are just - * ignoring this dynamic state. We are already asserting at pipeline creation - * time that depth bounds testing is not enabled. 
- */ case VK_DYNAMIC_STATE_DEPTH_BOUNDS: - return 0; + return V3DV_DYNAMIC_DEPTH_BOUNDS; default: unreachable("Unhandled dynamic state"); @@ -2632,6 +2627,7 @@ pipeline_init_dynamic_state( const VkPipelineColorWriteCreateInfoEXT *pColorWriteState) { /* Initialize to default values */ + const struct v3d_device_info *devinfo = &pipeline->device->devinfo; struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state; memset(dynamic, 0, sizeof(*dynamic)); dynamic->stencil_compare_mask.front = ~0; @@ -2639,7 +2635,9 @@ pipeline_init_dynamic_state( dynamic->stencil_write_mask.front = ~0; dynamic->stencil_write_mask.back = ~0; dynamic->line_width = 1.0f; - dynamic->color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1; + dynamic->color_write_enable = + (1ull << (4 * V3D_MAX_RENDER_TARGETS(devinfo->ver))) - 1; + dynamic->depth_bounds.max = 1.0f; /* Create a mask of enabled dynamic states */ uint32_t dynamic_states = 0; @@ -2661,9 +2659,10 @@ pipeline_init_dynamic_state( pViewportState->viewportCount); for (uint32_t i = 0; i < dynamic->viewport.count; i++) { - v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i], - dynamic->viewport.scale[i], - dynamic->viewport.translate[i]); + v3dv_X(pipeline->device, viewport_compute_xform) + (&dynamic->viewport.viewports[i], + dynamic->viewport.scale[i], + dynamic->viewport.translate[i]); } } @@ -2691,6 +2690,11 @@ pipeline_init_dynamic_state( dynamic->stencil_reference.front = pDepthStencilState->front.reference; dynamic->stencil_reference.back = pDepthStencilState->back.reference; } + + if (!(dynamic_states & V3DV_DYNAMIC_DEPTH_BOUNDS)) { + dynamic->depth_bounds.min = pDepthStencilState->minDepthBounds; + dynamic->depth_bounds.max = pDepthStencilState->maxDepthBounds; + } } if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) { @@ -2802,62 +2806,6 @@ pipeline_set_ez_state(struct v3dv_pipeline *pipeline, } } -static bool -pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) -{ - for (uint8_t i = 0; i < pipeline->va_count; i++) { - if (vk_format_is_int(pipeline->va[i].vk_format)) - return true; - } - return false; -} - -/* @pipeline can be NULL. We assume in that case that all the attributes have - * a float format (we only create an all-float BO once and we reuse it with - * all float pipelines), otherwise we look at the actual type of each - * attribute used with the specific pipeline passed in. - */ -struct v3dv_bo * -v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, - struct v3dv_pipeline *pipeline) -{ - uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; - struct v3dv_bo *bo; - - bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true); - - if (!bo) { - fprintf(stderr, "failed to allocate memory for the default " - "attribute values\n"); - return NULL; - } - - bool ok = v3dv_bo_map(device, bo, size); - if (!ok) { - fprintf(stderr, "failed to map default attribute values buffer\n"); - return false; - } - - uint32_t *attrs = bo->map; - uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0; - for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { - attrs[i * 4 + 0] = 0; - attrs[i * 4 + 1] = 0; - attrs[i * 4 + 2] = 0; - VkFormat attr_format = - pipeline != NULL ? 
pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED; - if (i < va_count && vk_format_is_int(attr_format)) { - attrs[i * 4 + 3] = 1; - } else { - attrs[i * 4 + 3] = fui(1.0); - } - } - - v3dv_bo_unmap(device, bo); - - return bo; -} - static void pipeline_set_sample_mask(struct v3dv_pipeline *pipeline, const VkPipelineMultisampleStateCreateInfo *ms_info) @@ -2960,7 +2908,9 @@ pipeline_init(struct v3dv_pipeline *pipeline, /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that * feature and it shouldn't be used by any pipeline. */ - assert(!ds_info || !ds_info->depthBoundsTestEnable); + assert(device->devinfo.ver >= 71 || + !ds_info || !ds_info->depthBoundsTestEnable); + pipeline->depth_bounds_test_enabled = ds_info && ds_info->depthBoundsTestEnable; enable_depth_bias(pipeline, rs_info); @@ -2992,9 +2942,10 @@ pipeline_init(struct v3dv_pipeline *pipeline, v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info); - if (pipeline_has_integer_vertex_attrib(pipeline)) { + if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) { pipeline->default_attribute_values = - v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline); + v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline); + if (!pipeline->default_attribute_values) return VK_ERROR_OUT_OF_DEVICE_MEMORY; } else { diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h index c6707211529..89e2f1c7e5c 100644 --- a/src/broadcom/vulkan/v3dv_private.h +++ b/src/broadcom/vulkan/v3dv_private.h @@ -123,6 +123,9 @@ struct v3d_simulator_file; /* Minimum required by the Vulkan 1.1 spec */ #define MAX_MEMORY_ALLOCATION_SIZE (1ull << 30) +/* Maximum performance counters number */ +#define V3D_MAX_PERFCNT 93 + struct v3dv_physical_device { struct vk_physical_device vk; @@ -581,6 +584,10 @@ struct v3dv_device { * being float being float, allowing us to reuse the same BO for all * pipelines matching this requirement. Pipelines that need integer * attributes will create their own BO. + * + * Note that since v71 the default attribute values are not needed, so this + * can be NULL. + * */ struct v3dv_bo *default_attribute_float; @@ -772,6 +779,8 @@ struct v3dv_image_view { const struct v3dv_format *format; + uint8_t view_swizzle[4]; + uint8_t plane_count; struct { uint8_t image_plane; @@ -782,8 +791,8 @@ struct v3dv_image_view { uint32_t internal_type; uint32_t offset; - /* Precomputed (composed from createinfo->components and formar swizzle) - * swizzles to pass in to the shader key. + /* Precomputed swizzle (composed from the view swizzle and the format + * swizzle). * * This could be also included on the descriptor bo, but the shader state * packet doesn't need it on a bo, so we can just avoid a memory copy @@ -946,6 +955,7 @@ struct v3dv_frame_tiling { uint32_t layers; uint32_t render_target_count; uint32_t internal_bpp; + uint32_t total_color_bpp; bool msaa; bool double_buffer; uint32_t tile_width; @@ -1040,7 +1050,8 @@ enum v3dv_dynamic_state_bits { V3DV_DYNAMIC_DEPTH_BIAS = 1 << 6, V3DV_DYNAMIC_LINE_WIDTH = 1 << 7, V3DV_DYNAMIC_COLOR_WRITE_ENABLE = 1 << 8, - V3DV_DYNAMIC_ALL = (1 << 9) - 1, + V3DV_DYNAMIC_DEPTH_BOUNDS = 1 << 9, + V3DV_DYNAMIC_ALL = (1 << 10) - 1, }; /* Flags for dirty pipeline state. 
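The total_color_bpp field added above is computed the same way throughout this patch (v3dv_pass.c, v3dvx_device.c and the meta copy/clear paths): each color attachment contributes 4 * v3d_internal_bpp_words(internal_bpp) to the value that v3d_choose_tile_size() now receives next to the maximum internal bpp. A minimal illustrative sketch of that accumulation follows; it is not part of the patch, the helper name is hypothetical, and the per-RT figures assume v3d_internal_bpp_words() returns 1, 2 or 4 words for 32/64/128-bit internal bpp.

/* Sketch only: accumulate the per-subpass total color bpp from the
 * internal bpp of each color render target, as the hunks above do.
 */
static uint32_t
total_color_bpp_sketch(const uint32_t *internal_bpps, uint32_t color_count)
{
   uint32_t total_color_bpp = 0;
   for (uint32_t i = 0; i < color_count; i++)
      total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpps[i]);
   return total_color_bpp; /* 4, 8 or 16 per render target (assumed) */
}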
@@ -1065,6 +1076,7 @@ enum v3dv_cmd_dirty_bits { V3DV_CMD_DIRTY_LINE_WIDTH = 1 << 16, V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 17, V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE = 1 << 18, + V3DV_CMD_DIRTY_DEPTH_BOUNDS = 1 << 19, }; struct v3dv_dynamic_state { @@ -1101,6 +1113,11 @@ struct v3dv_dynamic_state { float slope_factor; } depth_bias; + struct { + float min; + float max; + } depth_bounds; + float line_width; uint32_t color_write_enable; @@ -1196,7 +1213,7 @@ struct v3dv_timestamp_query_cpu_job_info { }; /* Number of perfmons required to handle all supported performance counters */ -#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_PERFCNT_NUM, \ +#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_MAX_PERFCNT, \ DRM_V3D_MAX_PERF_COUNTERS) struct v3dv_perf_query { @@ -1369,6 +1386,7 @@ void v3dv_job_start_frame(struct v3dv_job *job, bool allocate_tile_state_now, uint32_t render_target_count, uint8_t max_internal_bpp, + uint8_t total_color_bpp, bool msaa); bool v3dv_job_type_is_gpu(struct v3dv_job *job); @@ -1667,7 +1685,7 @@ struct v3dv_query_pool { /* Only used with performance queries */ struct { uint32_t ncounters; - uint8_t counters[V3D_PERFCNT_NUM]; + uint8_t counters[V3D_MAX_PERFCNT]; /* V3D has a limit on the number of counters we can track in a * single performance monitor, so if too many counters are requested @@ -1803,7 +1821,8 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer, void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer, struct drm_v3d_submit_tfu *tfu); -void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info, +void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_device *device, + struct v3dv_csd_indirect_cpu_job_info *info, const uint32_t *wg_counts); void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer, @@ -2289,11 +2308,15 @@ struct v3dv_pipeline { unsigned char sha1[20]; /* In general we can reuse v3dv_device->default_attribute_float, so note - * that the following can be NULL. + * that the following can be NULL. In 7.x this is not used, so it will be + * NULL. * * FIXME: the content of this BO will be small, so it could be improved to * be uploaded to a common BO. But as in most cases it will be NULL, it is * not a priority. + * + * Note that since v71 the default attribute values are not needed, so this + * can be NULL. 
*/ struct v3dv_bo *default_attribute_values; @@ -2323,6 +2346,9 @@ struct v3dv_pipeline { bool is_z16; } depth_bias; + /* Depth bounds */ + bool depth_bounds_test_enabled; + struct { void *mem_ctx; struct util_dynarray data; /* Array of v3dv_pipeline_executable_data */ @@ -2338,6 +2364,13 @@ struct v3dv_pipeline { uint8_t stencil_cfg[2][V3DV_STENCIL_CFG_LENGTH]; }; +static inline bool +v3dv_texture_shader_state_has_rb_swap_reverse_bits(const struct v3dv_device *device) +{ + return device->devinfo.ver > 71 || + (device->devinfo.ver == 71 && device->devinfo.rev >= 5); +} + static inline VkPipelineBindPoint v3dv_pipeline_get_binding_point(struct v3dv_pipeline *pipeline) { @@ -2500,10 +2533,6 @@ void v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline, struct v3dv_pipeline_cache *cache); -struct v3dv_bo * -v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, - struct v3dv_pipeline *pipeline); - VkResult v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device, nir_shader *nir, @@ -2608,12 +2637,32 @@ u64_compare(const void *key1, const void *key2) case 42: \ v3d_X_thing = &v3d42_##thing; \ break; \ + case 71: \ + v3d_X_thing = &v3d71_##thing; \ + break; \ default: \ unreachable("Unsupported hardware generation"); \ } \ v3d_X_thing; \ }) +/* Helper to get hw-specific macro values */ +#define V3DV_X(device, thing) ({ \ + __typeof(V3D42_##thing) V3D_X_THING; \ + switch (device->devinfo.ver) { \ + case 42: \ + V3D_X_THING = V3D42_##thing; \ + break; \ + case 71: \ + V3D_X_THING = V3D71_##thing; \ + break; \ + default: \ + unreachable("Unsupported hardware generation"); \ + } \ + V3D_X_THING; \ +}) + + /* v3d_macros from common requires v3dX and V3DX definitions. Below we need to * define v3dX for each version supported, because when we compile code that @@ -2626,6 +2675,10 @@ u64_compare(const void *key1, const void *key2) # define v3dX(x) v3d42_##x # include "v3dvx_private.h" # undef v3dX + +# define v3dX(x) v3d71_##x +# include "v3dvx_private.h" +# undef v3dX #endif #ifdef ANDROID diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c index 3284c467d74..deb7821f02b 100644 --- a/src/broadcom/vulkan/v3dv_query.c +++ b/src/broadcom/vulkan/v3dv_query.c @@ -23,7 +23,6 @@ #include "v3dv_private.h" -#include "common/v3d_performance_counters.h" #include "util/timespec.h" #include "compiler/nir/nir_builder.h" @@ -48,7 +47,7 @@ kperfmon_create(struct v3dv_device *device, DRM_IOCTL_V3D_PERFMON_CREATE, &req); if (ret) - fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret)); + fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret)); pool->queries[query].perf.kperfmon_ids[i] = req.id; } @@ -303,7 +302,6 @@ v3dv_CreateQueryPool(VkDevice _device, QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); assert(pq_info); - assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM); pool->perfmon.ncounters = pq_info->counterIndexCount; for (uint32_t i = 0; i < pq_info->counterIndexCount; i++) @@ -592,7 +590,7 @@ write_performance_query_result(struct v3dv_device *device, assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); struct v3dv_query *q = &pool->queries[query]; - uint64_t counter_values[V3D_PERFCNT_NUM]; + uint64_t counter_values[V3D_MAX_PERFCNT]; for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { struct drm_v3d_perfmon_get_values req = { @@ -1284,40 +1282,11 @@ v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( VkPerformanceCounterKHR *pCounters, 
VkPerformanceCounterDescriptionKHR *pCounterDescriptions) { - uint32_t desc_count = *pCounterCount; + V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice); - VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, - out, pCounters, pCounterCount); - VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, - out_desc, pCounterDescriptions, &desc_count); - - for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) { - vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { - counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR; - counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; - counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR; - - unsigned char sha1_result[20]; - _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME], - strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]), - sha1_result); - - memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); - } - - vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, - &out_desc, desc) { - desc->flags = 0; - snprintf(desc->name, sizeof(desc->name), "%s", - v3d_performance_counters[i][V3D_PERFCNT_NAME]); - snprintf(desc->category, sizeof(desc->category), "%s", - v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]); - snprintf(desc->description, sizeof(desc->description), "%s", - v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]); - } - } - - return vk_outarray_status(&out); + return v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount, + pCounters, + pCounterDescriptions); } VKAPI_ATTR void VKAPI_CALL diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c index b4aae195180..429d14a9196 100644 --- a/src/broadcom/vulkan/v3dv_queue.c +++ b/src/broadcom/vulkan/v3dv_queue.c @@ -408,7 +408,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue, if (memcmp(group_counts, info->csd_job->csd.wg_count, sizeof(info->csd_job->csd.wg_count)) != 0) { - v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts); + v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts); } return VK_SUCCESS; diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c index 72fa9a1b39c..0e681cc4ee2 100644 --- a/src/broadcom/vulkan/v3dv_uniforms.c +++ b/src/broadcom/vulkan/v3dv_uniforms.c @@ -497,7 +497,8 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect); struct v3dv_cl_out *uniforms = cl_start(&job->indirect); - + float clipper_xy_granularity = + V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY); for (int i = 0; i < uinfo->count; i++) { uint32_t data = uinfo->data[i]; @@ -520,11 +521,11 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, break; case QUNIFORM_VIEWPORT_X_SCALE: - cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f); + cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity); break; case QUNIFORM_VIEWPORT_Y_SCALE: - cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f); + cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity); break; case QUNIFORM_VIEWPORT_Z_OFFSET: { diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c index f182b790d36..1bd634f5027 100644 --- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c +++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c @@ -56,10 +56,15 @@ v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job) }; config.width_in_pixels = tiling->width; config.height_in_pixels = 
tiling->height; +#if V3D_VERSION == 42 config.number_of_render_targets = MAX2(tiling->render_target_count, 1); config.multisample_mode_4x = tiling->msaa; config.double_buffer_in_non_ms_mode = tiling->double_buffer; config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; +#endif +#if V3D_VERSION >= 71 + unreachable("HW generation 71 not supported yet."); +#endif uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr; cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config); @@ -82,10 +87,22 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job, cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { config.width_in_pixels = tiling->width; config.height_in_pixels = tiling->height; +#if V3D_VERSION == 42 config.number_of_render_targets = MAX2(tiling->render_target_count, 1); config.multisample_mode_4x = tiling->msaa; config.double_buffer_in_non_ms_mode = tiling->double_buffer; config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = log2_tile_size(tiling->tile_width); + config.log2_tile_height = log2_tile_size(tiling->tile_height); + /* FIXME: ideally we would like next assert on the packet header (as is + * general, so also applies to GL). We would need to expand + * gen_pack_header for that. + */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); +#endif } /* There's definitely nothing in the VCD cache we want. */ @@ -345,6 +362,11 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer, iview->vk.base_array_layer + layer, image_plane); + /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it + * is broken in earlier V3D versions. + */ + assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear); + cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = buffer; store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset); @@ -467,6 +489,30 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, const VkImageAspectFlags aspects = vk_format_aspects(ds_attachment->desc.format); +#if V3D_VERSION <= 42 + /* GFXH-1689: The per-buffer store command's clear buffer bit is broken + * for depth/stencil. + * + * There used to be some confusion regarding the Clear Tile Buffers + * Z/S bit also being broken, but we confirmed with Broadcom that this + * is not the case, it was just that some other hardware bugs (that we + * need to work around, such as GFXH-1461) could cause this bit to behave + * incorrectly. + * + * There used to be another issue where the RTs bit in the Clear Tile + * Buffers packet also cleared Z/S, but Broadcom confirmed this is + * fixed since V3D 4.1. + * + * So if we have to emit a clear of depth or stencil we don't use + * the per-buffer store clear bit, even if we need to store the buffers, + * instead we always have to use the Clear Tile Buffers Z/S bit. + * If we have configured the job to do early Z/S clearing, then we + * don't want to emit any Clear Tile Buffers command at all here. + * + * Note that GFXH-1689 is not reproduced in the simulator, where + * using the clear buffer bit in depth/stencil stores works fine. + */ + /* Only clear once on the first subpass that uses the attachment */ uint32_t ds_first_subpass = !state->pass->multiview_enabled ? 
ds_attachment->first_subpass : @@ -486,6 +532,17 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, ds_attachment->desc.stencilLoadOp, subpass->do_stencil_clear_with_draw); + use_global_zs_clear = !state->job->early_zs_clear && + (needs_depth_clear || needs_stencil_clear); +#endif +#if V3D_VERSION >= 71 + /* The store command's clear buffer bit cannot be used for Z/Stencil: + * since V3D 4.5.6, Z/S buffers are automatically cleared between tiles, + * so we don't want to emit redundant clears here. + */ + use_global_zs_clear = false; +#endif + /* Skip the last store if it is not required */ uint32_t ds_last_subpass = !pass->multiview_enabled ? ds_attachment->last_subpass : @@ -528,30 +585,6 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, needs_stencil_store = subpass->resolve_stencil; } - /* GFXH-1689: The per-buffer store command's clear buffer bit is broken - * for depth/stencil. - * - * There used to be some confusion regarding the Clear Tile Buffers - * Z/S bit also being broken, but we confirmed with Broadcom that this - * is not the case, it was just that some other hardware bugs (that we - * need to work around, such as GFXH-1461) could cause this bit to behave - * incorrectly. - * - * There used to be another issue where the RTs bit in the Clear Tile - * Buffers packet also cleared Z/S, but Broadcom confirmed this is - * fixed since V3D 4.1. - * - * So if we have to emit a clear of depth or stencil we don't use - * the per-buffer store clear bit, even if we need to store the buffers, - * instead we always have to use the Clear Tile Buffers Z/S bit. - * If we have configured the job to do early Z/S clearing, then we - * don't want to emit any Clear Tile Buffers command at all here. - * - * Note that GFXH-1689 is not reproduced in the simulator, where - * using the clear buffer bit in depth/stencil stores works fine. - */ - use_global_zs_clear = !state->job->early_zs_clear && - (needs_depth_clear || needs_stencil_clear); if (needs_depth_store || needs_stencil_store) { const uint32_t zs_buffer = v3dv_zs_buffer(needs_depth_store, needs_stencil_store); @@ -649,10 +682,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, * bit and instead we have to emit a single clear of all tile buffers. */ if (use_global_zs_clear || use_global_rt_clear) { +#if V3D_VERSION == 42 cl_emit(cl, CLEAR_TILE_BUFFERS, clear) { clear.clear_z_stencil_buffer = use_global_zs_clear; clear.clear_all_render_targets = use_global_rt_clear; } +#endif +#if V3D_VERSION >= 71 + cl_emit(cl, CLEAR_RENDER_TARGETS, clear); +#endif } } @@ -778,6 +816,103 @@ set_rcl_early_z_config(struct v3dv_job *job, } } +/* Note that for v71, render target cfg packets have just one field that + * combines the internal type and clamp mode. For simplicity we keep just one + * helper. + * + * Note: rt_type is in fact an "enum V3DX(Internal_Type)". + * + * FIXME: for v71 we are not returning all the possible combinations for + * render target internal type and clamp. For example for int types we are + * always using clamp int, and for 16f we are using clamp none or pos (which + * seems to be the equivalent of no-clamp on 4.2), but not pq or hlg.
In summary, + right now we are just porting what we were doing on 4.2 + */ +uint32_t +v3dX(clamp_for_format_and_type)(uint32_t rt_type, + VkFormat vk_format) +{ +#if V3D_VERSION == 42 + if (vk_format_is_int(vk_format)) + return V3D_RENDER_TARGET_CLAMP_INT; + else if (vk_format_is_srgb(vk_format)) + return V3D_RENDER_TARGET_CLAMP_NORM; + else + return V3D_RENDER_TARGET_CLAMP_NONE; +#endif +#if V3D_VERSION >= 71 + switch (rt_type) { + case V3D_INTERNAL_TYPE_8I: + return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED; + case V3D_INTERNAL_TYPE_8UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED; + case V3D_INTERNAL_TYPE_8: + return V3D_RENDER_TARGET_TYPE_CLAMP_8; + case V3D_INTERNAL_TYPE_16I: + return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED; + case V3D_INTERNAL_TYPE_16UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED; + case V3D_INTERNAL_TYPE_16F: + return vk_format_is_srgb(vk_format) ? + V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM : + V3D_RENDER_TARGET_TYPE_CLAMP_16F; + case V3D_INTERNAL_TYPE_32I: + return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED; + case V3D_INTERNAL_TYPE_32UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED; + case V3D_INTERNAL_TYPE_32F: + return V3D_RENDER_TARGET_TYPE_CLAMP_32F; + default: + unreachable("Unknown internal render target type"); + } + + return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID; +#endif +} + +static void +cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer, + int rt, + uint32_t *rt_bpp, +#if V3D_VERSION == 42 + uint32_t *rt_type, + uint32_t *rt_clamp) +#else + uint32_t *rt_type_clamp) +#endif +{ + const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + + assert(state->subpass_idx < state->pass->subpass_count); + const struct v3dv_subpass *subpass = + &state->pass->subpasses[state->subpass_idx]; + + if (rt >= subpass->color_count) + return; + + struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; + const uint32_t attachment_idx = attachment->attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) + return; + + assert(attachment_idx < state->framebuffer->attachment_count && + attachment_idx < state->attachment_alloc_count); + struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; + assert(vk_format_is_color(iview->vk.format)); + + assert(iview->plane_count == 1); + *rt_bpp = iview->planes[0].internal_bpp; +#if V3D_VERSION == 42 + *rt_type = iview->planes[0].internal_type; + *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, + iview->vk.format); +#endif +#if V3D_VERSION >= 71 + *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, + iview->vk.format); +#endif +} + void v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) { @@ -824,7 +959,19 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) config.number_of_render_targets = MAX2(subpass->color_count, 1); config.multisample_mode_4x = tiling->msaa; config.double_buffer_in_non_ms_mode = tiling->double_buffer; +#if V3D_VERSION == 42 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = log2_tile_size(tiling->tile_width); + config.log2_tile_height = log2_tile_size(tiling->tile_height); + /* FIXME: ideally we would like next assert on the packet header (as is + * general, so also applies to GL). We would need to expand + * gen_pack_header for that.
+ */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); +#endif if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { const struct v3dv_image_view *iview = @@ -851,6 +998,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) * Early-Z/S clearing is independent of Early Z/S testing, so it is * possible to enable one but not the other so long as their * respective requirements are met. + * + * From V3D 4.5.6, Z/S buffers are always cleared automatically + * between tiles, but we still want to enable early ZS clears + * when Z/S are not loaded or stored. */ struct v3dv_render_pass_attachment *ds_attachment = &pass->attachments[ds_attachment_idx]; @@ -858,21 +1009,33 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) const VkImageAspectFlags ds_aspects = vk_format_aspects(ds_attachment->desc.format); - bool needs_depth_clear = - check_needs_clear(state, - ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_attachment->first_subpass, - ds_attachment->desc.loadOp, - subpass->do_depth_clear_with_draw); - bool needs_depth_store = v3dv_cmd_buffer_check_needs_store(state, ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, ds_attachment->last_subpass, ds_attachment->desc.storeOp) || subpass->resolve_depth; +#if V3D_VERSION <= 42 + bool needs_depth_clear = + check_needs_clear(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.loadOp, + subpass->do_depth_clear_with_draw); do_early_zs_clear = needs_depth_clear && !needs_depth_store; +#endif +#if V3D_VERSION >= 71 + bool needs_depth_load = + v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.loadOp, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp); + do_early_zs_clear = !needs_depth_load && !needs_depth_store; +#endif + if (do_early_zs_clear && vk_format_has_stencil(ds_attachment->desc.format)) { bool needs_stencil_load = @@ -905,10 +1068,20 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) */ job->early_zs_clear = do_early_zs_clear; +#if V3D_VERSION >= 71 + uint32_t base_addr = 0; +#endif for (uint32_t i = 0; i < subpass->color_count; i++) { uint32_t attachment_idx = subpass->color_attachments[i].attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) + if (attachment_idx == VK_ATTACHMENT_UNUSED) { +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.render_target_number = i; + rt.stride = 1; /* Unused */ + } +#endif continue; + } struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; @@ -920,10 +1093,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) const struct v3d_resource_slice *slice = &image->planes[plane].slices[iview->vk.base_mip_level]; - const uint32_t *clear_color = + UNUSED const uint32_t *clear_color = &state->attachments[attachment_idx].clear_value.color[0]; - uint32_t clear_pad = 0; + UNUSED uint32_t clear_pad = 0; if (slice->tiling == V3D_TILING_UIF_NO_XOR || slice->tiling == V3D_TILING_UIF_XOR) { int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2; @@ -937,6 +1110,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) } } +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { clear.clear_color_low_32_bits = clear_color[0]; clear.clear_color_next_24_bits = clear_color[1] & 0xffffff; @@ -960,22 
+1134,74 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) clear.render_target_number = i; }; } +#endif + +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.clear_color_low_bits = clear_color[0]; + cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp, + &rt.internal_type_and_clamping); + rt.stride = + v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width, + v3d_internal_bpp_words(rt.internal_bpp)); + rt.base_address = base_addr; + rt.render_target_number = i; + + /* base_addr in multiples of 512 bits. We divide by 8 because stride + * is in 128-bit units, but it is packing 2 rows worth of data, so we + * need to divide it by 2 so it is only 1 row, and then again by 4 so + * it is in 512-bit units. + */ + base_addr += (tiling->tile_height * rt.stride) / 8; + } + + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { + rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ + ((uint64_t) clear_color[1]) | + (((uint64_t) (clear_color[2] & 0xff)) << 32); + rt.render_target_number = i; + } + } + + if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { + rt.clear_color_top_bits = /* 56 bits (24 + 32) */ + (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) | + (((uint64_t) (clear_color[3])) << 24); + rt.render_target_number = i; + } + } +#endif } +#if V3D_VERSION >= 71 + /* If we don't have any color RTs, we still need to emit one and flag + * it as not used using stride = 1. + */ + if (subpass->color_count == 0) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.stride = 1; + } + } +#endif + +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { - v3dX(cmd_buffer_render_pass_setup_render_target) + cmd_buffer_render_pass_setup_render_target (cmd_buffer, 0, &rt.render_target_0_internal_bpp, &rt.render_target_0_internal_type, &rt.render_target_0_clamp); - v3dX(cmd_buffer_render_pass_setup_render_target) + cmd_buffer_render_pass_setup_render_target (cmd_buffer, 1, &rt.render_target_1_internal_bpp, &rt.render_target_1_internal_type, &rt.render_target_1_clamp); - v3dX(cmd_buffer_render_pass_setup_render_target) + cmd_buffer_render_pass_setup_render_target (cmd_buffer, 2, &rt.render_target_2_internal_bpp, &rt.render_target_2_internal_type, &rt.render_target_2_clamp); - v3dX(cmd_buffer_render_pass_setup_render_target) + cmd_buffer_render_pass_setup_render_target (cmd_buffer, 3, &rt.render_target_3_internal_bpp, &rt.render_target_3_internal_type, &rt.render_target_3_clamp); } +#endif /* Ends rendering mode config. 
*/ if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { @@ -1036,10 +1262,15 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) } if (cmd_buffer->state.tile_aligned_render_area && (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { +#if V3D_VERSION == 42 cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { clear.clear_z_stencil_buffer = !job->early_zs_clear; clear.clear_all_render_targets = true; } +#endif +#if V3D_VERSION >= 71 + cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt); +#endif } cl_emit(rcl, END_OF_TILE_MARKER, end); } @@ -1054,6 +1285,43 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) cl_emit(rcl, END_OF_RENDERING, end); } +void +v3dX(viewport_compute_xform)(const VkViewport *viewport, + float scale[3], + float translate[3]) +{ + float x = viewport->x; + float y = viewport->y; + float half_width = 0.5f * viewport->width; + float half_height = 0.5f * viewport->height; + double n = viewport->minDepth; + double f = viewport->maxDepth; + + scale[0] = half_width; + translate[0] = half_width + x; + scale[1] = half_height; + translate[1] = half_height + y; + + scale[2] = (f - n); + translate[2] = n; + + /* It seems that if the scale is small enough the hardware won't clip + * correctly, so we work around this by choosing the smallest scale that + * seems to work. + * + * This case is exercised by CTS: + * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero + * + * V3D 7.x fixes this by using the new + * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND. + */ +#if V3D_VERSION <= 42 + const float min_abs_scale = 0.0005f; + if (fabs(scale[2]) < min_abs_scale) + scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale; +#endif +} + void v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) { @@ -1078,19 +1346,45 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer) v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size); v3dv_return_if_oom(cmd_buffer, NULL); +#if V3D_VERSION == 42 cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f; clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f; } +#endif +#if V3D_VERSION >= 71 + cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { + clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f; + clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f; + } +#endif float translate_z, scale_z; v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0, &translate_z, &scale_z); +#if V3D_VERSION == 42 cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { clip.viewport_z_offset_zc_to_zs = translate_z; clip.viewport_z_scale_zc_to_zs = scale_z; } +#endif + +#if V3D_VERSION >= 71 + /* If the Z scale is too small, guardband clipping may not clip correctly */ + if (fabsf(scale_z) < 0.01f) { + cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) { + clip.viewport_z_offset_zc_to_zs = translate_z; + clip.viewport_z_scale_zc_to_zs = scale_z; + } + } else { + cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { + clip.viewport_z_offset_zc_to_zs = translate_z; + clip.viewport_z_scale_zc_to_zs = scale_z; + } + } +#endif + cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) { /* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled, * we are using OpenGL's [-1, 1] instead.
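The CLIPPER_XY_SCALING change above (1/256th of a pixel on V3D 4.2 vs 1/64th on V3D 7.x) has to stay in sync with the QUNIFORM_VIEWPORT_X/Y_SCALE uniforms, which this patch switches from a hardcoded 256.0f to V3DV_X(device, CLIPPER_XY_GRANULARITY) in v3dv_uniforms.c. A minimal sketch of that relationship follows; it is not part of the patch, the helper name is hypothetical, and the literal values are assumed to be what the per-version granularity macro resolves to.

/* Sketch only: the viewport half-extent scale written to the clipper packet
 * and to the viewport scale uniforms must use the same sub-pixel granularity.
 */
static float
clipper_xy_scale_sketch(const struct v3dv_device *device, float vp_half_extent)
{
   const float granularity = device->devinfo.ver >= 71 ? 64.0f : 256.0f; /* assumed */
   return vp_half_extent * granularity;
}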
@@ -1205,14 +1499,48 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer) cl_emit(&job->bcl, DEPTH_OFFSET, bias) { bias.depth_offset_factor = dynamic->depth_bias.slope_factor; bias.depth_offset_units = dynamic->depth_bias.constant_factor; +#if V3D_VERSION <= 42 if (pipeline->depth_bias.is_z16) bias.depth_offset_units *= 256.0f; +#endif bias.limit = dynamic->depth_bias.depth_bias_clamp; } cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS; } +void +v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer) +{ + /* No depthBounds support for v42, so this method is empty in that case. + * + * Note that this method is being called because v3dv_job_init flags all state as + * dirty. See FIXME note at v3dv_job_init. + */ + +#if V3D_VERSION >= 71 + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + assert(pipeline); + + if (!pipeline->depth_bounds_test_enabled) + return; + + struct v3dv_job *job = cmd_buffer->state.job; + assert(job); + + v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS)); + v3dv_return_if_oom(cmd_buffer, NULL); + + struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; + cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) { + bounds.lower_test_limit = dynamic->depth_bounds.min; + bounds.upper_test_limit = dynamic->depth_bounds.max; + } + + cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BOUNDS; +#endif +} + void v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer) { @@ -1256,10 +1584,13 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; assert(pipeline); + const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo; + const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver); + const uint32_t blend_packets_size = cl_packet_length(BLEND_ENABLES) + cl_packet_length(BLEND_CONSTANT_COLOR) + - cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS; + cl_packet_length(BLEND_CFG) * max_color_rts; v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size); v3dv_return_if_oom(cmd_buffer, NULL); @@ -1271,7 +1602,7 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer) } } - for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) { + for (uint32_t i = 0; i < max_color_rts; i++) { if (pipeline->blend.enables & (1 << i)) cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]); } @@ -1298,9 +1629,15 @@ v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic; + uint32_t color_write_mask = ~dynamic->color_write_enable | + pipeline->blend.color_write_masks; +#if V3D_VERSION <= 42 + /* Only 4 RTs */ + color_write_mask &= 0xffff; +#endif + cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) { - mask.mask = (~dynamic->color_write_enable | - pipeline->blend.color_write_masks) & 0xffff; + mask.mask = color_write_mask; } cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE; @@ -1591,15 +1928,16 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; assert(pipeline); - bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer); - v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS)); v3dv_return_if_oom(cmd_buffer, NULL); cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) { +#if V3D_VERSION == 42 + bool enable_ez
= job_update_ez_state(job, pipeline, cmd_buffer); config.early_z_enable = enable_ez; config.early_z_updates_enable = config.early_z_enable && pipeline->z_updates_enable; +#endif } } @@ -1845,7 +2183,9 @@ emit_gs_shader_state_record(struct v3dv_job *job, gs_bin->prog_data.gs->base.threads == 4; shader.geometry_bin_mode_shader_start_in_final_thread_section = gs_bin->prog_data.gs->base.single_seg; +#if V3D_VERSION <= 42 shader.geometry_bin_mode_shader_propagate_nans = true; +#endif shader.geometry_bin_mode_shader_uniforms_address = gs_bin_uniforms; @@ -1855,7 +2195,9 @@ emit_gs_shader_state_record(struct v3dv_job *job, gs->prog_data.gs->base.threads == 4; shader.geometry_render_mode_shader_start_in_final_thread_section = gs->prog_data.gs->base.single_seg; +#if V3D_VERSION <= 42 shader.geometry_render_mode_shader_propagate_nans = true; +#endif shader.geometry_render_mode_shader_uniforms_address = gs_render_uniforms; } @@ -2031,10 +2373,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) pipeline->vpm_cfg.Gv); } +#if V3D_VERSION == 42 struct v3dv_bo *default_attribute_values = pipeline->default_attribute_values != NULL ? pipeline->default_attribute_values : pipeline->device->default_attribute_float; +#endif cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD, pipeline->shader_state_record, shader) { @@ -2060,8 +2404,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs; shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs; +#if V3D_VERSION == 42 shader.address_of_default_attribute_values = v3dv_cl_address(default_attribute_values, 0); +#endif shader.any_shader_reads_hardware_written_primitive_id = (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid; @@ -2370,40 +2716,3 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer, buffer->mem_offset + offset); } } - -void -v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, - int rt, - uint32_t *rt_bpp, - uint32_t *rt_type, - uint32_t *rt_clamp) -{ - const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - - assert(state->subpass_idx < state->pass->subpass_count); - const struct v3dv_subpass *subpass = - &state->pass->subpasses[state->subpass_idx]; - - if (rt >= subpass->color_count) - return; - - struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; - const uint32_t attachment_idx = attachment->attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) - return; - - assert(attachment_idx < state->framebuffer->attachment_count && - attachment_idx < state->attachment_alloc_count); - struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; - assert(vk_format_is_color(iview->vk.format)); - - assert(iview->plane_count == 1); - *rt_bpp = iview->planes[0].internal_bpp; - *rt_type = iview->planes[0].internal_type; - if (vk_format_is_int(iview->vk.view_format)) - *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT; - else if (vk_format_is_srgb(iview->vk.view_format)) - *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM; - else - *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE; -} diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c index e235983864c..1b50d51e19f 100644 --- a/src/broadcom/vulkan/v3dvx_device.c +++ b/src/broadcom/vulkan/v3dvx_device.c @@ -49,8 +49,8 @@ vk_to_v3d_compare_func[] = { [VK_COMPARE_OP_ALWAYS] = V3D_COMPARE_FUNC_ALWAYS, }; - static union pipe_color_union 
encode_border_color( + const struct v3dv_device *device, const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) { const struct util_format_description *desc = @@ -77,12 +77,28 @@ static union pipe_color_union encode_border_color( * colors so we need to fix up the swizzle manually for this case. */ uint8_t swizzle[4]; - if (v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) && + const bool v3d_has_reverse_swap_rb_bits = + v3dv_texture_shader_state_has_rb_swap_reverse_bits(device); + if (!v3d_has_reverse_swap_rb_bits && + v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) && v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle)) { swizzle[0] = PIPE_SWIZZLE_W; swizzle[1] = PIPE_SWIZZLE_X; swizzle[2] = PIPE_SWIZZLE_Y; swizzle[3] = PIPE_SWIZZLE_Z; + } + /* In v3d 7.x we no longer have a reverse flag for the border color. Instead + * we have to use the new reverse and swap_r/b flags in the texture shader + * state which will apply the format swizzle automatically when sampling + * the border color too and we should not apply it manually here. + */ + else if (v3d_has_reverse_swap_rb_bits && + (v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle) || + v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle))) { + swizzle[0] = PIPE_SWIZZLE_X; + swizzle[1] = PIPE_SWIZZLE_Y; + swizzle[2] = PIPE_SWIZZLE_Z; + swizzle[3] = PIPE_SWIZZLE_W; } else { memcpy(swizzle, format->planes[0].swizzle, sizeof (swizzle)); } @@ -118,7 +134,11 @@ static union pipe_color_union encode_border_color( (1 << (desc->channel[i].size - 1)) - 1); } - /* convert from float to expected format */ +#if V3D_VERSION <= 42 + /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions + * for us. In V3D 4.x we need to manually convert floating point color + * values to the expected format. 
+ */ if (vk_format_is_srgb(bc_info->format) || vk_format_is_compressed(bc_info->format)) { for (int i = 0; i < 4; i++) @@ -170,12 +190,14 @@ static union pipe_color_union encode_border_color( } } } +#endif return border; } void -v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, +v3dX(pack_sampler_state)(const struct v3dv_device *device, + struct v3dv_sampler *sampler, const VkSamplerCreateInfo *pCreateInfo, const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) { @@ -217,7 +239,7 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, s.border_color_mode = border_color_mode; if (s.border_color_mode == V3D_BORDER_COLOR_FOLLOWS) { - union pipe_color_union border = encode_border_color(bc_info); + union pipe_color_union border = encode_border_color(device, bc_info); s.border_color_word_0 = border.ui[0]; s.border_color_word_1 = border.ui[1]; @@ -253,11 +275,13 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( const struct v3dv_framebuffer *framebuffer, const struct v3dv_cmd_buffer_attachment_state *attachments, const struct v3dv_subpass *subpass, - uint8_t *max_bpp, + uint8_t *max_internal_bpp, + uint8_t *total_color_bpp, bool *msaa) { STATIC_ASSERT(V3D_INTERNAL_BPP_32 == 0); - *max_bpp = V3D_INTERNAL_BPP_32; + *max_internal_bpp = V3D_INTERNAL_BPP_32; + *total_color_bpp = 0; *msaa = false; if (subpass) { @@ -270,8 +294,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( assert(att); assert(att->plane_count == 1); - if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) - *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp); + if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { + const uint32_t internal_bpp = att->planes[0].internal_bpp; + *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); + *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); + } if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; @@ -285,7 +312,6 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; } - return; } @@ -295,8 +321,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( assert(att); assert(att->plane_count == 1); - if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) - *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp); + if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { + const uint32_t internal_bpp = att->planes[0].internal_bpp; + *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); + *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); + } if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c index 80a3e5bfde8..de984e81220 100644 --- a/src/broadcom/vulkan/v3dvx_image.c +++ b/src/broadcom/vulkan/v3dvx_image.c @@ -76,8 +76,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device, tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]); tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]); - tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse; - tex.texture_type = image_view->format->planes[plane].tex_type; if (image->vk.image_type == VK_IMAGE_TYPE_3D) { @@ -110,8 +108,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device, tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64; - tex.srgb = vk_format_is_srgb(image_view->vk.view_format); - /* At this point we don't have the job. That's the reason the first * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to * add the bo to the job. 
This also means that we need to add manually @@ -122,6 +118,51 @@ pack_texture_shader_state_helper(struct v3dv_device *device, v3dv_layer_offset(image, 0, image_view->vk.base_array_layer, iplane); tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); + + bool is_srgb = vk_format_is_srgb(image_view->vk.format); + + /* V3D 4.x doesn't have the reverse and swap_r/b bits, so we compose + * the reverse and/or swap_r/b swizzle from the format table with the + * image view swizzle. This, however, doesn't work for border colors, + * for those there is the reverse_standard_border_color bit. + * + * In v3d 7.x, however, there is no reverse_standard_border_color bit, + * since the reverse and swap_r/b bits also affect border colors. It is + * because of this that we absolutely need to use these bits with + * reversed and swapped formats, since that's the only way to ensure + * correct border colors. In that case we don't want to program the + * swizzle to the composition of the format swizzle and the view + * swizzle like we do in v3d 4.x, since the format swizzle is applied + * via the reverse and swap_r/b bits. + */ +#if V3D_VERSION == 42 + tex.srgb = is_srgb; + tex.reverse_standard_border_color = + image_view->planes[plane].channel_reverse; +#endif +#if V3D_VERSION >= 71 + tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; + + tex.reverse = image_view->planes[plane].channel_reverse; + tex.r_b_swap = image_view->planes[plane].swap_rb; + + if (tex.reverse || tex.r_b_swap) { + tex.swizzle_r = + v3d_translate_pipe_swizzle(image_view->view_swizzle[0]); + tex.swizzle_g = + v3d_translate_pipe_swizzle(image_view->view_swizzle[1]); + tex.swizzle_b = + v3d_translate_pipe_swizzle(image_view->view_swizzle[2]); + tex.swizzle_a = + v3d_translate_pipe_swizzle(image_view->view_swizzle[3]); + } + + tex.chroma_offset_x = 1; + tex.chroma_offset_y = 1; + /* See comment in XML field definition for rationale of the shifts */ + tex.texture_base_pointer_cb = base_offset >> 6; + tex.texture_base_pointer_cr = base_offset >> 6; +#endif } } } @@ -166,7 +207,14 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, assert(buffer_view->format->plane_count == 1); tex.texture_type = buffer_view->format->planes[0].tex_type; - tex.srgb = vk_format_is_srgb(buffer_view->vk_format); + + bool is_srgb = vk_format_is_srgb(buffer_view->vk_format); +#if V3D_VERSION == 42 + tex.srgb = is_srgb; +#endif +#if V3D_VERSION >= 71 + tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; +#endif /* At this point we don't have the job.
That's the reason the first * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to @@ -179,5 +227,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device, buffer_view->offset; tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); + +#if V3D_VERSION >= 71 + tex.chroma_offset_x = 1; + tex.chroma_offset_y = 1; + /* See comment in XML field definition for rationale of the shifts */ + tex.texture_base_pointer_cb = base_offset >> 6; + tex.texture_base_pointer_cr = base_offset >> 6; +#endif } } diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c index 04147b82cbd..858096f9e4b 100644 --- a/src/broadcom/vulkan/v3dvx_meta_common.c +++ b/src/broadcom/vulkan/v3dvx_meta_common.c @@ -26,6 +26,7 @@ #include "broadcom/common/v3d_macros.h" #include "broadcom/common/v3d_tfu.h" +#include "broadcom/common/v3d_util.h" #include "broadcom/cle/v3dx_pack.h" #include "broadcom/compiler/v3d_compiler.h" @@ -58,12 +59,25 @@ emit_rcl_prologue(struct v3dv_job *job, config.number_of_render_targets = 1; config.multisample_mode_4x = tiling->msaa; config.double_buffer_in_non_ms_mode = tiling->double_buffer; +#if V3D_VERSION == 42 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = log2_tile_size(tiling->tile_width); + config.log2_tile_height = log2_tile_size(tiling->tile_height); + /* FIXME: ideally we would like this assert on the packet header (it is + * generic, so it also applies to GL). We would need to expand + * gen_pack_header for that. + */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); +#endif config.internal_depth_type = fb->internal_depth_type; } + const uint32_t *color = NULL; if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) { - uint32_t clear_pad = 0; + UNUSED uint32_t clear_pad = 0; if (clear_info->image) { const struct v3dv_image *image = clear_info->image; @@ -88,7 +102,9 @@ emit_rcl_prologue(struct v3dv_job *job, } } - const uint32_t *color = &clear_info->clear_value->color[0]; + color = &clear_info->clear_value->color[0]; + +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { clear.clear_color_low_32_bits = color[0]; clear.clear_color_next_24_bits = color[1] & 0x00ffffff; @@ -112,13 +128,49 @@ emit_rcl_prologue(struct v3dv_job *job, clear.render_target_number = 0; }; } +#endif } +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { rt.render_target_0_internal_bpp = tiling->internal_bpp; rt.render_target_0_internal_type = fb->internal_type; rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; } +#endif + +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + if (color) + rt.clear_color_low_bits = color[0]; + rt.internal_bpp = tiling->internal_bpp; + rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type, + fb->vk_format); + rt.stride = + v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width, + v3d_internal_bpp_words(rt.internal_bpp)); + rt.base_address = 0; + rt.render_target_number = 0; + } + + if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { + rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ + ((uint64_t) color[1]) | + (((uint64_t) (color[2] & 0xff)) << 32); + rt.render_target_number = 0; + } + } + + if (color && tiling->internal_bpp >=
V3D_INTERNAL_BPP_128) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { + rt.clear_color_top_bits = /* 56 bits (24 + 32) */ + (((uint64_t) (color[2] & 0xffffff00)) >> 8) | + (((uint64_t) (color[3])) << 24); + rt.render_target_number = 0; + } + } +#endif cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f; @@ -179,10 +231,15 @@ emit_frame_setup(struct v3dv_job *job, */ if (clear_value && (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { +#if V3D_VERSION == 42 cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { clear.clear_z_stencil_buffer = true; clear.clear_all_render_targets = true; } +#endif +#if V3D_VERSION >= 71 + cl_emit(rcl, CLEAR_RENDER_TARGETS, clear); +#endif } cl_emit(rcl, END_OF_TILE_MARKER, end); } @@ -893,6 +950,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, tfu.iia |= src_offset; +#if V3D_VERSION <= 42 if (src_tiling == V3D_TILING_RASTER) { tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT; } else { @@ -901,12 +959,46 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, V3D33_TFU_ICFG_FORMAT_SHIFT; } tfu.icfg |= format_plane->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT; +#endif +#if V3D_VERSION >= 71 + if (src_tiling == V3D_TILING_RASTER) { + tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT; + } else { + tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE + + (src_tiling - V3D_TILING_LINEARTILE)) << + V3D71_TFU_ICFG_IFORMAT_SHIFT; + } + tfu.icfg |= format_plane->tex_type << V3D71_TFU_ICFG_OTYPE_SHIFT; +#endif tfu.ioa = dst_offset; +#if V3D_VERSION <= 42 tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE + (dst_tiling - V3D_TILING_LINEARTILE)) << V3D33_TFU_IOA_FORMAT_SHIFT; +#endif + +#if V3D_VERSION >= 71 + tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE + + (dst_tiling - V3D_TILING_LINEARTILE)) << + V3D71_TFU_IOC_FORMAT_SHIFT; + + switch (dst_tiling) { + case V3D_TILING_UIF_NO_XOR: + case V3D_TILING_UIF_XOR: + tfu.v71.ioc |= + (dst_padded_height_or_stride / (2 * v3d_utile_height(dst_cpp))) << + V3D71_TFU_IOC_STRIDE_SHIFT; + break; + case V3D_TILING_RASTER: + tfu.v71.ioc |= (dst_padded_height_or_stride / dst_cpp) << + V3D71_TFU_IOC_STRIDE_SHIFT; + break; + default: + break; + } +#endif switch (src_tiling) { case V3D_TILING_UIF_NO_XOR: @@ -923,6 +1015,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, /* The TFU can handle raster sources but always produces UIF results */ assert(dst_tiling != V3D_TILING_RASTER); +#if V3D_VERSION <= 42 /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the * OPAD field for the destination (how many extra UIF blocks beyond * those necessary to cover the height). 
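The TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1/2/3 packets above split a V3D 7.x clear color across 32 + 40 + 56 bits, with PART2 emitted only for 64 bpp and wider internal formats and PART3 only for 128 bpp. A minimal sketch of that bit split, under the packing shown in the hunk above; the struct and function names here are made up for illustration and are not part of the patch:

#include <stdint.h>

struct v3d71_clear_color_split {
   uint32_t low_bits;  /* PART1: always emitted */
   uint64_t mid_bits;  /* PART2: 40 bits, only for internal bpp >= 64 */
   uint64_t top_bits;  /* PART3: 56 bits, only for internal bpp >= 128 */
};

static struct v3d71_clear_color_split
split_clear_color(const uint32_t color[4])
{
   struct v3d71_clear_color_split s;
   s.low_bits = color[0];
   /* 40 bits = color[1] (32) plus the low byte of color[2] (8) */
   s.mid_bits = (uint64_t)color[1] | ((uint64_t)(color[2] & 0xff) << 32);
   /* 56 bits = the high 24 bits of color[2] plus color[3] (32) */
   s.top_bits = ((uint64_t)(color[2] & 0xffffff00) >> 8) |
                ((uint64_t)color[3] << 24);
   return s;
}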
@@ -934,6 +1027,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, uif_block_h; tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT; } +#endif v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu); } @@ -1314,8 +1408,9 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer, uint32_t width, height; framebuffer_size_for_pixel_count(num_items, &width, &height); - v3dv_job_start_frame(job, width, height, 1, true, true, - 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, 1, true, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type, @@ -1361,8 +1456,9 @@ v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer, uint32_t width, height; framebuffer_size_for_pixel_count(num_items, &width, &height); - v3dv_job_start_frame(job, width, height, 1, true, true, - 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, 1, true, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT, diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c index 5d32d414ed8..ad22add155d 100644 --- a/src/broadcom/vulkan/v3dvx_pipeline.c +++ b/src/broadcom/vulkan/v3dvx_pipeline.c @@ -227,6 +227,45 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline, ds_info ? ds_info->stencilTestEnable && has_ds_attachment: false; pipeline->z_updates_enable = config.z_updates_enable; + +#if V3D_VERSION >= 71 + /* From the Vulkan spec: + * + * "depthClampEnable controls whether to clamp the fragment’s depth + * values as described in Depth Test. If the pipeline is not created + * with VkPipelineRasterizationDepthClipStateCreateInfoEXT present + * then enabling depth clamp will also disable clipping primitives to + * the z planes of the frustum as described in Primitive Clipping. + * Otherwise depth clipping is controlled by the state set in + * VkPipelineRasterizationDepthClipStateCreateInfoEXT." + * + * Note: neither depth clamping nor VK_EXT_depth_clip_enable are actually + * supported in the driver yet, so in practice we are always enabling Z + * clipping for now. + */ + bool z_clamp_enable = rs_info && rs_info->depthClampEnable; + bool z_clip_enable = false; + const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info = + ds_info ? vk_find_struct_const(ds_info->pNext, + PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT) : + NULL; + if (clip_info) + z_clip_enable = clip_info->depthClipEnable; + else if (!z_clamp_enable) + z_clip_enable = true; + + if (z_clip_enable) { + config.z_clipping_mode = pipeline->negative_one_to_one ?
+ V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_ZERO_TO_ONE; + } else { + config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE; + } + + config.z_clamp_mode = z_clamp_enable; + + config.depth_bounds_test_enable = + ds_info && ds_info->depthBoundsTestEnable && has_ds_attachment; +#endif }; } @@ -360,7 +399,7 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline, static void pack_shader_state_record(struct v3dv_pipeline *pipeline) { - assert(sizeof(pipeline->shader_state_record) == + assert(sizeof(pipeline->shader_state_record) >= cl_packet_length(GL_SHADER_STATE_RECORD)); struct v3d_fs_prog_data *prog_data_fs = @@ -435,15 +474,16 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) shader.number_of_varyings_in_fragment_shader = prog_data_fs->num_inputs; - shader.coordinate_shader_propagate_nans = true; - shader.vertex_shader_propagate_nans = true; - shader.fragment_shader_propagate_nans = true; - /* Note: see previous note about addresses */ /* shader.coordinate_shader_code_address */ /* shader.vertex_shader_code_address */ /* shader.fragment_shader_code_address */ +#if V3D_VERSION == 42 + shader.coordinate_shader_propagate_nans = true; + shader.vertex_shader_propagate_nans = true; + shader.fragment_shader_propagate_nans = true; + /* FIXME: Use combined input/output size flag in the common case (also * on v3d, see v3dx_draw). */ @@ -451,13 +491,25 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) prog_data_vs_bin->separate_segments; shader.vertex_shader_has_separate_input_and_output_vpm_blocks = prog_data_vs->separate_segments; - shader.coordinate_shader_input_vpm_segment_size = prog_data_vs_bin->separate_segments ? prog_data_vs_bin->vpm_input_size : 1; shader.vertex_shader_input_vpm_segment_size = prog_data_vs->separate_segments ? prog_data_vs->vpm_input_size : 1; +#endif + + /* On V3D 7.1 there isn't a specific flag to set if we are using + * shared/separate segments or not. We just set the value of + * vpm_input_size to 0, and set output to the max needed. That should be + * already properly set on prog_data_vs_bin + */ +#if V3D_VERSION == 71 + shader.coordinate_shader_input_vpm_segment_size = + prog_data_vs_bin->vpm_input_size; + shader.vertex_shader_input_vpm_segment_size = + prog_data_vs->vpm_input_size; +#endif shader.coordinate_shader_output_vpm_segment_size = prog_data_vs_bin->vpm_output_size; @@ -659,3 +711,76 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, } } } + +#if V3D_VERSION == 42 +static bool +pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) +{ + for (uint8_t i = 0; i < pipeline->va_count; i++) { + if (vk_format_is_int(pipeline->va[i].vk_format)) + return true; + } + return false; +} +#endif + +bool +v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline) +{ +#if V3D_VERSION == 42 + return pipeline_has_integer_vertex_attrib(pipeline); +#endif + + return false; +} + +/* @pipeline can be NULL. In that case we assume the most common case. For + * example, for v42 we assume in that case that all the attributes have a + * float format (we only create an all-float BO once and we reuse it with all + * float pipelines), otherwise we look at the actual type of each attribute + * used with the specific pipeline passed in. 
+ */ +struct v3dv_bo * +v3dX(create_default_attribute_values)(struct v3dv_device *device, + struct v3dv_pipeline *pipeline) +{ +#if V3D_VERSION >= 71 + return NULL; +#endif + + uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; + struct v3dv_bo *bo; + + bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true); + + if (!bo) { + fprintf(stderr, "failed to allocate memory for the default " + "attribute values\n"); + return NULL; + } + + bool ok = v3dv_bo_map(device, bo, size); + if (!ok) { + fprintf(stderr, "failed to map default attribute values buffer\n"); + return NULL; + } + + uint32_t *attrs = bo->map; + uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0; + for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { + attrs[i * 4 + 0] = 0; + attrs[i * 4 + 1] = 0; + attrs[i * 4 + 2] = 0; + VkFormat attr_format = + pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED; + if (i < va_count && vk_format_is_int(attr_format)) { + attrs[i * 4 + 3] = 1; + } else { + attrs[i * 4 + 3] = fui(1.0); + } + } + + v3dv_bo_unmap(device, bo); + + return bo; +} diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h index ad8ddfa5731..0f5887eab93 100644 --- a/src/broadcom/vulkan/v3dvx_private.h +++ b/src/broadcom/vulkan/v3dvx_private.h @@ -54,6 +54,9 @@ v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer); void v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer); +void +v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer); + void v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer); @@ -125,17 +128,11 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color, uint32_t internal_size, uint32_t *hw_color); -void -v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, - int rt, - uint32_t *rt_bpp, - uint32_t *rt_type, - uint32_t *rt_clamp); - /* Used at v3dv_device */ void -v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, +v3dX(pack_sampler_state)(const struct v3dv_device *device, + struct v3dv_sampler *sampler, const VkSamplerCreateInfo *pCreateInfo, const VkSamplerCustomBorderColorCreateInfoEXT *bc_info); @@ -143,7 +140,9 @@ void v3dX(framebuffer_compute_internal_bpp_msaa)(const struct v3dv_framebuffer *framebuffer, const struct v3dv_cmd_buffer_attachment_state *attachments, const struct v3dv_subpass *subpass, - uint8_t *max_bpp, bool *msaa); + uint8_t *max_internal_bpp, + uint8_t *total_color_bpp, + bool *msaa); #ifdef DEBUG void @@ -313,10 +312,24 @@ void v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, const VkPipelineVertexInputStateCreateInfo *vi_info, const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info); + +bool +v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline); + +struct v3dv_bo * +v3dX(create_default_attribute_values)(struct v3dv_device *device, + struct v3dv_pipeline *pipeline); + /* Used at v3dv_queue */ void v3dX(job_emit_noop)(struct v3dv_job *job); +/* Used at v3dv_query */ +VkResult +v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount, + VkPerformanceCounterKHR *pCounters, + VkPerformanceCounterDescriptionKHR *pCounterDescriptions); + /* Used at v3dv_descriptor_set, and other descriptor set utils */ uint32_t v3dX(descriptor_bo_size)(VkDescriptorType type); @@ -325,3 +338,21 @@ uint32_t v3dX(max_descriptor_bo_size)(void); uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane); uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane); + +/* 
General utils */ + +uint32_t +v3dX(clamp_for_format_and_type)(uint32_t rt_type, + VkFormat vk_format); + +#define V3D42_CLIPPER_XY_GRANULARITY 256.0f +#define V3D71_CLIPPER_XY_GRANULARITY 64.0f + +uint32_t +v3dX(clamp_for_format_and_type)(uint32_t rt_type, + VkFormat vk_format); + +void +v3dX(viewport_compute_xform)(const VkViewport *viewport, + float scale[3], + float translate[3]); diff --git a/src/broadcom/vulkan/v3dvx_query.c b/src/broadcom/vulkan/v3dvx_query.c new file mode 100644 index 00000000000..e59a1e84ff6 --- /dev/null +++ b/src/broadcom/vulkan/v3dvx_query.c @@ -0,0 +1,67 @@ +/* + * Copyright © 2023 Raspberry Pi Ltd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3dv_private.h" + +#include "common/v3d_performance_counters.h" + +VkResult +v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount, + VkPerformanceCounterKHR *pCounters, + VkPerformanceCounterDescriptionKHR *pCounterDescriptions) +{ + uint32_t desc_count = *pCounterCount; + + VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, + out, pCounters, pCounterCount); + VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, + out_desc, pCounterDescriptions, &desc_count); + + for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) { + vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { + counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR; + counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; + counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR; + + unsigned char sha1_result[20]; + _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME], + strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]), + sha1_result); + + memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); + } + + vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, + &out_desc, desc) { + desc->flags = 0; + snprintf(desc->name, sizeof(desc->name), "%s", + v3d_performance_counters[i][V3D_PERFCNT_NAME]); + snprintf(desc->category, sizeof(desc->category), "%s", + v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]); + snprintf(desc->description, sizeof(desc->description), "%s", + v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]); + } + } + + return vk_outarray_status(&out); +} diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c index efe63de425c..6eed2de9d54 100644 --- a/src/broadcom/vulkan/v3dvx_queue.c +++ b/src/broadcom/vulkan/v3dvx_queue.c @@ -29,7 +29,8 @@ void 
v3dX(job_emit_noop)(struct v3dv_job *job) { - v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, V3D_INTERNAL_BPP_32, false); + v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, + V3D_INTERNAL_BPP_32, 4, false); v3dX(job_emit_binning_flush)(job); struct v3dv_cl *rcl = &job->rcl; @@ -42,14 +43,29 @@ v3dX(job_emit_noop)(struct v3dv_job *job) config.image_height_pixels = 1; config.number_of_render_targets = 1; config.multisample_mode_4x = false; +#if V3D_VERSION == 42 config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = 3; /* Tile size 64 */ + config.log2_tile_height = 3; /* Tile size 64 */ +#endif } +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32; rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8; rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; } +#endif +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.internal_bpp = V3D_INTERNAL_BPP_32; + rt.internal_type_and_clamping = V3D_RENDER_TARGET_TYPE_CLAMP_8; + rt.stride = 1; /* Unused RT */ + } +#endif cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { clear.z_clear_value = 1.0f; diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py index e6383b67737..46395d79a89 100644 --- a/src/compiler/nir/nir_constant_expressions.py +++ b/src/compiler/nir/nir_constant_expressions.py @@ -62,6 +62,8 @@ template = """\ #include "util/softfloat.h" #include "util/bigmath.h" #include "util/format/format_utils.h" +#include "util/format_r11g11b10f.h" +#include "util/u_math.h" #include "nir_constant_expressions.h" /** @@ -277,6 +279,110 @@ unpack_half_1x16(uint16_t u) return _mesa_half_to_float(u); } +/* Broadcom v3d specific instructions */ +/** + * Packs three half floats (both halves of src0 plus the low half of src1) + * into a r11g11b10f + */ +static uint32_t v11fpack_v3d(const uint32_t src0, + const uint32_t src1) +{ + float rgb[3]; + + rgb[0] = unpack_half_1x16((src0 & 0xffff)); + rgb[1] = unpack_half_1x16((src0 >> 16)); + rgb[2] = unpack_half_1x16((src1 & 0xffff)); + + return float3_to_r11g11b10f(rgb); +} + +/** + * The three helpers below are basically wrappers over pack_s/unorm_1x8/1x16, + * except that they take an integer-encoded half or float value instead of a + * float + */ +static uint8_t _mesa_half_to_snorm8(uint16_t val) +{ + float x = _mesa_half_to_float(val); + + return pack_snorm_1x8(x); +} + +static uint16_t _mesa_float_to_snorm16(uint32_t val) +{ + union fi aux; + aux.ui = val; + return pack_snorm_1x16(aux.f); +} + +static uint16_t _mesa_float_to_unorm16(uint32_t val) +{ + union fi aux; + aux.ui = val; + return pack_unorm_1x16(aux.f); +} + +/* FIXME: the implementation of vftounorm10hi/lo below is somewhat + * verbose; there is likely a simpler way to implement + * it.
+ */ +static uint32_t float_pack16_v3d(uint32_t f32) +{ + float f = uif(f32); + return _mesa_float_to_half(f); +} + +static uint32_t float_unpack16_v3d(uint32_t f16) +{ + float f = _mesa_half_to_float(f16); + return fui(f); +} + +static uint32_t vfpack_v3d(uint32_t a, uint32_t b) +{ + return float_pack16_v3d(b) << 16 | float_pack16_v3d(a); +} + +static uint32_t vfsat_v3d(uint32_t a) +{ + return vfpack_v3d( + fui(SATURATE(_mesa_half_to_float(a & 0xffff))), + fui(SATURATE(_mesa_half_to_float(a >> 16)))); +} + +static uint32_t fmul_v3d(uint32_t a, uint32_t b) +{ + float f = uif(a); + float g = uif(b); + + float x = f * g; + + return fui(x); +} + +#define L(x) float_unpack16_v3d((x) & 0xffff) +#define H(x) float_unpack16_v3d((x) >> 16) +#define V(f,a,b) vfpack_v3d(f(L(a), L(b)), f(H(a), H(b))) + +static uint32_t vfmul_v3d(uint32_t a, uint32_t b) +{ + return V(fmul_v3d, a, b); +} + +/* Convert 2x16-bit floating point to 2x10-bit unorm */ +static uint32_t vftounorm10lo(uint32_t src0) +{ + return vfmul_v3d(vfsat_v3d(src0), 0x03ff03ff); +} + +/* + * Convert 2x16-bit floating point to one 2-bit and one + * 10-bit unorm + */ +static uint32_t vftounorm10hi(uint32_t src0) +{ + return vfmul_v3d(vfsat_v3d(src0), 0x000303ff); +} + + /* Some typed vector structures to make things like src0.y work */ typedef int8_t int1_t; typedef uint8_t uint1_t; diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index e4d87aa6126..63aa7cfa315 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -1393,6 +1393,50 @@ for (int i = 0; i < 32; i += 8) { } """) +# v3d-specific opcodes + +# v3d-specific (v71) instruction that packs parts of two 2x16 floating point +# sources into r11g11b10 bits, rounding to nearest even +binop_convert("v11fpack_v3d", tuint32, tuint32, "", + "v11fpack_v3d(src0, src1)") + +# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit integer. The +# difference from pack_32_2x16_split is that the sources are 32-bit too: it +# receives two 32-bit integers and packs their lower halfwords as 2x16 into a +# single 32-bit result.
+binop_horiz("vpack_v3d", 1, tuint32, 1, tuint32, 1, tuint32, + "(src0.x & 0xffff) | (src1.x << 16)") + +# v3d-specific (v71) instruction that packs parts of 2 2x16 integers into r10g10b10a2 +binop_convert("v10pack_v3d", tuint32, tuint32, "", + "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3ff) << 30") + +# v3d-specific (v71) instruction that packs 2 2x16 bit integers into 4x8 bits: +# dst[7:0] = src0[7:0] +# dst[15:8] = src0[23:16] +# dst[23:16] = src1[7:0] +# dst[31:24] = src1[23:16] +opcode("v8pack_v3d", 0, tuint32, [0, 0], [tuint32, tuint32], + False, "", + "(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | (src1 & 0x000000ff) << 16 | (src1 & 0x00ff0000) << 8") + +# v3d-specific (v71) instructions to convert 2x16 floating point to 2x8 bit unorm/snorm +unop("vftounorm8_v3d", tuint32, + "_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 8) << 16)") +unop("vftosnorm8_v3d", tuint32, + "_mesa_half_to_snorm(src0 & 0xffff, 8) | (_mesa_half_to_snorm(src0 >> 16, 8) << 16)") + +# v3d-specific (v71) instructions to convert 32-bit floating point to 16 bit unorm/snorm +unop("ftounorm16_v3d", tuint32, "_mesa_float_to_unorm16(src0)") +unop("ftosnorm16_v3d", tuint32, "_mesa_float_to_snorm16(src0)") + +# v3d-specific (v71) instructions to convert 2x16 bit floating points to 2x10 bit unorm +unop("vftounorm10lo_v3d", tuint32, "vftounorm10lo(src0)") + +# v3d-specific (v71) instructions to convert 2x16 bit floating points to one 2-bit +# and one 10 bit unorm +unop("vftounorm10hi_v3d", tuint32, "vftounorm10hi(src0)") + # Mali-specific opcodes unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)")) unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)")) diff --git a/src/gallium/drivers/v3d/driinfo_v3d.h b/src/gallium/drivers/v3d/driinfo_v3d.h index 147ad0b49bd..8f989e8aa57 100644 --- a/src/gallium/drivers/v3d/driinfo_v3d.h +++ b/src/gallium/drivers/v3d/driinfo_v3d.h @@ -2,4 +2,6 @@ DRI_CONF_SECTION_MISCELLANEOUS DRI_CONF_V3D_NONMSAA_TEXTURE_SIZE_LIMIT(false) + DRI_CONF_V3D_MAINTAIN_IGNORABLE_SCANOUT(false) + DRI_CONF_V3D_IS_XSERVER_PROCESS(false) DRI_CONF_SECTION_END diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build index dfa1e88097b..e47682db1aa 100644 --- a/src/gallium/drivers/v3d/meson.build +++ b/src/gallium/drivers/v3d/meson.build @@ -34,7 +34,6 @@ files_libv3d = files( 'v3d_query.c', 'v3d_query.h', 'v3d_query_pipe.c', - 'v3d_query_perfcnt.c', 'v3d_resource.c', 'v3d_resource.h', 'v3d_screen.c', @@ -47,8 +46,10 @@ files_per_version = files( 'v3dx_emit.c', 'v3dx_format_table.c', 'v3dx_job.c', + 'v3dx_query_perfcnt.c', 'v3dx_rcl.c', 'v3dx_state.c', + 'v3dx_tfu.c', ) v3d_args = ['-DV3D_BUILD_NEON'] @@ -58,7 +59,17 @@ if dep_v3dv3.found() v3d_args += '-DUSE_V3D_SIMULATOR' endif -v3d_versions = ['33', '42'] +v3d_versions = ['33', '42', '71'] + +v3d_deps = [dep_v3dv3, dep_libdrm, dep_valgrind, idep_nir_headers] + +if with_platform_x11 + v3d_deps += dep_xcb +endif + +if with_platform_wayland + v3d_deps += dep_wayland_client +endif per_version_libs = [] foreach ver : v3d_versions @@ -71,7 +82,7 @@ foreach ver : v3d_versions ], c_args : [v3d_args, '-DV3D_VERSION=' + ver], gnu_symbol_visibility : 'hidden', - dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind, idep_nir_headers], + dependencies : v3d_deps, ) endforeach @@ -94,10 +105,7 @@ libv3d = static_library( c_args : [v3d_args], cpp_args : [v3d_args], gnu_symbol_visibility : 'hidden', - dependencies : [ - dep_v3dv3, dep_libdrm, dep_valgrind, - 
idep_nir_headers, idep_mesautil, - ], + dependencies : v3d_deps + idep_mesautil, link_with: [per_version_libs], ) diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c index b7dc56a044e..ee3c14b154c 100644 --- a/src/gallium/drivers/v3d/v3d_blit.c +++ b/src/gallium/drivers/v3d/v3d_blit.c @@ -210,140 +210,6 @@ v3d_stencil_blit(struct pipe_context *ctx, struct pipe_blit_info *info) info->mask &= ~PIPE_MASK_S; } -static bool -v3d_tfu(struct pipe_context *pctx, - struct pipe_resource *pdst, - struct pipe_resource *psrc, - unsigned int src_level, - unsigned int base_level, - unsigned int last_level, - unsigned int src_layer, - unsigned int dst_layer, - bool for_mipmap) -{ - struct v3d_context *v3d = v3d_context(pctx); - struct v3d_screen *screen = v3d->screen; - struct v3d_resource *src = v3d_resource(psrc); - struct v3d_resource *dst = v3d_resource(pdst); - struct v3d_resource_slice *src_base_slice = &src->slices[src_level]; - struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level]; - int msaa_scale = pdst->nr_samples > 1 ? 2 : 1; - int width = u_minify(pdst->width0, base_level) * msaa_scale; - int height = u_minify(pdst->height0, base_level) * msaa_scale; - enum pipe_format pformat; - - if (psrc->format != pdst->format) - return false; - if (psrc->nr_samples != pdst->nr_samples) - return false; - - /* Can't write to raster. */ - if (dst_base_slice->tiling == V3D_TILING_RASTER) - return false; - - /* When using TFU for blit, we are doing exact copies (both input and - * output format must be the same, no scaling, etc), so there is no - * pixel format conversions. Thus we can rewrite the format to use one - * that is TFU compatible based on its texel size. - */ - if (for_mipmap) { - pformat = pdst->format; - } else { - switch (dst->cpp) { - case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT; break; - case 8: pformat = PIPE_FORMAT_R16G16B16A16_FLOAT; break; - case 4: pformat = PIPE_FORMAT_R32_FLOAT; break; - case 2: pformat = PIPE_FORMAT_R16_FLOAT; break; - case 1: pformat = PIPE_FORMAT_R8_UNORM; break; - default: unreachable("unsupported format bit-size"); break; - }; - } - - uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat); - struct v3d_device_info *devinfo = &screen->devinfo; - - if (!v3d_X(devinfo, tfu_supports_tex_format)(tex_format, for_mipmap)) { - assert(for_mipmap); - return false; - } - - v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false); - v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false); - - struct drm_v3d_submit_tfu tfu = { - .ios = (height << 16) | width, - .bo_handles = { - dst->bo->handle, - src != dst ? 
src->bo->handle : 0 - }, - .in_sync = v3d->out_sync, - .out_sync = v3d->out_sync, - }; - uint32_t src_offset = (src->bo->offset + - v3d_layer_offset(psrc, src_level, src_layer)); - tfu.iia |= src_offset; - if (src_base_slice->tiling == V3D_TILING_RASTER) { - tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER << - V3D33_TFU_ICFG_FORMAT_SHIFT); - } else { - tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE + - (src_base_slice->tiling - V3D_TILING_LINEARTILE)) << - V3D33_TFU_ICFG_FORMAT_SHIFT); - } - - uint32_t dst_offset = (dst->bo->offset + - v3d_layer_offset(pdst, base_level, dst_layer)); - tfu.ioa |= dst_offset; - if (last_level != base_level) - tfu.ioa |= V3D33_TFU_IOA_DIMTW; - tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE + - (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) << - V3D33_TFU_IOA_FORMAT_SHIFT); - - tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT; - tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT; - - switch (src_base_slice->tiling) { - case V3D_TILING_UIF_NO_XOR: - case V3D_TILING_UIF_XOR: - tfu.iis |= (src_base_slice->padded_height / - (2 * v3d_utile_height(src->cpp))); - break; - case V3D_TILING_RASTER: - tfu.iis |= src_base_slice->stride / src->cpp; - break; - case V3D_TILING_LINEARTILE: - case V3D_TILING_UBLINEAR_1_COLUMN: - case V3D_TILING_UBLINEAR_2_COLUMN: - break; - } - - /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the - * OPAD field for the destination (how many extra UIF blocks beyond - * those necessary to cover the height). When filling mipmaps, the - * miplevel 1+ tiling state is inferred. - */ - if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR || - dst_base_slice->tiling == V3D_TILING_UIF_XOR) { - int uif_block_h = 2 * v3d_utile_height(dst->cpp); - int implicit_padded_height = align(height, uif_block_h); - - tfu.icfg |= (((dst_base_slice->padded_height - - implicit_padded_height) / uif_block_h) << - V3D33_TFU_ICFG_OPAD_SHIFT); - } - - int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu); - if (ret != 0) { - fprintf(stderr, "Failed to submit TFU job: %d\n", ret); - return false; - } - - dst->writes++; - - return true; -} - bool v3d_generate_mipmap(struct pipe_context *pctx, struct pipe_resource *prsc, @@ -362,12 +228,16 @@ v3d_generate_mipmap(struct pipe_context *pctx, if (first_layer != last_layer) return false; - return v3d_tfu(pctx, - prsc, prsc, - base_level, - base_level, last_level, - first_layer, first_layer, - true); + struct v3d_context *v3d = v3d_context(pctx); + struct v3d_screen *screen = v3d->screen; + struct v3d_device_info *devinfo = &screen->devinfo; + + return v3d_X(devinfo, tfu)(pctx, + prsc, prsc, + base_level, + base_level, last_level, + first_layer, first_layer, + true); } static void @@ -396,11 +266,15 @@ v3d_tfu_blit(struct pipe_context *pctx, struct pipe_blit_info *info) if (info->dst.format != info->src.format) return; - if (v3d_tfu(pctx, info->dst.resource, info->src.resource, - info->src.level, - info->dst.level, info->dst.level, - info->src.box.z, info->dst.box.z, - false)) { + struct v3d_context *v3d = v3d_context(pctx); + struct v3d_screen *screen = v3d->screen; + struct v3d_device_info *devinfo = &screen->devinfo; + + if (v3d_X(devinfo, tfu)(pctx, info->dst.resource, info->src.resource, + info->src.level, + info->dst.level, info->dst.level, + info->src.box.z, info->dst.box.z, + false)) { info->mask &= ~PIPE_MASK_RGBA; } } @@ -495,7 +369,7 @@ v3d_tlb_blit(struct pipe_context *pctx, struct pipe_blit_info *info) bool double_buffer = V3D_DBG(DOUBLE_BUFFER) && !msaa; uint32_t tile_width, 
tile_height, max_bpp; - v3d_get_tile_buffer_size(msaa, double_buffer, + v3d_get_tile_buffer_size(devinfo, msaa, double_buffer, is_color_blit ? 1 : 0, surfaces, src_surf, &tile_width, &tile_height, &max_bpp); diff --git a/src/gallium/drivers/v3d/v3d_context.c b/src/gallium/drivers/v3d/v3d_context.c index f12e8c92139..1dc4bd017fe 100644 --- a/src/gallium/drivers/v3d/v3d_context.c +++ b/src/gallium/drivers/v3d/v3d_context.c @@ -220,7 +220,8 @@ v3d_flag_dirty_sampler_state(struct v3d_context *v3d, } void -v3d_get_tile_buffer_size(bool is_msaa, +v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo, + bool is_msaa, bool double_buffer, uint32_t nr_cbufs, struct pipe_surface **cbufs, @@ -232,11 +233,13 @@ v3d_get_tile_buffer_size(bool is_msaa, assert(!is_msaa || !double_buffer); uint32_t max_cbuf_idx = 0; + uint32_t total_bpp = 0; *max_bpp = 0; for (int i = 0; i < nr_cbufs; i++) { if (cbufs[i]) { struct v3d_surface *surf = v3d_surface(cbufs[i]); *max_bpp = MAX2(*max_bpp, surf->internal_bpp); + total_bpp += 4 * v3d_internal_bpp_words(surf->internal_bpp); max_cbuf_idx = MAX2(i, max_cbuf_idx); } } @@ -245,9 +248,11 @@ v3d_get_tile_buffer_size(bool is_msaa, struct v3d_surface *bsurf = v3d_surface(bbuf); assert(bbuf->texture->nr_samples <= 1 || is_msaa); *max_bpp = MAX2(*max_bpp, bsurf->internal_bpp); + total_bpp += 4 * v3d_internal_bpp_words(bsurf->internal_bpp); } - v3d_choose_tile_size(max_cbuf_idx + 1, *max_bpp, + v3d_choose_tile_size(devinfo, max_cbuf_idx + 1, + *max_bpp, total_bpp, is_msaa, double_buffer, tile_width, tile_height); } diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h index 97850b0363e..eb184b4b203 100644 --- a/src/gallium/drivers/v3d/v3d_context.h +++ b/src/gallium/drivers/v3d/v3d_context.h @@ -265,6 +265,7 @@ struct v3d_vertex_stateobj { unsigned num_elements; uint8_t attrs[16 * (V3D_MAX_VS_INPUTS / 4)]; + /* defaults can be NULL for some hw generation */ struct pipe_resource *defaults; uint32_t defaults_offset; }; @@ -794,7 +795,8 @@ void v3d_ensure_prim_counts_allocated(struct v3d_context *ctx); void v3d_flag_dirty_sampler_state(struct v3d_context *v3d, enum pipe_shader_type shader); -void v3d_get_tile_buffer_size(bool is_msaa, +void v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo, + bool is_msaa, bool double_buffer, uint32_t nr_cbufs, struct pipe_surface **cbufs, @@ -818,16 +820,52 @@ void v3d_disk_cache_store(struct v3d_context *v3d, /* Helper to call hw ver specific functions */ #define v3d_X(devinfo, thing) ({ \ - __typeof(&v3d42_##thing) v3d_X_thing; \ - if ((devinfo)->ver >= 42) \ - v3d_X_thing = &v3d42_##thing; \ - else if ((devinfo)->ver >= 33) \ + __typeof(&v3d33_##thing) v3d_X_thing; \ + switch (devinfo->ver) { \ + case 33: \ + case 40: \ v3d_X_thing = &v3d33_##thing; \ - else \ + break; \ + case 42: \ + v3d_X_thing = &v3d42_##thing; \ + break; \ + case 71: \ + v3d_X_thing = &v3d71_##thing; \ + break; \ + default: \ unreachable("Unsupported hardware generation"); \ + } \ v3d_X_thing; \ }) +/* FIXME: The same for vulkan/opengl. Common place? define it at the + * v3d_packet files? 
+ */ +#define V3D33_CLIPPER_XY_GRANULARITY 256.0f +#define V3D42_CLIPPER_XY_GRANULARITY 256.0f +#define V3D71_CLIPPER_XY_GRANULARITY 64.0f + +/* Helper to get hw-specific macro values */ +#define V3DV_X(devinfo, thing) ({ \ + __typeof(V3D33_##thing) V3D_X_THING; \ + switch (devinfo->ver) { \ + case 33: \ + case 40: \ + V3D_X_THING = V3D33_##thing; \ + break; \ + case 41: \ + case 42: \ + V3D_X_THING = V3D42_##thing; \ + break; \ + case 71: \ + V3D_X_THING = V3D71_##thing; \ + break; \ + default: \ + unreachable("Unsupported hardware generation"); \ + } \ + V3D_X_THING; \ +}) + #ifdef v3dX # include "v3dx_context.h" #else @@ -838,6 +876,10 @@ void v3d_disk_cache_store(struct v3d_context *v3d, # define v3dX(x) v3d42_##x # include "v3dx_context.h" # undef v3dX + +# define v3dX(x) v3d71_##x +# include "v3dx_context.h" +# undef v3dX #endif #endif /* V3D_CONTEXT_H */ diff --git a/src/gallium/drivers/v3d/v3d_job.c b/src/gallium/drivers/v3d/v3d_job.c index b022ed45073..577890a06c3 100644 --- a/src/gallium/drivers/v3d/v3d_job.c +++ b/src/gallium/drivers/v3d/v3d_job.c @@ -383,9 +383,11 @@ v3d_get_job_for_fbo(struct v3d_context *v3d) job->double_buffer = false; } - v3d_get_tile_buffer_size(job->msaa, job->double_buffer, + v3d_get_tile_buffer_size(&v3d->screen->devinfo, + job->msaa, job->double_buffer, job->nr_cbufs, job->cbufs, job->bbuf, - &job->tile_width, &job->tile_height, + &job->tile_width, + &job->tile_height, &job->internal_bpp); /* The dirty flags are tracking what's been updated while v3d->job has diff --git a/src/gallium/drivers/v3d/v3d_query.c b/src/gallium/drivers/v3d/v3d_query.c index db98c89625f..83f82e44a3d 100644 --- a/src/gallium/drivers/v3d/v3d_query.c +++ b/src/gallium/drivers/v3d/v3d_query.c @@ -28,8 +28,11 @@ v3d_get_driver_query_group_info(struct pipe_screen *pscreen, unsigned index, struct pipe_driver_query_group_info *info) { struct v3d_screen *screen = v3d_screen(pscreen); + struct v3d_device_info *devinfo = &screen->devinfo; - return v3d_get_driver_query_group_info_perfcnt(screen, index, info); + return v3d_X(devinfo, get_driver_query_group_info_perfcnt)(screen, + index, + info); } int @@ -37,8 +40,11 @@ v3d_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, struct pipe_driver_query_info *info) { struct v3d_screen *screen = v3d_screen(pscreen); + struct v3d_device_info *devinfo = &screen->devinfo; - return v3d_get_driver_query_info_perfcnt(screen, index, info); + return v3d_X(devinfo, get_driver_query_info_perfcnt)(screen, + index, + info); } static struct pipe_query * @@ -53,9 +59,13 @@ static struct pipe_query * v3d_create_batch_query(struct pipe_context *pctx, unsigned num_queries, unsigned *query_types) { - return v3d_create_batch_query_perfcnt(v3d_context(pctx), - num_queries, - query_types); + struct v3d_context *v3d = v3d_context(pctx); + struct v3d_screen *screen = v3d->screen; + struct v3d_device_info *devinfo = &screen->devinfo; + + return v3d_X(devinfo, create_batch_query_perfcnt)(v3d_context(pctx), + num_queries, + query_types); } static void diff --git a/src/gallium/drivers/v3d/v3d_query.h b/src/gallium/drivers/v3d/v3d_query.h index 3e1426b8d86..605ed1a12f9 100644 --- a/src/gallium/drivers/v3d/v3d_query.h +++ b/src/gallium/drivers/v3d/v3d_query.h @@ -42,11 +42,5 @@ struct v3d_query }; struct pipe_query *v3d_create_query_pipe(struct v3d_context *v3d, unsigned query_type, unsigned index); -struct pipe_query *v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries, - unsigned *query_types); -int 
v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index, - struct pipe_driver_query_group_info *info); -int v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index, - struct pipe_driver_query_info *info); #endif /* V3D_QUERY_H */ diff --git a/src/gallium/drivers/v3d/v3d_resource.c b/src/gallium/drivers/v3d/v3d_resource.c index a0a210ccad5..8e31acb0ff0 100644 --- a/src/gallium/drivers/v3d/v3d_resource.c +++ b/src/gallium/drivers/v3d/v3d_resource.c @@ -439,7 +439,7 @@ v3d_resource_get_handle(struct pipe_screen *pscreen, case WINSYS_HANDLE_TYPE_SHARED: return v3d_bo_flink(bo, &whandle->handle); case WINSYS_HANDLE_TYPE_KMS: - if (screen->ro) { + if (screen->ro && rsc->scanout) { if (renderonly_get_handle(rsc->scanout, whandle)) { whandle->stride = rsc->slices[0].stride; return true; @@ -785,6 +785,27 @@ v3d_resource_setup(struct pipe_screen *pscreen, return rsc; } +static bool +v3d_resource_should_scanout(struct pipe_screen *pscreen, + const struct pipe_resource *tmpl, + const uint64_t *modifiers, + int count) +{ + struct v3d_screen *screen = v3d_screen(pscreen); + + if (tmpl->bind & PIPE_BIND_SCANOUT) { + if (screen->maintain_ignorable_scanout) + return true; + if (screen->has_x_session && screen->ignore_scanout_usages) { + if (drm_find_modifier(DRM_FORMAT_MOD_BROADCOM_UIF, + modifiers, count)) + return false; + } + return true; + } + return false; +} + static struct pipe_resource * v3d_resource_create_with_modifiers(struct pipe_screen *pscreen, const struct pipe_resource *tmpl, @@ -798,6 +819,8 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen, struct pipe_resource *prsc = &rsc->base; /* Use a tiled layout if we can, for better 3D performance. */ bool should_tile = true; + bool should_scanout = v3d_resource_should_scanout(pscreen, tmpl, + modifiers, count); assert(tmpl->target != PIPE_BUFFER || (tmpl->format == PIPE_FORMAT_NONE || @@ -827,7 +850,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen, /* If using the old-school SCANOUT flag, we don't know what the screen * might support other than linear. Just force linear. */ - if (tmpl->bind & PIPE_BIND_SCANOUT) + if ((tmpl->bind & PIPE_BIND_SCANOUT) && should_scanout) should_tile = false; /* No user-specified modifier; determine our own. */ @@ -849,7 +872,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen, v3d_setup_slices(rsc, 0, tmpl->bind & PIPE_BIND_SHARED); - if (screen->ro && (tmpl->bind & PIPE_BIND_SCANOUT)) { + if (screen->ro && should_scanout) { struct winsys_handle handle; struct pipe_resource scanout_tmpl = { .target = prsc->target, @@ -979,7 +1002,7 @@ v3d_resource_from_handle(struct pipe_screen *pscreen, } } - if (screen->ro) { + if (screen->ro && !rsc->tiled) { /* Make sure that renderonly has a handle to our buffer in the * display's fd, so that a later renderonly_get_handle() * returns correct handles or GEM names. 
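The CLIPPER_XY_GRANULARITY defines and the V3DV_X() lookup introduced above encode the fact that the clipper consumes viewport half-extents in 1/256th-of-a-pixel units up to V3D 4.2 but in 1/64th-of-a-pixel units on V3D 7.1, which is why the viewport scale uniforms and CLIPPER_XY_SCALING now multiply by a per-version factor instead of a hard-coded 256.0f. A self-contained sketch of that conversion follows; the helper name is made up for illustration and is not part of the driver:

#include <stdint.h>

/* Convert a viewport half-extent in pixels to the fixed-point units the
 * clipper expects: 1/256 px granularity up to v4.2, 1/64 px on v7.1.
 */
static float
viewport_half_extent_in_hw_units(uint8_t ver, float half_extent_in_pixels)
{
   float granularity = (ver >= 71) ? 64.0f : 256.0f;
   return half_extent_in_pixels * granularity;
}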
@@ -1025,7 +1048,9 @@ v3d_update_shadow_texture(struct pipe_context *pctx, assert(view->texture != pview->texture); - if (shadow->writes == orig->writes && orig->bo->private) + if (shadow->writes == orig->writes && + orig->base.sync_status == 0 && + (orig->bo->private || orig->base.sync_condition)) return; perf_debug("Updating %dx%d@%d shadow for linear texture\n", @@ -1068,6 +1093,7 @@ v3d_update_shadow_texture(struct pipe_context *pctx, } shadow->writes = orig->writes; + orig->base.sync_status = 0; } static struct pipe_surface * diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c index bce1eeafcd9..4d2478b130d 100644 --- a/src/gallium/drivers/v3d/v3d_screen.c +++ b/src/gallium/drivers/v3d/v3d_screen.c @@ -47,6 +47,42 @@ #include "compiler/v3d_compiler.h" #include "drm-uapi/drm_fourcc.h" +#ifdef HAVE_WAYLAND_PLATFORM +#include <wayland-client.h> +#endif + +#ifdef HAVE_X11_PLATFORM +#include <xcb/xcb.h> +#endif + +static bool +check_x_session() +{ + bool xcb_connection = false; + +#ifdef HAVE_WAYLAND_PLATFORM + struct wl_display *display; + + display = wl_display_connect(NULL); + + if (display) { + wl_display_disconnect(display); + return xcb_connection; + } +#endif + +#ifdef HAVE_X11_PLATFORM + xcb_connection_t *conn; + + conn = xcb_connect(NULL, NULL); + + if (!xcb_connection_has_error(conn)) + xcb_connection = true; + xcb_disconnect(conn); +#endif + return xcb_connection; +} + static const char * v3d_screen_get_name(struct pipe_screen *pscreen) { @@ -255,9 +291,8 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: return V3D_MAX_ARRAY_LAYERS; - /* Render targets. */ case PIPE_CAP_MAX_RENDER_TARGETS: - return 4; + return V3D_MAX_RENDER_TARGETS(screen->devinfo.ver); case PIPE_CAP_VENDOR_ID: return 0x14E4; @@ -919,6 +954,12 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config, if (!v3d_get_device_info(screen->fd, &screen->devinfo, &v3d_ioctl)) goto fail; + if (screen->devinfo.ver >= 71) { + fprintf(stderr, "WARNING: v3d support for hw version %i is neither " + "a complete nor a conformant OpenGL implementation.
Testing " + "use only.\n", screen->devinfo.ver); + } + driParseConfigFiles(config->options, config->options_info, 0, "v3d", NULL, NULL, NULL, 0, NULL, 0); @@ -937,6 +978,29 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config, v3d_has_feature(screen, DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH); screen->has_perfmon = v3d_has_feature(screen, DRM_V3D_PARAM_SUPPORTS_PERFMON); + screen->ignore_scanout_usages = getenv("V3D_IGNORE_SCANOUT_USAGES"); + + const char *is_xserver_process = + "v3d_is_xserver_process"; + screen->is_xserver_process = + driCheckOption(config->options, + is_xserver_process, + DRI_BOOL) && + driQueryOptionb(config->options, + is_xserver_process); + + const char *maintain_ignorable_scanout_name = + "v3d_maintain_ignorable_scanout"; + screen->maintain_ignorable_scanout = + driCheckOption(config->options, + maintain_ignorable_scanout_name, + DRI_BOOL) && + driQueryOptionb(config->options, + maintain_ignorable_scanout_name); + + screen->has_x_session = !screen->is_xserver_process && + check_x_session(); + v3d_fence_init(screen); v3d_process_debug_variable(); diff --git a/src/gallium/drivers/v3d/v3d_screen.h b/src/gallium/drivers/v3d/v3d_screen.h index 1da9b83c965..c0f22707075 100644 --- a/src/gallium/drivers/v3d/v3d_screen.h +++ b/src/gallium/drivers/v3d/v3d_screen.h @@ -83,6 +83,12 @@ struct v3d_screen { bool has_cache_flush; bool has_perfmon; bool nonmsaa_texture_size_limit; + bool ignore_scanout_usages; + bool is_xserver_process; + bool maintain_ignorable_scanout; + + /* Are we running in an X session? */ + bool has_x_session; struct v3d_simulator_file *sim_file; diff --git a/src/gallium/drivers/v3d/v3d_uniforms.c b/src/gallium/drivers/v3d/v3d_uniforms.c index 95eb838954f..1b8758bae7d 100644 --- a/src/gallium/drivers/v3d/v3d_uniforms.c +++ b/src/gallium/drivers/v3d/v3d_uniforms.c @@ -261,6 +261,7 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job, struct v3d_compiled_shader *shader, enum pipe_shader_type stage) { + struct v3d_device_info *devinfo = &v3d->screen->devinfo; struct v3d_constbuf_stateobj *cb = &v3d->constbuf[stage]; struct v3d_texture_stateobj *texstate = &v3d->tex[stage]; struct v3d_uniform_list *uinfo = &shader->prog_data.base->uniforms; @@ -282,6 +283,9 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job, struct v3d_cl_out *uniforms = cl_start(&job->indirect); + float clipper_xy_granularity = + V3DV_X(devinfo, CLIPPER_XY_GRANULARITY); + for (int i = 0; i < uinfo->count; i++) { uint32_t data = uinfo->data[i]; @@ -293,10 +297,10 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job, cl_aligned_u32(&uniforms, gallium_uniforms[data]); break; case QUNIFORM_VIEWPORT_X_SCALE: - cl_aligned_f(&uniforms, v3d->viewport.scale[0] * 256.0f); + cl_aligned_f(&uniforms, v3d->viewport.scale[0] * clipper_xy_granularity); break; case QUNIFORM_VIEWPORT_Y_SCALE: - cl_aligned_f(&uniforms, v3d->viewport.scale[1] * 256.0f); + cl_aligned_f(&uniforms, v3d->viewport.scale[1] * clipper_xy_granularity); break; case QUNIFORM_VIEWPORT_Z_OFFSET: diff --git a/src/gallium/drivers/v3d/v3dx_context.h b/src/gallium/drivers/v3d/v3dx_context.h index 03d7c244ea2..c487ac3b996 100644 --- a/src/gallium/drivers/v3d/v3dx_context.h +++ b/src/gallium/drivers/v3d/v3dx_context.h @@ -51,3 +51,23 @@ void v3dX(get_internal_type_bpp_for_output_format)(uint32_t format, */ bool v3dX(tfu_supports_tex_format)(uint32_t tex_format, bool for_mipmap); + +bool v3dX(tfu)(struct pipe_context *pctx, + struct pipe_resource *pdst, + struct pipe_resource *psrc, + unsigned int 
src_level, + unsigned int base_level, + unsigned int last_level, + unsigned int src_layer, + unsigned int dst_layer, + bool for_mipmap); + +int v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen, + unsigned index, + struct pipe_driver_query_group_info *info); +int v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen, + unsigned index, + struct pipe_driver_query_info *info); +struct pipe_query *v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d, + unsigned num_queries, + unsigned *query_types); diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c index 0640dab1884..85083035ea6 100644 --- a/src/gallium/drivers/v3d/v3dx_draw.c +++ b/src/gallium/drivers/v3d/v3dx_draw.c @@ -95,7 +95,25 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job) #endif assert(!job->msaa || !job->double_buffer); -#if V3D_VERSION >= 40 +#if V3D_VERSION >= 71 + cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { + config.width_in_pixels = job->draw_width; + config.height_in_pixels = job->draw_height; + + config.log2_tile_width = log2_tile_size(job->tile_width); + config.log2_tile_height = log2_tile_size(job->tile_height); + + /* FIXME: ideally we would like this assert on the packet header (it is + * generic, so it also applies to GL). We would need to expand + * gen_pack_header for that. + */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); + } + +#endif + +#if V3D_VERSION >= 40 && V3D_VERSION <= 42 cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { config.width_in_pixels = job->draw_width; config.height_in_pixels = job->draw_height; @@ -107,7 +125,8 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job) config.maximum_bpp_of_all_render_targets = job->internal_bpp; } -#else /* V3D_VERSION < 40 */ +#endif +#if V3D_VERSION < 40 /* "Binning mode lists start with a Tile Binning Mode Configuration * item (120)" * @@ -134,7 +153,7 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job) config.maximum_bpp_of_all_render_targets = job->internal_bpp; } -#endif /* V3D_VERSION < 40 */ +#endif /* There's definitely nothing in the VCD cache we want.
*/ cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin); @@ -377,7 +396,9 @@ v3d_emit_gs_state_record(struct v3d_job *job, gs_bin->prog_data.gs->base.threads == 4; shader.geometry_bin_mode_shader_start_in_final_thread_section = gs_bin->prog_data.gs->base.single_seg; +#if V3D_VERSION <= 42 shader.geometry_bin_mode_shader_propagate_nans = true; +#endif shader.geometry_bin_mode_shader_uniforms_address = gs_bin_uniforms; @@ -387,7 +408,9 @@ v3d_emit_gs_state_record(struct v3d_job *job, gs->prog_data.gs->base.threads == 4; shader.geometry_render_mode_shader_start_in_final_thread_section = gs->prog_data.gs->base.single_seg; +#if V3D_VERSION <= 42 shader.geometry_render_mode_shader_propagate_nans = true; +#endif shader.geometry_render_mode_shader_uniforms_address = gs_render_uniforms; } @@ -638,10 +661,6 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, shader.number_of_varyings_in_fragment_shader = v3d->prog.fs->prog_data.fs->num_inputs; - shader.coordinate_shader_propagate_nans = true; - shader.vertex_shader_propagate_nans = true; - shader.fragment_shader_propagate_nans = true; - shader.coordinate_shader_code_address = cl_address(v3d_resource(v3d->prog.cs->resource)->bo, v3d->prog.cs->offset); @@ -652,6 +671,11 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, cl_address(v3d_resource(v3d->prog.fs->resource)->bo, v3d->prog.fs->offset); +#if V3D_VERSION <= 42 + shader.coordinate_shader_propagate_nans = true; + shader.vertex_shader_propagate_nans = true; + shader.fragment_shader_propagate_nans = true; + /* XXX: Use combined input/output size flag in the common * case. */ @@ -659,13 +683,24 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, v3d->prog.cs->prog_data.vs->separate_segments; shader.vertex_shader_has_separate_input_and_output_vpm_blocks = v3d->prog.vs->prog_data.vs->separate_segments; - shader.coordinate_shader_input_vpm_segment_size = v3d->prog.cs->prog_data.vs->separate_segments ? v3d->prog.cs->prog_data.vs->vpm_input_size : 1; shader.vertex_shader_input_vpm_segment_size = v3d->prog.vs->prog_data.vs->separate_segments ? v3d->prog.vs->prog_data.vs->vpm_input_size : 1; +#endif + /* On V3D 7.1 there isn't a specific flag to set if we are using + * shared/separate segments or not. We just set the value of + * vpm_input_size to 0, and set output to the max needed. That should be + * already properly set on prog_data_vs_bin + */ +#if V3D_VERSION == 71 + shader.coordinate_shader_input_vpm_segment_size = + v3d->prog.cs->prog_data.vs->vpm_input_size; + shader.vertex_shader_input_vpm_segment_size = + v3d->prog.vs->prog_data.vs->vpm_input_size; +#endif shader.coordinate_shader_output_vpm_segment_size = v3d->prog.cs->prog_data.vs->vpm_output_size; @@ -724,9 +759,11 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, shader.instance_id_read_by_vertex_shader = v3d->prog.vs->prog_data.vs->uses_iid; +#if V3D_VERSION <= 42 shader.address_of_default_attribute_values = cl_address(v3d_resource(vtx->defaults)->bo, vtx->defaults_offset); +#endif } bool cs_loaded_any = false; @@ -1436,8 +1473,15 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info) submit.cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT; - /* Number of batches the dispatch will invoke (minus 1). */ - submit.cfg[4] = num_batches - 1; + /* Number of batches the dispatch will invoke. 
+ * V3D 7.1.6 and later don't subtract 1 from the number of batches + */ + if (v3d->screen->devinfo.ver < 71 || + (v3d->screen->devinfo.ver == 71 && v3d->screen->devinfo.rev < 6)) { + submit.cfg[4] = num_batches - 1; + } else { + submit.cfg[4] = num_batches; + } /* Make sure we didn't accidentally underflow. */ assert(submit.cfg[4] != ~0); @@ -1445,7 +1489,8 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info) v3d_job_add_bo(job, v3d_resource(v3d->prog.compute->resource)->bo); submit.cfg[5] = (v3d_resource(v3d->prog.compute->resource)->bo->offset + v3d->prog.compute->offset); - submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; + if (v3d->screen->devinfo.ver < 71) + submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; if (v3d->prog.compute->prog_data.base->single_seg) submit.cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; if (v3d->prog.compute->prog_data.base->threads == 4) @@ -1560,9 +1605,10 @@ v3d_tlb_clear(struct v3d_job *job, unsigned buffers, /* GFXH-1461: If we were to emit a load of just depth or just stencil, * then the clear for the other may get lost. We need to decide now * if it would be possible to need to emit a load of just one after - * we've set up our TLB clears. + * we've set up our TLB clears. This issue has been fixed since V3D 4.3.18. */ - if (buffers & PIPE_CLEAR_DEPTHSTENCIL && + if (v3d->screen->devinfo.ver <= 42 && + buffers & PIPE_CLEAR_DEPTHSTENCIL && (buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL && job->zsbuf && util_format_is_depth_and_stencil(job->zsbuf->texture->format)) { diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c index 0ad3fb68b1e..82a45e44f82 100644 --- a/src/gallium/drivers/v3d/v3dx_emit.c +++ b/src/gallium/drivers/v3d/v3dx_emit.c @@ -512,13 +512,17 @@ v3dX(emit_state)(struct pipe_context *pctx) /* Note: EZ state may update based on the compiled FS, * along with ZSA */ +#if V3D_VERSION <= 42 config.early_z_updates_enable = (job->ez_state != V3D_EZ_DISABLED); +#endif if (v3d->zsa->base.depth_enabled) { config.z_updates_enable = v3d->zsa->base.depth_writemask; +#if V3D_VERSION <= 42 config.early_z_enable = config.early_z_updates_enable; +#endif config.depth_test_function = v3d->zsa->base.depth_func; } else { @@ -535,13 +539,27 @@ v3dX(emit_state)(struct pipe_context *pctx) v3d_line_smoothing_enabled(v3d) ? V3D_LINE_RASTERIZATION_PERP_END_CAPS : V3D_LINE_RASTERIZATION_DIAMOND_EXIT; - } +#if V3D_VERSION >= 71 + /* This follows the logic implemented in v3dv, + * plus the definitions of depth_clip_near/far and + * depth_clamp. + * + * Note: some extensions that would affect this + * (like ARB_depth_clamp) are not supported by v3d, + * but the rasterizer state values already take that + * into account.
+ */ + config.z_clipping_mode = v3d->rasterizer->base.depth_clip_near || + v3d->rasterizer->base.depth_clip_far; +#endif + } } if (v3d->dirty & V3D_DIRTY_RASTERIZER && v3d->rasterizer->base.offset_tri) { - if (job->zsbuf && + if (v3d->screen->devinfo.ver <= 42 && + job->zsbuf && job->zsbuf->format == PIPE_FORMAT_Z16_UNORM) { cl_emit_prepacked_sized(&job->bcl, v3d->rasterizer->depth_offset_z16, @@ -564,12 +582,23 @@ v3dX(emit_state)(struct pipe_context *pctx) } if (v3d->dirty & V3D_DIRTY_VIEWPORT) { +#if V3D_VERSION <= 42 cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { clip.viewport_half_width_in_1_256th_of_pixel = v3d->viewport.scale[0] * 256.0f; clip.viewport_half_height_in_1_256th_of_pixel = v3d->viewport.scale[1] * 256.0f; } +#endif +#if V3D_VERSION >= 71 + cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) { + clip.viewport_half_width_in_1_64th_of_pixel = + v3d->viewport.scale[0] * 64.0f; + clip.viewport_half_height_in_1_64th_of_pixel = + v3d->viewport.scale[1] * 64.0f; + } +#endif + cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) { clip.viewport_z_offset_zc_to_zs = @@ -633,8 +662,10 @@ v3dX(emit_state)(struct pipe_context *pctx) } #endif + const uint32_t max_rts = + V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver); if (blend->base.independent_blend_enable) { - for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) + for (int i = 0; i < max_rts; i++) emit_rt_blend(v3d, job, &blend->base, i, (1 << i), v3d->blend_dst_alpha_one & (1 << i)); @@ -650,16 +681,16 @@ v3dX(emit_state)(struct pipe_context *pctx) * RTs without. */ emit_rt_blend(v3d, job, &blend->base, 0, - ((1 << V3D_MAX_DRAW_BUFFERS) - 1) & + ((1 << max_rts) - 1) & v3d->blend_dst_alpha_one, true); emit_rt_blend(v3d, job, &blend->base, 0, - ((1 << V3D_MAX_DRAW_BUFFERS) - 1) & + ((1 << max_rts) - 1) & ~v3d->blend_dst_alpha_one, false); } else { emit_rt_blend(v3d, job, &blend->base, 0, - (1 << V3D_MAX_DRAW_BUFFERS) - 1, + (1 << max_rts) - 1, v3d->blend_dst_alpha_one); } } @@ -668,8 +699,10 @@ v3dX(emit_state)(struct pipe_context *pctx) if (v3d->dirty & V3D_DIRTY_BLEND) { struct pipe_blend_state *blend = &v3d->blend->base; + const uint32_t max_rts = + V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver); cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) { - for (int i = 0; i < 4; i++) { + for (int i = 0; i < max_rts; i++) { int rt = blend->independent_blend_enable ? 
i : 0; int rt_mask = blend->rt[rt].colormask; diff --git a/src/gallium/drivers/v3d/v3d_query_perfcnt.c b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c similarity index 94% rename from src/gallium/drivers/v3d/v3d_query_perfcnt.c rename to src/gallium/drivers/v3d/v3dx_query_perfcnt.c index e00d84e375f..431aad14b4f 100644 --- a/src/gallium/drivers/v3d/v3d_query_perfcnt.c +++ b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c @@ -52,8 +52,8 @@ kperfmon_destroy(struct v3d_context *v3d, struct v3d_perfmon_state *perfmon) } int -v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index, - struct pipe_driver_query_group_info *info) +v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen, unsigned index, + struct pipe_driver_query_group_info *info) { if (!screen->has_perfmon) return 0; @@ -72,8 +72,8 @@ v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned inde } int -v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index, - struct pipe_driver_query_info *info) +v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen, unsigned index, + struct pipe_driver_query_info *info) { if (!screen->has_perfmon) return 0; @@ -222,8 +222,8 @@ static const struct v3d_query_funcs perfcnt_query_funcs = { }; struct pipe_query * -v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries, - unsigned *query_types) +v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d, unsigned num_queries, + unsigned *query_types) { struct v3d_query_perfcnt *pquery = NULL; struct v3d_query *query; diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c index 82547437c25..d3fbc9aff5d 100644 --- a/src/gallium/drivers/v3d/v3dx_rcl.c +++ b/src/gallium/drivers/v3d/v3dx_rcl.c @@ -23,8 +23,9 @@ #include "util/format/u_format.h" #include "v3d_context.h" -#include "broadcom/common/v3d_tiling.h" #include "broadcom/common/v3d_macros.h" +#include "broadcom/common/v3d_tiling.h" +#include "broadcom/common/v3d_util.h" #include "broadcom/cle/v3dx_pack.h" #define PIPE_CLEAR_COLOR_BUFFERS (PIPE_CLEAR_COLOR0 | \ @@ -419,10 +420,16 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer) * clearing Z/S. */ if (job->clear) { +#if V3D_VERSION <= 42 cl_emit(cl, CLEAR_TILE_BUFFERS, clear) { clear.clear_z_stencil_buffer = !job->early_zs_clear; clear.clear_all_render_targets = true; } +#endif +#if V3D_VERSION >= 71 + cl_emit(cl, CLEAR_RENDER_TARGETS, clear); +#endif + } #endif /* V3D_VERSION >= 40 */ } @@ -483,10 +490,64 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer) } } -#if V3D_VERSION >= 40 +#if V3D_VERSION > 33 +/* Note that for v71, the render target cfg packets have just one field that + * combines the internal type and clamp mode. For simplicity we keep just one + * helper. + * + * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
+ * + */ +static uint32_t +v3dX(clamp_for_format_and_type)(uint32_t rt_type, + enum pipe_format format) +{ +#if V3D_VERSION == 42 + if (util_format_is_pure_integer(format)) { + return V3D_RENDER_TARGET_CLAMP_INT; + } else if (util_format_is_srgb(format)) { + return V3D_RENDER_TARGET_CLAMP_NORM; + } else { + return V3D_RENDER_TARGET_CLAMP_NONE; + } +#endif +#if V3D_VERSION >= 71 + switch (rt_type) { + case V3D_INTERNAL_TYPE_8I: + return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED; + case V3D_INTERNAL_TYPE_8UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED; + case V3D_INTERNAL_TYPE_8: + return V3D_RENDER_TARGET_TYPE_CLAMP_8; + case V3D_INTERNAL_TYPE_16I: + return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED; + case V3D_INTERNAL_TYPE_16UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED; + case V3D_INTERNAL_TYPE_16F: + return util_format_is_srgb(format) ? + V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM : + V3D_RENDER_TARGET_TYPE_CLAMP_16F; + case V3D_INTERNAL_TYPE_32I: + return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED; + case V3D_INTERNAL_TYPE_32UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED; + case V3D_INTERNAL_TYPE_32F: + return V3D_RENDER_TARGET_TYPE_CLAMP_32F; + default: + unreachable("Unknown internal render target type"); + } + return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID; +#endif + return 0; +} +#endif + +#if V3D_VERSION >= 71 static void -v3d_setup_render_target(struct v3d_job *job, int cbuf, - uint32_t *rt_bpp, uint32_t *rt_type, uint32_t *rt_clamp) +v3d_setup_render_target(struct v3d_job *job, + int cbuf, + uint32_t *rt_bpp, + uint32_t *rt_type_clamp) { if (!job->cbufs[cbuf]) return; @@ -497,19 +558,35 @@ v3d_setup_render_target(struct v3d_job *job, int cbuf, struct v3d_surface *bsurf = v3d_surface(job->bbuf); *rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp); } - *rt_type = surf->internal_type; - if (util_format_is_srgb(surf->base.format)) - *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM; -#if V3D_VERSION >= 42 - else if (util_format_is_pure_integer(surf->base.format)) - *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT; -#endif - else - *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE; + *rt_type_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type, + surf->base.format); } +#endif -#else /* V3D_VERSION < 40 */ +#if V3D_VERSION >= 40 && V3D_VERSION <= 42 +static void +v3d_setup_render_target(struct v3d_job *job, + int cbuf, + uint32_t *rt_bpp, + uint32_t *rt_type, + uint32_t *rt_clamp) +{ + if (!job->cbufs[cbuf]) + return; + + struct v3d_surface *surf = v3d_surface(job->cbufs[cbuf]); + *rt_bpp = surf->internal_bpp; + if (job->bbuf) { + struct v3d_surface *bsurf = v3d_surface(job->bbuf); + *rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp); + } + *rt_type = surf->internal_type; + *rt_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type, + surf->base.format); +} +#endif +#if V3D_VERSION < 40 static void v3d_emit_z_stencil_config(struct v3d_job *job, struct v3d_surface *surf, struct v3d_resource *rsc, bool is_separate_stencil) @@ -656,7 +733,8 @@ emit_render_layer(struct v3d_job *job, uint32_t layer) cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = NONE; } -#else +#endif +#if V3D_VERSION >= 40 for (int i = 0; i < 2; i++) { if (i > 0) cl_emit(&job->rcl, TILE_COORDINATES, coords); @@ -664,16 +742,20 @@ emit_render_layer(struct v3d_job *job, uint32_t layer) cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = NONE; } + if (i == 0 || do_double_initial_tile_clear(job)) { +#if V3D_VERSION < 71 cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) { 
clear.clear_z_stencil_buffer = !job->early_zs_clear; clear.clear_all_render_targets = true; } +#else + cl_emit(&job->rcl, CLEAR_RENDER_TARGETS, clear); +#endif } cl_emit(&job->rcl, END_OF_TILE_MARKER, end); } #endif - cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush); v3d_rcl_emit_generic_per_tile_list(job, layer); @@ -775,18 +857,52 @@ v3dX(emit_rcl)(struct v3d_job *job) config.multisample_mode_4x = job->msaa; config.double_buffer_in_non_ms_mode = job->double_buffer; +#if V3D_VERSION <= 42 config.maximum_bpp_of_all_render_targets = job->internal_bpp; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = log2_tile_size(job->tile_width); + config.log2_tile_height = log2_tile_size(job->tile_height); + + /* FIXME: ideally we would like this assert to be on the packet header (as it + * is generic, so it also applies to GL). We would need to expand + * gen_pack_header for that. + */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); +#endif + } +#if V3D_VERSION >= 71 + uint32_t base_addr = 0; + + /* If we don't have any color RTs, we still need to emit one and flag + * it as unused using stride = 1 + */ + if (job->nr_cbufs == 0) { + cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.stride = 1; /* Unused */ + } + } +#endif for (int i = 0; i < job->nr_cbufs; i++) { struct pipe_surface *psurf = job->cbufs[i]; - if (!psurf) + if (!psurf) { +#if V3D_VERSION >= 71 + cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.render_target_number = i; + rt.stride = 1; /* Unused */ + } +#endif continue; + } + struct v3d_surface *surf = v3d_surface(psurf); struct v3d_resource *rsc = v3d_resource(psurf->texture); UNUSED uint32_t config_pad = 0; - uint32_t clear_pad = 0; + UNUSED uint32_t clear_pad = 0; /* XXX: Set the pad for raster.
*/ if (surf->tiling == V3D_TILING_UIF_NO_XOR || @@ -819,6 +935,7 @@ v3dX(emit_rcl)(struct v3d_job *job) } #endif /* V3D_VERSION < 40 */ +#if V3D_VERSION <= 42 cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { clear.clear_color_low_32_bits = job->clear_color[i][0]; @@ -847,9 +964,42 @@ v3dX(emit_rcl)(struct v3d_job *job) clear.render_target_number = i; }; } +#endif +#if V3D_VERSION >= 71 + cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.clear_color_low_bits = job->clear_color[i][0]; + v3d_setup_render_target(job, i, &rt.internal_bpp, + &rt.internal_type_and_clamping); + rt.stride = + v3d_compute_rt_row_row_stride_128_bits(job->tile_width, + v3d_internal_bpp_words(rt.internal_bpp)); + rt.base_address = base_addr; + rt.render_target_number = i; + + base_addr += (job->tile_height * rt.stride) / 8; + } + + if (surf->internal_bpp >= V3D_INTERNAL_BPP_64) { + cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { + rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ + ((uint64_t) job->clear_color[i][1]) | + (((uint64_t) (job->clear_color[i][2] & 0xff)) << 32); + rt.render_target_number = i; + } + } + + if (surf->internal_bpp >= V3D_INTERNAL_BPP_128) { + cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { + rt.clear_color_top_bits = /* 56 bits (24 + 32) */ + (((uint64_t) (job->clear_color[i][2] & 0xffffff00)) >> 8) | + (((uint64_t) (job->clear_color[i][3])) << 24); + rt.render_target_number = i; + } + } +#endif } -#if V3D_VERSION >= 40 +#if V3D_VERSION >= 40 && V3D_VERSION <= 42 cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { v3d_setup_render_target(job, 0, &rt.render_target_0_internal_bpp, diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c index 0f1735fee66..a7fad572a2d 100644 --- a/src/gallium/drivers/v3d/v3dx_state.c +++ b/src/gallium/drivers/v3d/v3dx_state.c @@ -111,9 +111,10 @@ v3d_create_rasterizer_state(struct pipe_context *pctx, #endif } - /* The HW treats polygon offset units based on a Z24 buffer, so we + /* V3D 4.x treats polygon offset units based on a Z24 buffer, so we * need to scale up offset_units if we're only Z16. */ +#if V3D_VERSION <= 42 v3dx_pack(&so->depth_offset_z16, DEPTH_OFFSET, depth) { depth.depth_offset_factor = cso->offset_scale; depth.depth_offset_units = cso->offset_units * 256.0; @@ -121,6 +122,7 @@ v3d_create_rasterizer_state(struct pipe_context *pctx, depth.limit = cso->offset_clamp; #endif } +#endif return so; } @@ -138,8 +140,9 @@ v3d_create_blend_state(struct pipe_context *pctx, so->base = *cso; + uint32_t max_rts = V3D_MAX_RENDER_TARGETS(V3D_VERSION); if (cso->independent_blend_enable) { - for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) { + for (int i = 0; i < max_rts; i++) { so->blend_enables |= cso->rt[i].blend_enable << i; /* V3D 4.x is when we got independent blend enables. */ @@ -148,7 +151,7 @@ v3d_create_blend_state(struct pipe_context *pctx, } } else { if (cso->rt[0].blend_enable) - so->blend_enables = (1 << V3D_MAX_DRAW_BUFFERS) - 1; + so->blend_enables = (1 << max_rts) - 1; } return so; @@ -337,6 +340,20 @@ v3d_zsa_state_bind(struct pipe_context *pctx, void *hwcso) v3d->dirty |= V3D_DIRTY_ZSA; } + +static bool +needs_default_attribute_values(void) +{ +#if V3D_VERSION <= 42 + /* FIXME: on vulkan we are able to refine even further, as we know in + * advance when we create the pipeline if we have an integer vertex + * attrib. We should check whether we can do something similar here.
+ */ + return true; +#endif + return false; +} + static void * v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, const struct pipe_vertex_element *elements) @@ -414,24 +431,29 @@ v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, } } - /* Set up the default attribute values in case any of the vertex - * elements use them. - */ - uint32_t *attrs; - u_upload_alloc(v3d->state_uploader, 0, - V3D_MAX_VS_INPUTS * sizeof(float), 16, - &so->defaults_offset, &so->defaults, (void **)&attrs); - - for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) { - attrs[i * 4 + 0] = 0; - attrs[i * 4 + 1] = 0; - attrs[i * 4 + 2] = 0; - if (i < so->num_elements && - util_format_is_pure_integer(so->pipe[i].src_format)) { - attrs[i * 4 + 3] = 1; - } else { - attrs[i * 4 + 3] = fui(1.0); + if (needs_default_attribute_values()) { + /* Set up the default attribute values in case any of the vertex + * elements use them. + */ + uint32_t *attrs; + u_upload_alloc(v3d->state_uploader, 0, + V3D_MAX_VS_INPUTS * sizeof(float), 16, + &so->defaults_offset, &so->defaults, (void **)&attrs); + + for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) { + attrs[i * 4 + 0] = 0; + attrs[i * 4 + 1] = 0; + attrs[i * 4 + 2] = 0; + if (i < so->num_elements && + util_format_is_pure_integer(so->pipe[i].src_format)) { + attrs[i * 4 + 3] = 1; + } else { + attrs[i * 4 + 3] = fui(1.0); + } } + } else { + so->defaults = NULL; + so->defaults_offset = 0; } u_upload_unmap(v3d->state_uploader); @@ -699,21 +721,22 @@ v3d_upload_sampler_state_variant(void *map, break; } - if (variant >= V3D_SAMPLER_STATE_32) { - sampler.border_color_word_0 = border.ui[0]; - sampler.border_color_word_1 = border.ui[1]; - sampler.border_color_word_2 = border.ui[2]; - sampler.border_color_word_3 = border.ui[3]; - } else { - sampler.border_color_word_0 = - _mesa_float_to_half(border.f[0]); - sampler.border_color_word_1 = - _mesa_float_to_half(border.f[1]); - sampler.border_color_word_2 = - _mesa_float_to_half(border.f[2]); - sampler.border_color_word_3 = - _mesa_float_to_half(border.f[3]); +#if V3D_VERSION <= 42 + /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions + * for us. In V3D 4.x we need to manually convert floating point color + * values to the expected format. + */ + if (variant < V3D_SAMPLER_STATE_32) { + border.ui[0] = _mesa_float_to_half(border.f[0]); + border.ui[1] = _mesa_float_to_half(border.f[1]); + border.ui[2] = _mesa_float_to_half(border.f[2]); + border.ui[3] = _mesa_float_to_half(border.f[3]); } +#endif + sampler.border_color_word_0 = border.ui[0]; + sampler.border_color_word_1 = border.ui[1]; + sampler.border_color_word_2 = border.ui[2]; + sampler.border_color_word_3 = border.ui[3]; } } } @@ -869,7 +892,8 @@ v3d_setup_texture_shader_state_from_buffer(struct V3DX(TEXTURE_SHADER_STATE) *te } static void -v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex, +v3d_setup_texture_shader_state(const struct v3d_device_info *devinfo, + struct V3DX(TEXTURE_SHADER_STATE) *tex, struct pipe_resource *prsc, int base_level, int last_level, int first_layer, int last_layer, @@ -917,19 +941,29 @@ v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex, } tex->base_level = base_level; + #if V3D_VERSION >= 40 tex->max_level = last_level; /* Note that we don't have a job to reference the texture's sBO * at state create time, so any time this sampler view is used * we need to add the texture to the job. 
*/ - tex->texture_base_pointer = - cl_address(NULL, - rsc->bo->offset + - v3d_layer_offset(prsc, 0, first_layer)); + const uint32_t base_offset = rsc->bo->offset + + v3d_layer_offset(prsc, 0, first_layer); + + tex->texture_base_pointer = cl_address(NULL, base_offset); #endif + tex->array_stride_64_byte_aligned = rsc->cube_map_stride / 64; +#if V3D_VERSION >= 71 + tex->chroma_offset_x = 1; + tex->chroma_offset_y = 1; + /* See comment in XML field definition for rationale of the shifts */ + tex->texture_base_pointer_cb = base_offset >> 6; + tex->texture_base_pointer_cr = base_offset >> 6; +#endif + /* Since other platform devices may produce UIF images even * when they're not big enough for V3D to assume they're UIF, * we force images with level 0 as UIF to be always treated @@ -977,7 +1011,8 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d, v3dx_pack(map, TEXTURE_SHADER_STATE, tex) { if (prsc->target != PIPE_BUFFER) { - v3d_setup_texture_shader_state(&tex, prsc, + v3d_setup_texture_shader_state(&v3d->screen->devinfo, + &tex, prsc, cso->u.tex.first_level, cso->u.tex.last_level, cso->u.tex.first_layer, @@ -990,7 +1025,13 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d, cso->u.buf.size); } - tex.srgb = util_format_is_srgb(cso->format); + bool is_srgb = util_format_is_srgb(cso->format); +#if V3D_VERSION <= 42 + tex.srgb = is_srgb; +#endif +#if V3D_VERSION >= 71 + tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE; +#endif #if V3D_VERSION >= 40 tex.swizzle_r = v3d_translate_pipe_swizzle(so->swizzle[0]); @@ -1040,7 +1081,10 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d, * shader code if we wanted to read an MSAA sRGB * texture without sRGB decode. */ +#if V3D_VERSION <= 42 tex.srgb = false; +#endif + } else { tex.texture_type = v3d_get_tex_format(&screen->devinfo, cso->format); @@ -1404,7 +1448,8 @@ v3d_create_image_view_texture_shader_state(struct v3d_context *v3d, v3dx_pack(map, TEXTURE_SHADER_STATE, tex) { if (prsc->target != PIPE_BUFFER) { - v3d_setup_texture_shader_state(&tex, prsc, + v3d_setup_texture_shader_state(&v3d->screen->devinfo, + &tex, prsc, iview->base.u.tex.level, iview->base.u.tex.level, iview->base.u.tex.first_layer, diff --git a/src/gallium/drivers/v3d/v3dx_tfu.c b/src/gallium/drivers/v3d/v3dx_tfu.c new file mode 100644 index 00000000000..d6b51390a11 --- /dev/null +++ b/src/gallium/drivers/v3d/v3dx_tfu.c @@ -0,0 +1,202 @@ +/* + * Copyright © 2021 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3d_context.h" +#include "broadcom/common/v3d_tfu.h" + +bool +v3dX(tfu)(struct pipe_context *pctx, + struct pipe_resource *pdst, + struct pipe_resource *psrc, + unsigned int src_level, + unsigned int base_level, + unsigned int last_level, + unsigned int src_layer, + unsigned int dst_layer, + bool for_mipmap) +{ + struct v3d_context *v3d = v3d_context(pctx); + struct v3d_screen *screen = v3d->screen; + struct v3d_resource *src = v3d_resource(psrc); + struct v3d_resource *dst = v3d_resource(pdst); + struct v3d_resource_slice *src_base_slice = &src->slices[src_level]; + struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level]; + int msaa_scale = pdst->nr_samples > 1 ? 2 : 1; + int width = u_minify(pdst->width0, base_level) * msaa_scale; + int height = u_minify(pdst->height0, base_level) * msaa_scale; + enum pipe_format pformat; + + if (psrc->format != pdst->format) + return false; + if (psrc->nr_samples != pdst->nr_samples) + return false; + + if (pdst->target != PIPE_TEXTURE_2D || psrc->target != PIPE_TEXTURE_2D) + return false; + + /* Can't write to raster. */ + if (dst_base_slice->tiling == V3D_TILING_RASTER) + return false; + + /* When using TFU for blit, we are doing exact copies (both input and + * output format must be the same, no scaling, etc.), so there is no + * pixel format conversion. Thus we can rewrite the format to use one + * that is TFU compatible based on its texel size. + */ + if (for_mipmap) { + pformat = pdst->format; + } else { + switch (dst->cpp) { + case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT; break; + case 8: pformat = PIPE_FORMAT_R16G16B16A16_FLOAT; break; + case 4: pformat = PIPE_FORMAT_R32_FLOAT; break; + case 2: pformat = PIPE_FORMAT_R16_FLOAT; break; + case 1: pformat = PIPE_FORMAT_R8_UNORM; break; + default: unreachable("unsupported format bit-size"); break; + }; + } + + uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat); + + if (!v3dX(tfu_supports_tex_format)(tex_format, for_mipmap)) { + assert(for_mipmap); + return false; + } + + v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false); + v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false); + + struct drm_v3d_submit_tfu tfu = { + .ios = (height << 16) | width, + .bo_handles = { + dst->bo->handle, + src != dst ?
src->bo->handle : 0 + }, + .in_sync = v3d->out_sync, + .out_sync = v3d->out_sync, + }; + uint32_t src_offset = (src->bo->offset + + v3d_layer_offset(psrc, src_level, src_layer)); + tfu.iia |= src_offset; + + uint32_t dst_offset = (dst->bo->offset + + v3d_layer_offset(pdst, base_level, dst_layer)); + tfu.ioa |= dst_offset; + + switch (src_base_slice->tiling) { + case V3D_TILING_UIF_NO_XOR: + case V3D_TILING_UIF_XOR: + tfu.iis |= (src_base_slice->padded_height / + (2 * v3d_utile_height(src->cpp))); + break; + case V3D_TILING_RASTER: + tfu.iis |= src_base_slice->stride / src->cpp; + break; + case V3D_TILING_LINEARTILE: + case V3D_TILING_UBLINEAR_1_COLUMN: + case V3D_TILING_UBLINEAR_2_COLUMN: + break; + } + +#if V3D_VERSION <= 42 + if (src_base_slice->tiling == V3D_TILING_RASTER) { + tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER << + V3D33_TFU_ICFG_FORMAT_SHIFT); + } else { + tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE + + (src_base_slice->tiling - V3D_TILING_LINEARTILE)) << + V3D33_TFU_ICFG_FORMAT_SHIFT); + } + tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT; + + if (last_level != base_level) + tfu.ioa |= V3D33_TFU_IOA_DIMTW; + + tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE + + (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) << + V3D33_TFU_IOA_FORMAT_SHIFT); + + tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT; + + /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the + * OPAD field for the destination (how many extra UIF blocks beyond + * those necessary to cover the height). When filling mipmaps, the + * miplevel 1+ tiling state is inferred. + */ + if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR || + dst_base_slice->tiling == V3D_TILING_UIF_XOR) { + int uif_block_h = 2 * v3d_utile_height(dst->cpp); + int implicit_padded_height = align(height, uif_block_h); + + tfu.icfg |= (((dst_base_slice->padded_height - + implicit_padded_height) / uif_block_h) << + V3D33_TFU_ICFG_OPAD_SHIFT); + } +#endif /* V3D_VERSION <= 42 */ + +#if V3D_VERSION >= 71 + if (src_base_slice->tiling == V3D_TILING_RASTER) { + tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT; + } else { + tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE + + (src_base_slice->tiling - V3D_TILING_LINEARTILE)) << + V3D71_TFU_ICFG_IFORMAT_SHIFT; + } + tfu.icfg |= tex_format << V3D71_TFU_ICFG_OTYPE_SHIFT; + + if (last_level != base_level) + tfu.v71.ioc |= V3D71_TFU_IOC_DIMTW; + + tfu.v71.ioc |= ((V3D71_TFU_IOC_FORMAT_LINEARTILE + + (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) << + V3D71_TFU_IOC_FORMAT_SHIFT); + + switch (dst_base_slice->tiling) { + case V3D_TILING_UIF_NO_XOR: + case V3D_TILING_UIF_XOR: + tfu.v71.ioc |= + (dst_base_slice->padded_height / (2 * v3d_utile_height(dst->cpp))) << + V3D71_TFU_IOC_STRIDE_SHIFT; + break; + case V3D_TILING_RASTER: + tfu.v71.ioc |= (dst_base_slice->padded_height / dst->cpp) << + V3D71_TFU_IOC_STRIDE_SHIFT; + break; + default: + break; + } + + tfu.v71.ioc |= (last_level - base_level) << V3D71_TFU_IOC_NUMMM_SHIFT; +#endif /* V3D_VERSION >= 71*/ + + int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu); + if (ret != 0) { + fprintf(stderr, "Failed to submit TFU job: %d\n", ret); + return false; + } + + dst->writes++; + + return true; +} + diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h index 1c3f77f6588..9bdefb55194 100644 --- a/src/gallium/include/pipe/p_state.h +++ b/src/gallium/include/pipe/p_state.h @@ -610,6 +610,10 @@ struct pipe_resource unsigned bind; /**< bitmask of PIPE_BIND_x */ unsigned 
flags; /**< bitmask of PIPE_RESOURCE_FLAG_x */ + /* Hack for avoiding sync on v3d */ + unsigned sync_condition; + unsigned sync_status; + /** * For planar images, ie. YUV EGLImage external, etc, pointer to the * next plane. diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c index 32135770e9d..2534c817dcc 100644 --- a/src/loader/loader_dri3_helper.c +++ b/src/loader/loader_dri3_helper.c @@ -275,7 +275,7 @@ dri3_update_max_num_back(struct loader_dri3_drawable *draw) if (draw->swap_interval == 0) draw->max_num_back = 4; else - draw->max_num_back = 3; + draw->max_num_back = 2; assert(draw->max_num_back <= LOADER_DRI3_MAX_BACK); break; diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 77c38bf48d5..1eb2dac8018 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -1058,6 +1058,9 @@ struct gl_texture_object * the pipe_resource *pt above. */ bool needs_validation; + + /* Hack for avoiding sync on v3d */ + GLboolean SyncCondition; }; diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c index d8fb1ed4317..048deaa02f6 100644 --- a/src/mesa/main/texparam.c +++ b/src/mesa/main/texparam.c @@ -273,6 +273,13 @@ set_tex_parameteri(struct gl_context *ctx, } switch (pname) { + case GL_SYNC_CONDITION: + if (!!texObj->SyncCondition == !!params[0]) + return GL_FALSE; + texObj->SyncCondition = !!params[0]; + return GL_TRUE; + case GL_SYNC_STATUS: + return GL_TRUE; case GL_TEXTURE_MIN_FILTER: if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_dsa; @@ -930,6 +937,17 @@ _mesa_texture_parameter_invalidate(struct gl_context *ctx, { if (texparam_invalidates_sampler_views(pname)) st_texture_release_all_sampler_views(st_context(ctx), texObj); + + switch (pname) { + case GL_SYNC_CONDITION: + texObj->pt->sync_condition = texObj->SyncCondition; + break; + case GL_SYNC_STATUS: + texObj->pt->sync_status = 1; + break; + default: + ; /* nothing */ + } } void diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf index 24cc2888755..2bc2748e7fe 100644 --- a/src/util/00-mesa-defaults.conf +++ b/src/util/00-mesa-defaults.conf @@ -77,6 +77,7 @@ TODO: document the other workarounds. @@ -750,6 +751,7 @@ TODO: document the other workarounds. diff --git a/src/util/driconf.h b/src/util/driconf.h index ab7aa2c6553..70fa9f7b41b 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -517,6 +517,14 @@ DRI_CONF_OPT_B(v3d_nonmsaa_texture_size_limit, def, \ "Report the non-MSAA-only texture size limit") +#define DRI_CONF_V3D_IS_XSERVER_PROCESS(def) \ + DRI_CONF_OPT_B(v3d_is_xserver_process, def, \ + "Identifies if the application is the Xserver.") + +#define DRI_CONF_V3D_MAINTAIN_IGNORABLE_SCANOUT(def) \ + DRI_CONF_OPT_B(v3d_maintain_ignorable_scanout, def, \ + "Maintain SCANOUT usage on resource allocations when the environment allows ignoring SCANOUT usage.") + /** * \brief virgl specific configuration options */
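
Editorial note on the TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1/2/3 hunk in v3dx_rcl.c above: the per-render-target 128-bit clear color is split into a 32-bit low word, a 40-bit middle field and a 56-bit top field. The following is a minimal illustrative sketch of that slicing, assuming the clear color arrives as four 32-bit words as in job->clear_color; the struct and function names here are hypothetical and not part of the patch.

#include <stdint.h>

/* Illustrative only: mirrors the bit slicing used when emitting the
 * PART1/PART2/PART3 render-target config packets on V3D 7.1.
 */
struct v71_clear_color_fields {
        uint32_t low_32;   /* PART1: bits   0..31            */
        uint64_t mid_40;   /* PART2: bits  32..71  (40 bits) */
        uint64_t top_56;   /* PART3: bits  72..127 (56 bits) */
};

static struct v71_clear_color_fields
split_clear_color(const uint32_t color[4])
{
        struct v71_clear_color_fields f;

        f.low_32 = color[0];

        /* 40 bits: all of word 1 plus the low byte of word 2. */
        f.mid_40 = (uint64_t)color[1] |
                   ((uint64_t)(color[2] & 0xff) << 32);

        /* 56 bits: the remaining 24 bits of word 2 plus all of word 3. */
        f.top_56 = ((uint64_t)(color[2] & 0xffffff00) >> 8) |
                   ((uint64_t)color[3] << 24);

        return f;
}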
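Similarly, the v3d_launch_grid change above encodes a small hardware quirk: before V3D 7.1.6 the CSD CFG[4] field takes the number of batches minus one, while newer revisions take the raw count. A hedged sketch of that selection logic, with a hypothetical helper name (not part of the patch):

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical helper mirroring the version/revision check in
 * v3d_launch_grid(): V3D 7.1.6 dropped the "minus one" encoding of the
 * batch count in CSD CFG[4].
 */
static uint32_t
csd_cfg4_num_batches(uint8_t ver, uint8_t rev, uint32_t num_batches)
{
        bool subtract_one = ver < 71 || (ver == 71 && rev < 6);

        return subtract_one ? num_batches - 1 : num_batches;
}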
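Finally, both the binning and rendering mode config packets on V3D 7.1 carry log2 tile dimensions, and the asserts added in v3dx_draw.c and v3dx_rcl.c encode the constraint that a tile is either square or twice as wide as it is tall. A small standalone check expressing the same constraint, purely illustrative:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: the relationship asserted after filling in
 * log2_tile_width/log2_tile_height in the config packets above.
 */
static bool
v71_tile_geometry_is_valid(uint32_t log2_tile_width, uint32_t log2_tile_height)
{
        return log2_tile_width == log2_tile_height ||
               log2_tile_width == log2_tile_height + 1;
}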