nixfiles/hosts/raspberry-pi5/profiles/rbp2-001-add-raspberrypi5-support.patch

diff --git a/include/drm-uapi/v3d_drm.h b/include/drm-uapi/v3d_drm.h
index 3dfc0af8756..1a7d7a689de 100644
--- a/include/drm-uapi/v3d_drm.h
+++ b/include/drm-uapi/v3d_drm.h
@@ -319,6 +319,11 @@ struct drm_v3d_submit_tfu {
/* Pointer to an array of ioctl extensions*/
__u64 extensions;
+
+ struct {
+ __u32 ioc;
+ __u32 pad;
+ } v71;
};
/* Submits a compute shader for dispatch. This job will block on any
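The new v71 block in drm_v3d_submit_tfu appears to carry the extra TFU IOC configuration word that V3D 7.x introduces. A minimal sketch (not part of the patch; devinfo and ioc are hypothetical) of how a user-space driver might fill it when submitting a TFU job:

    struct drm_v3d_submit_tfu args;
    memset(&args, 0, sizeof(args));
    /* ... fill the pre-existing TFU registers as on 4.x ... */
    if (devinfo->ver >= 71)
            args.v71.ioc = ioc;  /* composed from the V3D71_TFU_IOC_* defines added later in this patch */
    drmIoctl(fd, DRM_IOCTL_V3D_SUBMIT_TFU, &args);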
diff --git a/src/broadcom/cle/meson.build b/src/broadcom/cle/meson.build
index 31a0d5bfa94..8ac32b313e4 100644
--- a/src/broadcom/cle/meson.build
+++ b/src/broadcom/cle/meson.build
@@ -23,7 +23,8 @@ v3d_versions = [
[21, 21],
[33, 33],
[41, 33],
- [42, 33]
+ [42, 33],
+ [71, 33]
]
v3d_xml_files = []
diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml
index a0242b5f1c2..624353ca2bf 100644
--- a/src/broadcom/cle/v3d_packet_v33.xml
+++ b/src/broadcom/cle/v3d_packet_v33.xml
@@ -1,4 +1,4 @@
-<vcxml gen="3.3" min_ver="33" max_ver="42">
+<vcxml gen="3.3" min_ver="33" max_ver="71">
<enum name="Compare Function" prefix="V3D_COMPARE_FUNC">
<value name="NEVER" value="0"/>
@@ -167,13 +167,36 @@
<value name="depth_16" value="2"/>
</enum>
- <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41">
+ <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41" max_ver="42">
<value name="none" value="0"/> <!-- no clamping -->
<value name="norm" value="1"/> <!-- [0,1] for f16 -->
<value name="pos" value="2"/> <!-- [0, for f16 -->
<value name="int" value="3" min_ver="42"/> <!-- clamp to integer RT's range -->
</enum>
+ <enum name="Render Target Type Clamp" prefix="V3D_RENDER_TARGET_TYPE_CLAMP" min_ver="71">
+ <value name="8i" value="0"/> <!-- no clamping -->
+ <value name="16i" value="1"/> <!-- no clamping -->
+ <value name="32i" value="2"/> <!-- no clamping -->
+ <value name="8ui" value="4"/> <!-- no clamping -->
+ <value name="16ui" value="5"/> <!-- no clamping -->
+ <value name="32ui" value="6"/> <!-- no clamping -->
+ <value name="8" value="8"/> <!-- no clamping -->
+ <value name="16f" value="9"/> <!-- no clamping -->
+ <value name="32f" value="10"/> <!-- no clamping -->
+ <value name="8i_clamped" value="16"/> <!-- clamp to integer RT's range -->
+ <value name="16i_clamped" value="17"/> <!-- clamp to integer RT's range -->
+ <value name="32i_clamped" value="18"/> <!-- clamp to integer RT's range -->
+ <value name="8ui_clamped" value="20"/> <!-- clamp to integer RT's range -->
+ <value name="16ui_clamped" value="21"/> <!-- clamp to integer RT's range -->
+ <value name="32ui_clamped" value="22"/> <!-- clamp to integer RT's range -->
+ <value name="16f_clamp_norm" value="24"/> <!-- [0,1] for f16 -->
+ <value name="16f_clamp_pos" value="25"/> <!-- [0, for f16 -->
+ <value name="16f_clamp_pq" value="26"/> <!-- PQ lin range, colour to [0, 125], alpha to [0, 1] for f16 -->
+ <value name="16f_clamp_hlg" value="27"/> <!-- HLG lin range, colour to [0, 12], alpha to [0, 1] for f16 -->
+ <value name="invalid" value="32"/>
+ </enum>
+
<!---
CL cache flush commands are not fully documented and subject to a
number of hardware issues that make them unreliable. Specifically:
@@ -263,13 +286,27 @@
<value name="r8ui" value="36"/>
<value name="srgbx8" value="37" max_ver="33"/>
<value name="rgbx8" value="38" max_ver="33"/>
- <value name="bstc" value="39" min_ver="41"/>
+ <value name="bstc8" value="39" min_ver="41"/>
<value name="d32f" value="40" min_ver="41"/>
<value name="d24" value="41" min_ver="41"/>
<value name="d16" value="42" min_ver="41"/>
<value name="d24s8" value="43" min_ver="41"/>
<value name="s8" value="44" min_ver="41"/>
<value name="rgba5551" value="45" min_ver="41"/>
+ <value name="bstc8_srgb" value="46" min_ver="71"/>
+ <value name="bstc10" value="47" min_ver="71"/>
+ <value name="bstc10_srgb" value="48" min_ver="71"/>
+ <value name="bstc10_pq" value="49" min_ver="71"/>
+ <value name="rgba10x6" value="50" min_ver="71"/>
+ <value name="bstc10_hlg" value="55" min_ver="71"/>
+ <value name="rgba10x6_hlg" value="56" min_ver="71"/>
+ <value name="rgb10_a2_hlg" value="57" min_ver="71"/>
+ <value name="bstc10_pq_bt1886" value="58" min_ver="71"/>
+ <value name="rgba10x6_pq_bt1886" value="59" min_ver="71"/>
+ <value name="rgb10_a2_pq_bt1886" value="60" min_ver="71"/>
+ <value name="bstc10_hlg_bt1886" value="61" min_ver="71"/>
+ <value name="rgba10x6_hlg_bt1886" value="62" min_ver="71"/>
+ <value name="rgb10_a2_hlg_bt1886" value="63" min_ver="71"/>
</enum>
<enum name="Z/S Output Image Format" prefix="V3D_OUTPUT_IMAGE_FORMAT_ZS" max_ver="33">
@@ -314,6 +351,12 @@
<value name="perp end caps" value="1"/>
</enum>
+ <enum name="Z Clip Mode" prefix="V3D_Z_CLIP_MODE">
+ <value name="NONE" value="0"/>
+ <value name="MIN_ONE_TO_ONE" value="1"/>
+ <value name="ZERO_TO_ONE" value="2"/>
+ </enum>
+
<packet code="0" name="Halt"/>
<packet code="1" name="NOP"/>
<packet code="4" name="Flush"/>
@@ -381,11 +424,13 @@
<field name="Last Tile of Frame" size="1" start="0" type="bool"/>
</packet>
- <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41">
+ <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41" max_ver="42">
<field name="Clear Z/Stencil Buffer" size="1" start="1" type="bool"/>
<field name="Clear all Render Targets" size="1" start="0" type="bool"/>
</packet>
+ <packet code="25" shortname="clear_rt" name="Clear Render Targets" cl="R" min_ver="71"/>
+
<packet code="26" shortname="load" name="Reload Tile Color Buffer" cl="R" max_ver="33">
<field name="Disable Color Buffer load" size="8" start="8" type="uint"/>
<field name="Enable Z load" size="1" start="7" type="bool"/>
@@ -443,6 +488,10 @@
<value name="Render target 1" value="1"/>
<value name="Render target 2" value="2"/>
<value name="Render target 3" value="3"/>
+ <value name="Render target 4" value="4" min_ver="71"/>
+ <value name="Render target 5" value="5" min_ver="71"/>
+ <value name="Render target 6" value="6" min_ver="71"/>
+ <value name="Render target 7" value="7" min_ver="71"/>
<value name="None" value="8"/>
<value name="Z" value="9"/>
<value name="Stencil" value="10"/>
@@ -789,7 +838,7 @@
<field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
</packet>
- <packet code="84" name="Blend Cfg" min_ver="41">
+ <packet code="84" name="Blend Cfg" min_ver="41" max_ver="42">
<field name="Render Target Mask" size="4" start="24" type="uint"/>
<field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
<field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
@@ -799,6 +848,16 @@
<field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
</packet>
+ <packet code="84" name="Blend Cfg" min_ver="71">
+ <field name="Render Target Mask" size="8" start="24" type="uint"/>
+ <field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
+ <field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
+ <field name="Color blend mode" size="4" start="12" type="Blend Mode"/>
+ <field name="Alpha blend dst factor" size="4" start="8" type="Blend Factor"/>
+ <field name="Alpha blend src factor" size="4" start="4" type="Blend Factor"/>
+ <field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
+ </packet>
+
<packet code="86" shortname="blend_ccolor" name="Blend Constant Color">
<field name="Alpha (F16)" size="16" start="48" type="uint"/>
<field name="Blue (F16)" size="16" start="32" type="uint"/>
@@ -828,7 +887,12 @@
<field name="address" size="32" start="0" type="address"/>
</packet>
- <packet code="96" name="Cfg Bits">
+ <packet code="93" name="Depth Bounds Test Limits" min_ver="71">
+ <field name="Lower Test Limit" size="32" start="0" type="float"/>
+ <field name="Upper Test Limit" size="32" start="32" type="float"/>
+ </packet>
+
+ <packet code="96" name="Cfg Bits" max_ver="42">
<field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
<field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
<field name="Blend enable" size="1" start="19" type="bool"/>
@@ -846,6 +910,25 @@
<field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
</packet>
+ <packet code="96" name="Cfg Bits" min_ver="71">
+ <field name="Z Clipping mode" size="2" start="22" type="Z Clip Mode"/>
+ <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
+ <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
+ <field name="Blend enable" size="1" start="19" type="bool"/>
+ <field name="Stencil enable" size="1" start="18" type="bool"/>
+ <field name="Z updates enable" size="1" start="15" type="bool"/>
+ <field name="Depth-Test Function" size="3" start="12" type="Compare Function"/>
+ <field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/>
+ <field name="Z Clamp Mode" size="1" start="10" type="bool"/>
+ <field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/>
+ <field name="Depth Bounds Test Enable" size="1" start="5" type="bool"/>
+ <field name="Line Rasterization" size="1" start="4" type="uint"/>
+ <field name="Enable Depth Offset" size="1" start="3" type="bool"/>
+ <field name="Clockwise Primitives" size="1" start="2" type="bool"/>
+ <field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/>
+ <field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
+ </packet>
+
<packet code="97" shortname="zero_all_flatshade_flags" name="Zero All Flat Shade Flags"/>
<packet code="98" shortname="flatshade_flags" name="Flat Shade Flags">
@@ -907,16 +990,26 @@
<field name="Minimum Zw" size="32" start="0" type="float"/>
</packet>
- <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B">
+ <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" max_ver="42">
<field name="Viewport Half-Height in 1/256th of pixel" size="32" start="32" type="float"/>
<field name="Viewport Half-Width in 1/256th of pixel" size="32" start="0" type="float"/>
</packet>
+ <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" min_ver="71">
+ <field name="Viewport Half-Height in 1/64th of pixel" size="32" start="32" type="float"/>
+ <field name="Viewport Half-Width in 1/64th of pixel" size="32" start="0" type="float"/>
+ </packet>
+
<packet shortname="clipper_z" name="Clipper Z Scale and Offset" code="111" cl="B">
<field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
<field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
</packet>
+ <packet shortname="clipper_z_no_guardband" name="Clipper Z Scale and Offset no guardband" code="112" cl="B" min_ver="71">
+ <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
+ <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
+ </packet>
+
<packet name="Number of Layers" code="119" min_ver="41">
<field name="Number of Layers" size="8" start="0" type="uint" minus_one="true"/>
</packet>
@@ -947,7 +1040,7 @@
<field name="sub-id" size="1" start="0" type="uint" default="0"/>
</packet>
- <packet code="120" name="Tile Binning Mode Cfg" min_ver="41">
+ <packet code="120" name="Tile Binning Mode Cfg" min_ver="41" max_ver="42">
<field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/>
<field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/>
@@ -971,6 +1064,35 @@
</field>
</packet>
+ <packet code="120" name="Tile Binning Mode Cfg" min_ver="71">
+ <field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/>
+ <field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/>
+
+ <field name="Log2 Tile Height" size="3" start="11" type="uint">
+ <value name="tile height 8 pixels" value="0"/>
+ <value name="tile height 16 pixels" value="1"/>
+ <value name="tile height 32 pixels" value="2"/>
+ <value name="tile height 64 pixels" value="3"/>
+ </field>
+ <field name="Log2 Tile Width" size="3" start="8" type="uint">
+ <value name="tile width 8 pixels" value="0"/>
+ <value name="tile width 16 pixels" value="1"/>
+ <value name="tile width 32 pixels" value="2"/>
+ <value name="tile width 64 pixels" value="3"/>
+ </field>
+
+ <field name="tile allocation block size" size="2" start="4" type="uint">
+ <value name="tile allocation block size 64b" value="0"/>
+ <value name="tile allocation block size 128b" value="1"/>
+ <value name="tile allocation block size 256b" value="2"/>
+ </field>
+ <field name="tile allocation initial block size" size="2" start="2" type="uint">
+ <value name="tile allocation initial block size 64b" value="0"/>
+ <value name="tile allocation initial block size 128b" value="1"/>
+ <value name="tile allocation initial block size 256b" value="2"/>
+ </field>
+ </packet>
+
<packet code="120" name="Tile Binning Mode Cfg (Part2)" cl="B" max_ver="33">
<field name="Tile Allocation Memory Address" size="32" start="32" type="address"/>
<field name="Tile Allocation Memory Size" size="32" start="0" type="uint"/>
@@ -1002,7 +1124,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="0"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41" max_ver="42">
<field name="Pad" size="12" start="52" type="uint"/>
<field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
@@ -1018,7 +1140,11 @@
<field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
<field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
- <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/>
+ <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP">
+ <value name="Render target maximum 32bpp" value="0"/>
+ <value name="Render target maximum 64bpp" value="1"/>
+ <value name="Render target maximum 128bpp" value="2"/>
+ </field>
<field name="Image Height (pixels)" size="16" start="24" type="uint"/>
<field name="Image Width (pixels)" size="16" start="8" type="uint"/>
@@ -1027,6 +1153,43 @@
<field name="sub-id" size="4" start="0" type="uint" default="0"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="71">
+ <field name="Pad" size="6" start="58" type="uint"/>
+
+ <field name="Log2 Tile Height" size="3" start="55" type="uint">
+ <value name="tile height 8 pixels" value="0"/>
+ <value name="tile height 16 pixels" value="1"/>
+ <value name="tile height 32 pixels" value="2"/>
+ <value name="tile height 64 pixels" value="3"/>
+ </field>
+ <field name="Log2 Tile Width" size="3" start="52" type="uint">
+ <value name="tile width 8 pixels" value="0"/>
+ <value name="tile width 16 pixels" value="1"/>
+ <value name="tile width 32 pixels" value="2"/>
+ <value name="tile width 64 pixels" value="3"/>
+ </field>
+
+ <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
+ <field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/>
+
+ <field name="Early-Z disable" size="1" start="46" type="bool"/>
+
+ <field name="Early-Z Test and Update Direction" size="1" start="45" type="uint">
+ <value name="Early-Z direction LT/LE" value="0"/>
+ <value name="Early-Z direction GT/GE" value="1"/>
+ </field>
+
+ <field name="Depth-buffer disable" size="1" start="44" type="bool"/>
+ <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
+ <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
+
+ <field name="Image Height (pixels)" size="16" start="24" type="uint"/>
+ <field name="Image Width (pixels)" size="16" start="8" type="uint"/>
+ <field name="Number of Render Targets" size="4" start="4" type="uint" minus_one="true"/>
+
+ <field name="sub-id" size="3" start="0" type="uint" default="0"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" max_ver="33">
<field name="Address" size="32" start="32" type="address"/>
@@ -1048,7 +1211,8 @@
<field name="sub-id" size="4" start="0" type="uint" default="2"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41">
+ <!-- On 4.1 the real name would be "Tile Rendering Mode Cfg (Render Target Configs)" -->
+ <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41" max_ver="42">
<field name="Pad" size="28" start="36" type="uint"/>
@@ -1099,7 +1263,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="3"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41" max_ver="42">
<field name="unused" size="16" start="48" type="uint"/>
<field name="Z Clear Value" size="32" start="16" type="float"/>
@@ -1108,6 +1272,15 @@
<field name="sub-id" size="4" start="0" type="uint" default="2"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="71">
+ <field name="unused" size="16" start="48" type="uint"/>
+
+ <field name="Z Clear Value" size="32" start="16" type="float"/>
+
+ <field name="Stencil Clear Value" size="8" start="8" type="uint"/>
+ <field name="sub-id" size="4" start="0" type="uint" default="1"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" max_ver="33">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
@@ -1117,7 +1290,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="4"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41" max_ver="42">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
<field name="Clear Color low 32 bits" size="32" start="8" type="uint"/>
@@ -1126,6 +1299,19 @@
<field name="sub-id" size="4" start="0" type="uint" default="3"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part1)" cl="R" min_ver="71">
+
+ <field name="Clear Color low bits" size="32" start="32" type="uint"/>
+ <field name="Internal Type and Clamping" size="5" start="27" type="Render Target Type Clamp"/>
+ <field name="Internal BPP" size="2" start="25" type="Internal BPP"/>
+
+ <field name="Stride" size="7" start="18" type="uint" minus_one="true"/>
+ <!-- In multiples of 512 bits -->
+ <field name="Base Address" size="11" start="7" type="uint"/>
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="2"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" max_ver="33">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
@@ -1135,7 +1321,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="5"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41" max_ver="42">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
<field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/>
@@ -1144,6 +1330,13 @@
<field name="sub-id" size="4" start="0" type="uint" default="4"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part2)" cl="R" min_ver="71">
+ <field name="Clear Color mid bits" size="40" start="24" type="uint"/>
+
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="3"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" max_ver="33">
<field name="pad" size="11" start="53" type="uint"/>
<field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
@@ -1155,7 +1348,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="6"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41" max_ver="42">
<field name="pad" size="11" start="53" type="uint"/>
<field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
<!-- image height is for Y flipping -->
@@ -1166,6 +1359,13 @@
<field name="sub-id" size="4" start="0" type="uint" default="5"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part3)" cl="R" min_ver="71">
+ <field name="Clear Color top bits" size="56" start="8" type="uint"/>
+
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="4"/>
+ </packet>
+
<packet code="124" shortname="tile_coords" name="Tile Coordinates">
<field name="tile row number" size="12" start="12" type="uint"/>
<field name="tile column number" size="12" start="0" type="uint"/>
@@ -1240,7 +1440,7 @@
<field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
</struct>
- <struct name="GL Shader State Record" min_ver="41">
+ <struct name="GL Shader State Record" min_ver="41" max_ver="42">
<field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
<field name="Enable clipping" size="1" start="1" type="bool"/>
@@ -1299,6 +1499,63 @@
<field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
</struct>
+ <struct name="GL Shader State Record" min_ver="71">
+ <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
+ <field name="Enable clipping" size="1" start="1" type="bool"/>
+
+ <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/>
+ <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/>
+ <field name="Base Instance ID read by coordinate shader" size="1" start="4" type="bool"/>
+ <field name="Vertex ID read by vertex shader" size="1" start="5" type="bool"/>
+ <field name="Instance ID read by vertex shader" size="1" start="6" type="bool"/>
+ <field name="Base Instance ID read by vertex shader" size="1" start="7" type="bool"/>
+
+ <field name="Fragment shader does Z writes" size="1" start="8" type="bool"/>
+ <field name="Turn off early-z test" size="1" start="9" type="bool"/>
+
+ <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="12" type="bool"/>
+ <field name="Enable Sample Rate Shading" size="1" start="13" type="bool"/>
+ <field name="Any shader reads hardware-written Primitive ID" size="1" start="14" type="bool"/>
+ <field name="Insert Primitive ID as first varying to fragment shader" size="1" start="15" type="bool"/>
+ <field name="Turn off scoreboard" size="1" start="16" type="bool"/>
+ <field name="Do scoreboard wait on first thread switch" size="1" start="17" type="bool"/>
+ <field name="Disable implicit point/line varyings" size="1" start="18" type="bool"/>
+ <field name="No prim pack" size="1" start="19" type="bool"/>
+ <field name="Never defer FEP depth writes" size="1" start="20" type="bool"/>
+
+ <field name="Number of varyings in Fragment Shader" size="8" start="3b" type="uint"/>
+
+ <field name="Coordinate Shader output VPM segment size" size="4" start="4b" type="uint"/>
+ <field name="Min Coord Shader output segments required in play in addition to VCM cache size" size="4" start="36" type="uint"/>
+
+ <field name="Coordinate Shader input VPM segment size" size="4" start="5b" type="uint"/>
+ <field name="Min Coord Shader input segments required in play" size="4" start="44" type="uint" minus_one="true"/>
+
+ <field name="Vertex Shader output VPM segment size" size="4" start="6b" type="uint"/>
+ <field name="Min Vertex Shader output segments required in play in addition to VCM cache size" size="4" start="52" type="uint"/>
+
+ <field name="Vertex Shader input VPM segment size" size="4" start="7b" type="uint"/>
+ <field name="Min Vertex Shader input segments required in play" size="4" start="60" type="uint" minus_one="true"/>
+
+ <field name="Fragment Shader Code Address" size="29" start="67" type="address"/>
+ <field name="Fragment Shader 4-way threadable" size="1" start="64" type="bool"/>
+ <field name="Fragment Shader start in final thread section" size="1" start="65" type="bool"/>
+ <field name="Fragment Shader Propagate NaNs" size="1" start="66" type="bool"/>
+ <field name="Fragment Shader Uniforms Address" size="32" start="12b" type="address"/>
+
+ <field name="Vertex Shader Code Address" size="29" start="131" type="address"/>
+ <field name="Vertex Shader 4-way threadable" size="1" start="128" type="bool"/>
+ <field name="Vertex Shader start in final thread section" size="1" start="129" type="bool"/>
+ <field name="Vertex Shader Propagate NaNs" size="1" start="130" type="bool"/>
+ <field name="Vertex Shader Uniforms Address" size="32" start="20b" type="address"/>
+
+ <field name="Coordinate Shader Code Address" size="29" start="195" type="address"/>
+ <field name="Coordinate Shader 4-way threadable" size="1" start="192" type="bool"/>
+ <field name="Coordinate Shader start in final thread section" size="1" start="193" type="bool"/>
+ <field name="Coordinate Shader Propagate NaNs" size="1" start="194" type="bool"/>
+ <field name="Coordinate Shader Uniforms Address" size="32" start="28b" type="address"/>
+ </struct>
+
<struct name="Geometry Shader State Record" min_ver="41">
<field name="Geometry Bin Mode Shader Code Address" size="29" start="3" type="address"/>
<field name="Geometry Bin Mode Shader 4-way threadable" size="1" start="0" type="bool"/>
@@ -1543,7 +1800,7 @@
<field name="Offset Format 8" size="1" start="0" type="bool"/>
</struct>
- <struct name="TMU Config Parameter 2" min_ver="42">
+ <struct name="TMU Config Parameter 2" min_ver="42" max_ver="42">
<field name="Pad" size="7" start="25" type="uint"/>
<field name="LOD Query" size="1" start="24" type="bool"/>
<field name="Op" size="4" start="20" type="TMU Op"/>
@@ -1558,6 +1815,23 @@
<field name="Offset Format 8" size="1" start="0" type="bool"/>
</struct>
+ <struct name="TMU Config Parameter 2" min_ver="71">
+ <field name="Pad" size="5" start="27" type="uint"/>
+ <field name="Write conversion" size="1" start="26" type="bool"/>
+ <field name="DIM query" size="1" start="25" type="bool"/>
+ <field name="LOD Query" size="1" start="24" type="bool"/>
+ <field name="Op" size="4" start="20" type="TMU Op"/>
+ <field name="Offset R" size="4" start="16" type="int"/>
+ <field name="Offset T" size="4" start="12" type="int"/>
+ <field name="Offset S" size="4" start="8" type="int"/>
+ <field name="Gather Mode" size="1" start="7" type="bool"/>
+ <field name="Gather Component" size="2" start="5" type="uint"/>
+ <field name="Coefficient Mode" size="1" start="4" type="bool"/>
+ <field name="Sample Number" size="2" start="2" type="uint"/>
+ <field name="Disable AutoLOD" size="1" start="1" type="bool"/>
+ <field name="Offset Format 8" size="1" start="0" type="bool"/>
+ </struct>
+
<struct name="Texture Shader State" max_ver="33">
<field name="UIF XOR disable" size="1" start="255" type="bool"/>
<field name="Level 0 is strictly UIF" size="1" start="254" type="bool"/>
@@ -1611,7 +1885,7 @@
<field name="Filter" size="4" start="0" type="TMU Filter"/>
</struct>
- <struct name="Texture Shader State" min_ver="41">
+ <struct name="Texture Shader State" min_ver="41" max_ver="42">
<field name="Pad" size="56" start="136" type="uint"/>
<field name="UIF XOR disable" size="1" start="135" type="bool"/>
<field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
@@ -1652,6 +1926,82 @@
<field name="Flip texture X Axis" size="1" start="0" type="bool"/>
</struct>
+ <struct name="Texture Shader State" min_ver="71">
+ <field name="Pad" size="2" start="190" type="uint"/>
+ <!-- When we use an address type, there is an implicit requirement
+ that the address is a 32-bit that is encoded starting at a 32-bit
+ aligned bit offset into the packet. If the address field has less than
+ 32 bits, it is assumed that the address is aligned. For example, a
+ 26-bit address field is expected to be 64-byte aligned (6 lsb bits
+ are 0) and that this will be encoded into a packet starting at bit
+ offset 6 into a 32-bit dword (since bits 0..5 of the address are
+ implicitly 0 and don't need to be explicitly encoded).
+
+ Unfortunately, the CB address below doesn't match this requirement:
+ it starts at bit 138, which is 10 bits into a 32-bit dword, but it
+ represents a 64-bit aligned address (6 lsb bits are 0), so we cannot
+ encode it as an address type. To fix this we encode these addresses
+ as uint types which has two implications:
+ 1. the driver is responsible for manually adding the buffer objects
+ for these addresses to the job BO list.
+ 2. the driver needs to pass an actual 26-bit address value by manually
+ shifting out the 6 lsb bits (which are implicitly 0).
+ -->
+ <field name="texture_base pointer_Cr" size="26" start="164" type="uint"/>
+ <field name="texture base pointer Cb" size="26" start="138" type="uint"/>
+ <field name="Chroma offset y" size="1" start="137" type="uint"/>
+ <field name="Chroma offset x" size="1" start="136" type="uint"/>
+
+ <field name="UIF XOR disable" size="1" start="135" type="bool"/>
+ <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
+ <field name="Level 0 XOR enable" size="1" start="132" type="bool"/>
+ <field name="Level 0 UB_PAD" size="4" start="128" type="uint"/>
+
+ <field name="Base Level" size="4" start="124" type="uint"/>
+ <field name="Max Level" size="4" start="120" type="uint"/>
+
+ <field name="Swizzle A" size="3" start="117" type="uint">
+ <value name="Swizzle Zero" value="0"/>
+ <value name="Swizzle One" value="1"/>
+ <value name="Swizzle Red" value="2"/>
+ <value name="Swizzle Green" value="3"/>
+ <value name="Swizzle Blue" value="4"/>
+ <value name="Swizzle Alpha" value="5"/>
+ </field>
+
+ <field name="Swizzle B" size="3" start="114" type="uint"/>
+ <field name="Swizzle G" size="3" start="111" type="uint"/>
+ <field name="Swizzle R" size="3" start="108" type="uint"/>
+ <field name="Extended" size="1" start="107" type="bool"/>
+
+ <field name="Texture type" size="7" start="100" type="uint"/>
+ <field name="Image Depth" size="14" start="86" type="uint"/>
+ <field name="Image Height" size="14" start="72" type="uint"/>
+ <field name="Image Width" size="14" start="58" type="uint"/>
+
+ <!-- V3D 7.1.2 doesn't have the RB swap bit and has Array Stride starting
+ at bit 32. However, 7.1.5 included the RB swap bit at bit 32 and has
+ Array Stride starting at 33, which is backwards incompatible.
+ We use the definition from 7.1.5.
+ -->
+ <field name="Array Stride (64-byte aligned)" size="24" start="33" type="uint"/>
+ <field name="R/B swap" size="1" start="32" type="bool"/>
+
+ <field name="Texture base pointer" size="32" start="0" type="address"/>
+
+ <field name="Reverse" size="1" start="5" type="bool"/>
+ <field name="Transfer func" size="3" start="2" type="uint">
+ <value name="Transfer Func None" value="0"/>
+ <value name="Transfer Func sRGB" value="1"/>
+ <value name="Transfer Func PQ" value="2"/>
+ <value name="Transfer Func HLG" value="3"/>
+ <value name="Transfer Func PQ BT1886" value="4"/>
+ <value name="Transfer Func HLG BT1886" value="5"/>
+ </field>
+ <field name="Flip texture Y Axis" size="1" start="1" type="bool"/>
+ <field name="Flip texture X Axis" size="1" start="0" type="bool"/>
+ </struct>
+
<struct name="Sampler State" min_ver="41">
<field name="Border color word 3" size="32" start="160" type="uint"/>
<field name="Border color word 2" size="32" start="128" type="uint"/>
diff --git a/src/broadcom/cle/v3dx_pack.h b/src/broadcom/cle/v3dx_pack.h
index 5762e5aaa70..e5a1eb26698 100644
--- a/src/broadcom/cle/v3dx_pack.h
+++ b/src/broadcom/cle/v3dx_pack.h
@@ -37,6 +37,8 @@
# include "cle/v3d_packet_v41_pack.h"
#elif (V3D_VERSION == 42)
# include "cle/v3d_packet_v42_pack.h"
+#elif (V3D_VERSION == 71)
+# include "cle/v3d_packet_v71_pack.h"
#else
# error "Need to add a pack header include for this v3d version"
#endif
diff --git a/src/broadcom/clif/clif_private.h b/src/broadcom/clif/clif_private.h
index 6ace62b0310..cda407a00bf 100644
--- a/src/broadcom/clif/clif_private.h
+++ b/src/broadcom/clif/clif_private.h
@@ -101,6 +101,8 @@ bool v3d41_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
const uint8_t *cl, uint32_t *size, bool reloc_mode);
bool v3d42_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
const uint8_t *cl, uint32_t *size, bool reloc_mode);
+bool v3d71_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
+ const uint8_t *cl, uint32_t *size, bool reloc_mode);
static inline void
out(struct clif_dump *clif, const char *fmt, ...)
diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c
index 272190eb2e5..7bc2b662cfc 100644
--- a/src/broadcom/common/v3d_device_info.c
+++ b/src/broadcom/common/v3d_device_info.c
@@ -36,6 +36,9 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
struct drm_v3d_get_param ident1 = {
.param = DRM_V3D_PARAM_V3D_CORE0_IDENT1,
};
+ struct drm_v3d_get_param hub_ident3 = {
+ .param = DRM_V3D_PARAM_V3D_HUB_IDENT3,
+ };
int ret;
ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0);
@@ -62,10 +65,13 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
int qups = (ident1.value >> 8) & 0xf;
devinfo->qpu_count = nslc * qups;
+ devinfo->has_accumulators = devinfo->ver < 71;
+
switch (devinfo->ver) {
case 33:
case 41:
case 42:
+ case 71:
break;
default:
fprintf(stderr,
@@ -75,5 +81,14 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
return false;
}
- return true;
+ ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &hub_ident3);
+ if (ret != 0) {
+ fprintf(stderr, "Couldn't get V3D core HUB IDENT3: %s\n",
+ strerror(errno));
+ return false;
+ }
+
+ devinfo->rev = (hub_ident3.value >> 8) & 0xff;
+
+ return true;
}
diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h
index 97abd9b8d9f..8dfc7858727 100644
--- a/src/broadcom/common/v3d_device_info.h
+++ b/src/broadcom/common/v3d_device_info.h
@@ -34,11 +34,17 @@ struct v3d_device_info {
/** Simple V3D version: major * 10 + minor */
uint8_t ver;
+ /** V3D revision number */
+ uint8_t rev;
+
/** Size of the VPM, in bytes. */
int vpm_size;
/* NSLC * QUPS from the core's IDENT registers. */
int qpu_count;
+
+ /* If the hw has accumulator registers */
+ bool has_accumulators;
};
typedef int (*v3d_ioctl_fun)(int fd, unsigned long request, void *arg);
diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h
index 46f38bd7484..354c8784914 100644
--- a/src/broadcom/common/v3d_limits.h
+++ b/src/broadcom/common/v3d_limits.h
@@ -42,7 +42,8 @@
#define V3D_MAX_SAMPLES 4
-#define V3D_MAX_DRAW_BUFFERS 4
+#define V3D_MAX_DRAW_BUFFERS 8
+#define V3D_MAX_RENDER_TARGETS(ver) (ver < 71 ? 4 : 8)
#define V3D_MAX_POINT_SIZE 512.0f
#define V3D_MAX_LINE_WIDTH 32
diff --git a/src/broadcom/common/v3d_macros.h b/src/broadcom/common/v3d_macros.h
index fe89398208a..b4291fb5350 100644
--- a/src/broadcom/common/v3d_macros.h
+++ b/src/broadcom/common/v3d_macros.h
@@ -41,6 +41,9 @@
#elif (V3D_VERSION == 42)
# define V3DX(x) V3D42_##x
# define v3dX(x) v3d42_##x
+#elif (V3D_VERSION == 71)
+# define V3DX(x) V3D71_##x
+# define v3dX(x) v3d71_##x
#else
# error "Need to add prefixing macros for this v3d version"
#endif
diff --git a/src/broadcom/common/v3d_performance_counters.h b/src/broadcom/common/v3d_performance_counters.h
index 08d750c2cbe..a8f0cff8784 100644
--- a/src/broadcom/common/v3d_performance_counters.h
+++ b/src/broadcom/common/v3d_performance_counters.h
@@ -28,6 +28,110 @@
#define V3D_PERFCNT_NAME 1
#define V3D_PERFCNT_DESCRIPTION 2
+#ifndef V3D_VERSION
+# error "The V3D_VERSION macro must be defined"
+#endif
+
+#if (V3D_VERSION >= 71)
+
+static const char *v3d_performance_counters[][3] = {
+ {"CORE", "cycle-count", "[CORE] Cycle counter"},
+ {"CORE", "core-active", "[CORE] Bin/Render/Compute active cycles"},
+ {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"},
+ {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"},
+ {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
+ {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
+ {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
+ {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"},
+ {"FEP", "FEP-valid-quads", "[FEP] Valid quads"},
+ {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"},
+ {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"},
+ {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"},
+ {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"},
+ {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"},
+ {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"},
+ {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"},
+ {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"},
+ {"PTB", "PTB-primitives-discarded-reversed", "[PTB] Primitives that are discarded because they are reversed"},
+ {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"},
+ {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"},
+ {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"},
+ {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"},
+ {"TMU", "TMU-active-cycles", "[TMU] Active cycles"},
+ {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"},
+ {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"},
+ {"TMU", "TMU-cache-x4-active-cycles", "[TMU] Cache active cycles for x4 access"},
+ {"TMU", "TMU-cache-x4-stalled-cycles", "[TMU] Cache stalled cycles for x4 access"},
+ {"TMU", "TMU-total-text-quads-x4-access", "[TMU] Total texture cache x4 access"},
+ {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"},
+ {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"},
+ {"L2T", "L2T-local", "[L2T] Local mode access"},
+ {"L2T", "L2T-writeback", "[L2T] Writeback"},
+ {"L2T", "L2T-zero", "[L2T] Zero"},
+ {"L2T", "L2T-merge", "[L2T] Merge"},
+ {"L2T", "L2T-fill", "[L2T] Fill"},
+ {"L2T", "L2T-stalls-no-wid", "[L2T] Stalls because no WID available"},
+ {"L2T", "L2T-stalls-no-rid", "[L2T] Stalls because no RID available"},
+ {"L2T", "L2T-stalls-queue-full", "[L2T] Stalls because internal queue full"},
+ {"L2T", "L2T-stalls-wrightback", "[L2T] Stalls because writeback in flight"},
+ {"L2T", "L2T-stalls-mem", "[L2T] Stalls because AXI blocks read"},
+ {"L2T", "L2T-stalls-fill", "[L2T] Stalls because fill pending for victim cache-line"},
+ {"L2T", "L2T-hitq", "[L2T] Sent request via hit queue"},
+ {"L2T", "L2T-hitq-full", "[L2T] Sent request via main queue because hit queue is full"},
+ {"L2T", "L2T-stalls-read-data", "[L2T] Stalls because waiting for data from SDRAM"},
+ {"L2T", "L2T-TMU-read-hits", "[L2T] TMU read hits"},
+ {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"},
+ {"L2T", "L2T-VCD-read-hits", "[L2T] VCD read hits"},
+ {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"},
+ {"L2T", "L2T-SLC-read-hits", "[L2T] SLC read hits (all slices)"},
+ {"L2T", "L2T-SLC-read-miss", "[L2T] SLC read misses (all slices)"},
+ {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"},
+ {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"},
+ {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"},
+ {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"},
+ {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"},
+ {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"},
+ {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"},
+ {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"},
+ {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"},
+ {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"},
+ {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"},
+ {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"},
+ {"CORE", "core-memory-writes", "[CORE] Total memory writes"},
+ {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"},
+ {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"},
+ {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"},
+ {"CORE", "core-memory-reads", "[CORE] Total memory reads"},
+ {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"},
+ {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"},
+ {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"},
+ {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"},
+ {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"},
+ {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"},
+ {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"},
+ {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"},
+ {"AXI", "AXI-read-trans", "[AXI] Read transaction count"},
+ {"AXI", "AXI-write-trans", "[AXI] Write transaction count"},
+ {"AXI", "AXI-read-wait-cycles", "[AXI] Read total wait cycles"},
+ {"AXI", "AXI-write-wait-cycles", "[AXI] Write total wait cycles"},
+ {"AXI", "AXI-max-outstanding-reads", "[AXI] Maximium outstanding read transactions"},
+ {"AXI", "AXI-max-outstanding-writes", "[AXI] Maximum outstanding write transactions"},
+ {"QPU", "QPU-wait-bubble", "[QPU] Pipeline bubble in qcycles due all threads waiting"},
+ {"QPU", "QPU-ic-miss-bubble", "[QPU] Pipeline bubble in qcycles due instruction-cache miss"},
+ {"QPU", "QPU-active", "[QPU] Executed shader instruction"},
+ {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"},
+ {"QPU", "QPU-stalls", "[QPU] Stalled qcycles executing shader instruction"},
+ {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"},
+ {"QPU", "QPU-stalls-TMU", "[QPU] Stalled qcycles waiting for TMU"},
+ {"QPU", "QPU-stalls-TLB", "[QPU] Stalled qcycles waiting for TLB"},
+ {"QPU", "QPU-stalls-VPM", "[QPU] Stalled qcycles waiting for VPM"},
+ {"QPU", "QPU-stalls-uniforms", "[QPU] Stalled qcycles waiting for uniforms"},
+ {"QPU", "QPU-stalls-SFU", "[QPU] Stalled qcycles waiting for SFU"},
+ {"QPU", "QPU-stalls-other", "[QPU] Stalled qcycles waiting for any other reason (vary/W/Z)"},
+};
+
+#elif (V3D_VERSION >= 41)
+
static const char *v3d_performance_counters[][3] = {
{"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
{"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
@@ -118,4 +222,8 @@ static const char *v3d_performance_counters[][3] = {
{"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
};
+#else
+static const char *v3d_performance_counters[][3] = { };
+#endif
+
#endif
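All three version-gated tables share the same { category, name, description } row layout, so consumers can index them uniformly with the macros at the top of the header. A minimal sketch (not part of the patch; the category index 0 macro is not shown in this hunk and is assumed):

    for (unsigned i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) {
            printf("%-4s %s: %s\n",
                   v3d_performance_counters[i][0], /* category; index assumed */
                   v3d_performance_counters[i][V3D_PERFCNT_NAME],
                   v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]);
    }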
diff --git a/src/broadcom/common/v3d_tfu.h b/src/broadcom/common/v3d_tfu.h
index 80da224ca2d..572d0074794 100644
--- a/src/broadcom/common/v3d_tfu.h
+++ b/src/broadcom/common/v3d_tfu.h
@@ -48,4 +48,27 @@
#define V3D33_TFU_ICFG_FORMAT_UIF_NO_XOR 14
#define V3D33_TFU_ICFG_FORMAT_UIF_XOR 15
+/* Disable level 0 write, just write following mipmaps */
+#define V3D71_TFU_IOC_DIMTW (1 << 0)
+#define V3D71_TFU_IOC_FORMAT_SHIFT 12
+#define V3D71_TFU_IOC_FORMAT_LINEARTILE 3
+#define V3D71_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4
+#define V3D71_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5
+#define V3D71_TFU_IOA_FORMAT_UIF_NO_XOR 6
+#define V3D71_TFU_IOA_FORMAT_UIF_XOR 7
+
+#define V3D71_TFU_IOC_STRIDE_SHIFT 16
+#define V3D71_TFU_IOC_NUMMM_SHIFT 4
+
+#define V3D71_TFU_ICFG_OTYPE_SHIFT 16
+#define V3D71_TFU_ICFG_IFORMAT_SHIFT 23
+#define V3D71_TFU_ICFG_FORMAT_RASTER 0
+#define V3D71_TFU_ICFG_FORMAT_SAND_128 1
+#define V3D71_TFU_ICFG_FORMAT_SAND_256 2
+#define V3D71_TFU_ICFG_FORMAT_LINEARTILE 11
+#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12
+#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13
+#define V3D71_TFU_ICFG_FORMAT_UIF_NO_XOR 14
+#define V3D71_TFU_ICFG_FORMAT_UIF_XOR 15
+
#endif
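These defines give the bit positions inside the 7.x TFU IOC/ICFG configuration words. A minimal sketch of assembling an IOC word from them; stride and num_mipmaps are placeholder values, and any field semantics beyond what the defines themselves state are assumptions:

    uint32_t ioc = 0;
    ioc |= V3D71_TFU_IOC_FORMAT_LINEARTILE << V3D71_TFU_IOC_FORMAT_SHIFT;
    ioc |= stride << V3D71_TFU_IOC_STRIDE_SHIFT;      /* output stride */
    ioc |= num_mipmaps << V3D71_TFU_IOC_NUMMM_SHIFT;  /* mip count */
    ioc |= V3D71_TFU_IOC_DIMTW;  /* disable level 0 write, write only the following mips */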
diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c
index 57872a923d3..8a50d279985 100644
--- a/src/broadcom/common/v3d_util.c
+++ b/src/broadcom/common/v3d_util.c
@@ -87,10 +87,37 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
return best_wgs_per_sg;
}
+#define V3D71_TLB_COLOR_SIZE (16 * 1024)
+#define V3D71_TLB_DETPH_SIZE (16 * 1024)
+#define V3D71_TLB_AUX_DETPH_SIZE (8 * 1024)
+
+static bool
+tile_size_valid(uint32_t pixel_count, uint32_t color_bpp, uint32_t depth_bpp)
+{
+ /* First, we check if we can fit this tile size allocating the depth
+ * TLB memory to color.
+ */
+ if (pixel_count * depth_bpp <= V3D71_TLB_AUX_DETPH_SIZE &&
+ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE + V3D71_TLB_DETPH_SIZE) {
+ return true;
+ }
+
+ /* Otherwise the tile must fit in the main TLB buffers */
+ return pixel_count * depth_bpp <= V3D71_TLB_DETPH_SIZE &&
+ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE;
+}
+
void
-v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
- bool msaa, bool double_buffer,
- uint32_t *width, uint32_t *height)
+v3d_choose_tile_size(const struct v3d_device_info *devinfo,
+ uint32_t color_attachment_count,
+ /* V3D 4.x max internal bpp of all RTs */
+ uint32_t max_internal_bpp,
+ /* V3D 7.x accumulated bpp for all RTs (in bytes) */
+ uint32_t total_color_bpp,
+ bool msaa,
+ bool double_buffer,
+ uint32_t *width,
+ uint32_t *height)
{
static const uint8_t tile_sizes[] = {
64, 64,
@@ -103,19 +130,65 @@ v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
};
uint32_t idx = 0;
- if (color_attachment_count > 2)
- idx += 2;
- else if (color_attachment_count > 1)
- idx += 1;
+ if (devinfo->ver >= 71) {
+ /* In V3D 7.x, we use the actual bpp used by color attachments to compute
+ * the tile size instead of the maximum bpp. This may allow us to choose a
+ * larger tile size than we would in 4.x in scenarios with multiple RTs
+ * with different bpps.
+ *
+ * Also, the TLB has an auxiliary buffer of 8KB that will be automatically
+ * used for depth instead of the main 16KB depth TLB buffer when the depth
+ * tile fits in the auxiliary buffer, allowing the hardware to allocate
+ * the 16KB from the main depth TLB to the color TLB. If we can do that,
+ * then we are effectively doubling the memory we have for color and we
+ * can also select a larger tile size. This is necessary to support
+ * the most expensive configuration: 8x128bpp RTs + MSAA.
+ *
+ * FIXME: the docs state that depth TLB memory can be used for color
+ * when depth testing is not used, by setting the 'depth disable' bit in the
+ * rendering configuration. However, this comes with a requirement that
+ * occlusion queries must not be active. We need to clarify if this means
+ * active at the point at which we emit a tile rendering configuration
+ * item, meaning that we have a query spanning a full render pass
+ * (this is something we can tell before we emit the rendering
+ * configuration item) or active in the subpass for which we are enabling
+ * the bit (which we can't tell until later, when we record commands for
+ * the subpass). If it is the latter, then we cannot use this feature.
+ *
+ * FIXME: pending handling double_buffer.
+ */
+ const uint32_t color_bpp = total_color_bpp * (msaa ? 4 : 1);
+ const uint32_t depth_bpp = 4 * (msaa ? 4 : 1);
+ do {
+ const uint32_t tile_w = tile_sizes[idx * 2];
+ const uint32_t tile_h = tile_sizes[idx * 2 + 1];
+ if (tile_size_valid(tile_w * tile_h, color_bpp, depth_bpp))
+ break;
+ idx++;
+ } while (idx < ARRAY_SIZE(tile_sizes) / 2);
+
+ /* FIXME: pending handling double_buffer */
+ assert(!double_buffer);
+ } else {
+ /* On V3D 4.x tile size is selected based on the number of RTs, the
+ * maximum bpp across all of them and whether 4x MSAA is used.
+ */
+ if (color_attachment_count > 4)
+ idx += 3;
+ else if (color_attachment_count > 2)
+ idx += 2;
+ else if (color_attachment_count > 1)
+ idx += 1;
- /* MSAA and double-buffer are mutually exclusive */
- assert(!msaa || !double_buffer);
- if (msaa)
- idx += 2;
- else if (double_buffer)
- idx += 1;
+ /* MSAA and double-buffer are mutually exclusive */
+ assert(!msaa || !double_buffer);
+ if (msaa)
+ idx += 2;
+ else if (double_buffer)
+ idx += 1;
- idx += max_color_bpp;
+ idx += max_internal_bpp;
+ }
assert(idx < ARRAY_SIZE(tile_sizes) / 2);
@@ -170,3 +243,30 @@ v3d_hw_prim_type(enum mesa_prim prim_type)
unreachable("Unsupported primitive type");
}
}
+
+uint32_t
+v3d_internal_bpp_words(uint32_t internal_bpp)
+{
+ switch (internal_bpp) {
+ case 0 /* V3D_INTERNAL_BPP_32 */:
+ return 1;
+ case 1 /* V3D_INTERNAL_BPP_64 */:
+ return 2;
+ case 2 /* V3D_INTERNAL_BPP_128 */:
+ return 4;
+ default:
+ unreachable("Unsupported internal BPP");
+ }
+}
+
+uint32_t
+v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
+ uint32_t bpp)
+{
+ /* The stride is in multiples of 128 bits and covers 2 rows. That is
+ * why we divide by 2 instead of 4: bpp is in 32-bit words, so one row
+ * is (tile_width * bpp) / 4 in 128-bit units, and covering 2 rows
+ * doubles it.
+ */
+
+ return (tile_width * bpp) / 2;
+}
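To make the tile-size comment above concrete, here is the arithmetic for the most expensive configuration it mentions (8 render targets at 128 bpp internal with 4x MSAA), checked against tile_size_valid and the TLB sizes defined earlier:

    /* total_color_bpp = 8 RTs * 16 bytes         = 128 bytes/pixel
     * color_bpp       = 128 * 4 (msaa)           = 512
     * depth_bpp       = 4 * 4 (msaa)             = 16
     * For the smallest 8x8 tile (64 pixels):
     *   depth: 64 * 16  = 1024  <= 8192          (fits the auxiliary depth buffer)
     *   color: 64 * 512 = 32768 <= 16384 + 16384 (color TLB + borrowed depth TLB)
     * so this configuration is representable, exactly at the limit.
     */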
diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h
index eb802b77f67..d02d41dd089 100644
--- a/src/broadcom/common/v3d_util.h
+++ b/src/broadcom/common/v3d_util.h
@@ -24,6 +24,7 @@
#ifndef V3D_UTIL_H
#define V3D_UTIL_H
+#include "util/macros.h"
#include "common/v3d_device_info.h"
#include "pipe/p_defines.h"
@@ -36,9 +37,14 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
uint32_t wg_size);
void
-v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
- bool msaa, bool double_buffer,
- uint32_t *width, uint32_t *height);
+v3d_choose_tile_size(const struct v3d_device_info *devinfo,
+ uint32_t color_attachment_count,
+ uint32_t max_internal_bpp,
+ uint32_t total_color_bpp,
+ bool msaa,
+ bool double_buffer,
+ uint32_t *width,
+ uint32_t *height);
uint32_t
v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle);
@@ -46,4 +52,30 @@ v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle);
uint32_t
v3d_hw_prim_type(enum mesa_prim prim_type);
+uint32_t
+v3d_internal_bpp_words(uint32_t internal_bpp);
+
+/* Some configuration packets want the size on log2, but starting at 0 for
+ * size 8.
+ */
+static inline uint8_t
+log2_tile_size(uint32_t size)
+{
+ switch(size) {
+ case 8:
+ return 0;
+ case 16:
+ return 1;
+ case 32:
+ return 2;
+ case 64:
+ return 3;
+ default:
+ unreachable("Unsupported tile width/height");
+ }
+}
+
+uint32_t
+v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
+ uint32_t bpp);
#endif
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index ad461dbe24c..4536d3bc67b 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -1005,32 +1005,36 @@ emit_fragcoord_input(struct v3d_compile *c, int attr)
static struct qreg
emit_smooth_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg w, struct qreg r5)
+ struct qreg vary, struct qreg w, struct qreg c_reg)
{
- return vir_FADD(c, vir_FMUL(c, vary, w), r5);
+ return vir_FADD(c, vir_FMUL(c, vary, w), c_reg);
}
static struct qreg
emit_noperspective_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg r5)
+ struct qreg vary, struct qreg c_reg)
{
- return vir_FADD(c, vir_MOV(c, vary), r5);
+ return vir_FADD(c, vir_MOV(c, vary), c_reg);
}
static struct qreg
emit_flat_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg r5)
+ struct qreg vary, struct qreg c_reg)
{
vir_MOV_dest(c, c->undef, vary);
- return vir_MOV(c, r5);
+ return vir_MOV(c, c_reg);
}
static struct qreg
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
int8_t input_idx, uint8_t swizzle, int array_index)
{
- struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
- struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
+ struct qreg c_reg; /* C coefficient */
+
+ if (c->devinfo->has_accumulators)
+ c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
+ else
+ c_reg = vir_reg(QFILE_REG, 0);
struct qinst *ldvary = NULL;
struct qreg vary;
@@ -1041,7 +1045,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
vary = vir_emit_def(c, ldvary);
} else {
vir_NOP(c)->qpu.sig.ldvary = true;
- vary = r3;
+ vary = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
}
/* Store the input value before interpolation so we can implement
@@ -1050,7 +1054,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
if (input_idx >= 0) {
assert(var);
c->interp[input_idx].vp = vary;
- c->interp[input_idx].C = vir_MOV(c, r5);
+ c->interp[input_idx].C = vir_MOV(c, c_reg);
c->interp[input_idx].mode = var->data.interpolation;
}
@@ -1060,7 +1064,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
*/
if (!var) {
assert(input_idx < 0);
- return emit_smooth_varying(c, vary, c->payload_w, r5);
+ return emit_smooth_varying(c, vary, c->payload_w, c_reg);
}
int i = c->num_inputs++;
@@ -1075,20 +1079,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
if (var->data.centroid) {
BITSET_SET(c->centroid_flags, i);
result = emit_smooth_varying(c, vary,
- c->payload_w_centroid, r5);
+ c->payload_w_centroid, c_reg);
} else {
- result = emit_smooth_varying(c, vary, c->payload_w, r5);
+ result = emit_smooth_varying(c, vary, c->payload_w, c_reg);
}
break;
case INTERP_MODE_NOPERSPECTIVE:
BITSET_SET(c->noperspective_flags, i);
- result = emit_noperspective_varying(c, vary, r5);
+ result = emit_noperspective_varying(c, vary, c_reg);
break;
case INTERP_MODE_FLAT:
BITSET_SET(c->flat_shade_flags, i);
- result = emit_flat_varying(c, vary, r5);
+ result = emit_flat_varying(c, vary, c_reg);
break;
default:
@@ -1685,6 +1689,22 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
result = vir_VFPACK(c, src[0], src[1]);
break;
+ case nir_op_vpack_v3d:
+ result = vir_VPACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_v11fpack_v3d:
+ result = vir_V11FPACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_v10pack_v3d:
+ result = vir_V10PACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_v8pack_v3d:
+ result = vir_V8PACK(c, src[0], src[1]);
+ break;
+
case nir_op_unpack_half_2x16_split_x:
result = vir_FMOV(c, src[0]);
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
@@ -1715,6 +1735,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));
break;
}
+ case nir_op_vftounorm8_v3d:
+ result = vir_VFTOUNORM8(c, src[0]);
+ break;
+
+ case nir_op_vftosnorm8_v3d:
+ result = vir_VFTOSNORM8(c, src[0]);
+ break;
+
+ case nir_op_vftounorm10lo_v3d:
+ result = vir_VFTOUNORM10LO(c, src[0]);
+ break;
+
+ case nir_op_vftounorm10hi_v3d:
+ result = vir_VFTOUNORM10HI(c, src[0]);
+ break;
+
+ case nir_op_ftounorm16_v3d:
+ result = vir_FTOUNORM16(c, src[0]);
+ break;
+
+ case nir_op_ftosnorm16_v3d:
+ result = vir_FTOSNORM16(c, src[0]);
+ break;
default:
fprintf(stderr, "unknown NIR ALU inst: ");
@@ -2440,15 +2483,17 @@ ntq_setup_outputs(struct v3d_compile *c)
switch (var->data.location) {
case FRAG_RESULT_COLOR:
- c->output_color_var[0] = var;
- c->output_color_var[1] = var;
- c->output_color_var[2] = var;
- c->output_color_var[3] = var;
+ for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
+ c->output_color_var[i] = var;
break;
case FRAG_RESULT_DATA0:
case FRAG_RESULT_DATA1:
case FRAG_RESULT_DATA2:
case FRAG_RESULT_DATA3:
+ case FRAG_RESULT_DATA4:
+ case FRAG_RESULT_DATA5:
+ case FRAG_RESULT_DATA6:
+ case FRAG_RESULT_DATA7:
c->output_color_var[var->data.location -
FRAG_RESULT_DATA0] = var;
break;
@@ -4321,7 +4366,11 @@ nir_to_vir(struct v3d_compile *c)
{
switch (c->s->info.stage) {
case MESA_SHADER_FRAGMENT:
- c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ if (c->devinfo->ver < 71)
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ else
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3));
+
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
@@ -4361,8 +4410,13 @@ nir_to_vir(struct v3d_compile *c)
V3D_QPU_WADDR_SYNC));
}
- c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
- c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ if (c->devinfo->ver <= 42) {
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ } else if (c->devinfo->ver >= 71) {
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ }
/* Set up the division between gl_LocalInvocationIndex and
* wg_in_mem in the payload reg.
@@ -4541,8 +4595,8 @@ vir_check_payload_w(struct v3d_compile *c)
vir_for_each_inst_inorder(inst, c) {
for (int i = 0; i < vir_get_nsrc(inst); i++) {
- if (inst->src[i].file == QFILE_REG &&
- inst->src[i].index == 0) {
+ if (inst->src[i].file == c->payload_w.file &&
+ inst->src[i].index == c->payload_w.index) {
c->uses_center_w = true;
return;
}
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 3b32b48f86f..4f767296860 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -155,12 +155,13 @@ static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
enum v3d_qpu_mux mux)
{
+ assert(state->devinfo->ver < 71);
switch (mux) {
case V3D_QPU_MUX_A:
add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
break;
case V3D_QPU_MUX_B:
- if (!n->inst->qpu.sig.small_imm) {
+ if (!n->inst->qpu.sig.small_imm_b) {
add_read_dep(state,
state->last_rf[n->inst->qpu.raddr_b], n);
}
@@ -171,6 +172,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
}
}
+
+static void
+process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
+ uint8_t raddr, bool is_small_imm)
+{
+ assert(state->devinfo->ver >= 71);
+
+ if (!is_small_imm)
+ add_read_dep(state, state->last_rf[raddr], n);
+}
+
static bool
tmu_write_is_sequence_terminator(uint32_t waddr)
{
@@ -285,6 +297,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* If the input and output segments are shared, then all VPM reads to
* a location need to happen before all writes. We handle this by
* serializing all VPM operations for now.
+ *
+ * FIXME: we are assuming that the segments are shared. That is
+ * correct right now because we only ever use shared segments, but
+ * technically it is configurable.
*/
bool separate_vpm_segment = false;
@@ -305,15 +321,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* XXX: LOAD_IMM */
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
- process_mux_deps(state, n, inst->alu.add.a);
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
- process_mux_deps(state, n, inst->alu.add.b);
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.add.a.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.add.a.raddr,
+ inst->sig.small_imm_a);
+ }
+ }
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.add.b.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.add.b.raddr,
+ inst->sig.small_imm_b);
+ }
+ }
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
- process_mux_deps(state, n, inst->alu.mul.a);
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
- process_mux_deps(state, n, inst->alu.mul.b);
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.mul.a.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.mul.a.raddr,
+ inst->sig.small_imm_c);
+ }
+ }
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.mul.b.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.mul.b.raddr,
+ inst->sig.small_imm_d);
+ }
+ }
switch (inst->alu.add.op) {
case V3D_QPU_A_VPMSETUP:
@@ -386,6 +426,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
add_write_dep(state, &state->last_r[4], n);
if (v3d_qpu_writes_r5(devinfo, inst))
add_write_dep(state, &state->last_r[5], n);
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst))
+ add_write_dep(state, &state->last_rf[0], n);
/* If we add any more dependencies here we should consider whether we
* also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
@@ -500,6 +542,10 @@ struct choose_scoreboard {
int ldvary_count;
int pending_ldtmu_count;
bool first_ldtmu_after_thrsw;
+
+ /* V3D 7.x */
+ int last_implicit_rf0_write_tick;
+ bool has_rf0_flops_conflict;
};
static bool
@@ -524,7 +570,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard,
}
static bool
-reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
+reads_too_soon(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst, uint8_t raddr)
+{
+ switch (raddr) {
+ case 0: /* ldvary delayed write of C coefficient to rf0 */
+ if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static bool
+reads_too_soon_after_write(const struct v3d_device_info *devinfo,
+ struct choose_scoreboard *scoreboard,
struct qinst *qinst)
{
const struct v3d_qpu_instr *inst = &qinst->qpu;
@@ -536,24 +599,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
if (inst->alu.add.op != V3D_QPU_A_NOP) {
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
- return true;
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr))
+ return true;
+ }
}
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
- return true;
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr))
+ return true;
+ }
}
}
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
- return true;
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr))
+ return true;
+ }
}
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
- return true;
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
+ return true;
+ }
}
}
@@ -577,6 +660,21 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo,
v3d_qpu_writes_r4(devinfo, inst))
return true;
+ if (devinfo->ver <= 42)
+ return false;
+
+ /* Don't schedule anything that writes rf0 right after ldvary, since
+ * that would clash with the ldvary's delayed rf0 write (the exception
+ * is another ldvary, since its implicit rf0 write would also have
+ * one cycle of delay and would not clash).
+ */
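+ /* Illustrative timeline (a sketch, not taken from the TRM): if an
+ * ldvary issues at tick T, its implicit rf0 write lands at T+1, so an
+ * instruction at T+1 that writes rf0 clashes, while another ldvary at
+ * T+1 is fine because its own rf0 write is again delayed to T+2.
+ */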
+ if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick &&
+ (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
+ (v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+ !inst->sig.ldvary))) {
+ return true;
+ }
+
return false;
}
@@ -604,29 +702,36 @@ pixel_scoreboard_too_soon(struct v3d_compile *c,
}
static bool
-qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
+qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
uint32_t waddr) {
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return false;
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
- inst->raddr_a == waddr)
- return true;
+ if (devinfo->ver < 71) {
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
+ inst->raddr_a == waddr)
+ return true;
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
- !inst->sig.small_imm && (inst->raddr_b == waddr))
- return true;
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
+ !inst->sig.small_imm_b && (inst->raddr_b == waddr))
+ return true;
+ } else {
+ if (v3d71_qpu_reads_raddr(inst, waddr))
+ return true;
+ }
return false;
}
static bool
-mux_read_stalls(struct choose_scoreboard *scoreboard,
- const struct v3d_qpu_instr *inst)
+read_stalls(const struct v3d_device_info *devinfo,
+ struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst)
{
return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
- qpu_instruction_uses_rf(inst,
+ qpu_instruction_uses_rf(devinfo, inst,
scoreboard->last_stallable_sfu_reg);
}
@@ -692,7 +797,8 @@ enum {
V3D_PERIPHERAL_TMU_WAIT = (1 << 6),
V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7),
V3D_PERIPHERAL_TSY = (1 << 8),
- V3D_PERIPHERAL_TLB = (1 << 9),
+ V3D_PERIPHERAL_TLB_READ = (1 << 9),
+ V3D_PERIPHERAL_TLB_WRITE = (1 << 10),
};
static uint32_t
@@ -717,8 +823,10 @@ qpu_peripherals(const struct v3d_device_info *devinfo,
if (v3d_qpu_uses_sfu(inst))
result |= V3D_PERIPHERAL_SFU;
- if (v3d_qpu_uses_tlb(inst))
- result |= V3D_PERIPHERAL_TLB;
+ if (v3d_qpu_reads_tlb(inst))
+ result |= V3D_PERIPHERAL_TLB_READ;
+ if (v3d_qpu_writes_tlb(inst))
+ result |= V3D_PERIPHERAL_TLB_WRITE;
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
@@ -749,32 +857,75 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
if (devinfo->ver < 41)
return false;
- /* V3D 4.1+ allow WRTMUC signal with TMU register write (other than
- * tmuc).
+ /* V3D 4.x can't do more than one peripheral access except in a
+ * few cases:
*/
- if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
- b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
- return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
+ if (devinfo->ver <= 42) {
+ /* WRTMUC signal with TMU register write (other than tmuc). */
+ if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
+ }
+ if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ a_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
+ }
+
+ /* TMU read with VPM read/write. */
+ if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
+ (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
+ b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+ return true;
+ }
+ if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
+ (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
+ a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+ return true;
+ }
+
+ return false;
}
- if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE &&
- b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) {
- return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
+ /* V3D 7.x can't have more than one of these restricted peripherals */
+ const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE |
+ V3D_PERIPHERAL_TMU_WRTMUC_SIG |
+ V3D_PERIPHERAL_TSY |
+ V3D_PERIPHERAL_TLB_READ |
+ V3D_PERIPHERAL_SFU |
+ V3D_PERIPHERAL_VPM_READ |
+ V3D_PERIPHERAL_VPM_WRITE;
+
+ const uint32_t a_restricted = a_peripherals & restricted;
+ const uint32_t b_restricted = b_peripherals & restricted;
+ if (a_restricted && b_restricted) {
+ /* WRTMUC signal with TMU register write (other than tmuc) is
+ * allowed though.
+ */
+ if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ b_restricted == V3D_PERIPHERAL_TMU_WRITE &&
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
+ (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ a_restricted == V3D_PERIPHERAL_TMU_WRITE &&
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) {
+ return false;
+ }
}
- /* V3D 4.1+ allows TMU read with VPM read/write. */
- if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
- (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
- b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
- return true;
+ /* Only one TMU read per instruction */
+ if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) &&
+ (b_peripherals & V3D_PERIPHERAL_TMU_READ)) {
+ return false;
}
- if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
- (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
- a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
- return true;
+
+ /* Only one TLB access per instruction */
+ if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
+ V3D_PERIPHERAL_TLB_READ)) &&
+ (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
+ V3D_PERIPHERAL_TLB_READ))) {
+ return false;
}
- return false;
+ return true;
}
/* Compute a bitmask of which rf registers are used between
@@ -790,42 +941,67 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
uint64_t raddrs_used = 0;
if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
raddrs_used |= (1ll << a->raddr_a);
- if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
+ if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
raddrs_used |= (1ll << a->raddr_b);
if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
raddrs_used |= (1ll << b->raddr_a);
- if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
+ if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
raddrs_used |= (1ll << b->raddr_b);
return raddrs_used;
}
-/* Take two instructions and attempt to merge their raddr fields
- * into one merged instruction. Returns false if the two instructions
- * access more than two different rf registers between them, or more
- * than one rf register and one small immediate.
+/* Takes two instructions and attempts to merge their raddr fields (including
+ * small immediates) into one merged instruction. For V3D 4.x, returns false
+ * if the two instructions access more than two different rf registers between
+ * them, or more than one rf register and one small immediate. For 7.x returns
+ * false if both instructions use small immediates.
*/
static bool
qpu_merge_raddrs(struct v3d_qpu_instr *result,
const struct v3d_qpu_instr *add_instr,
- const struct v3d_qpu_instr *mul_instr)
+ const struct v3d_qpu_instr *mul_instr,
+ const struct v3d_device_info *devinfo)
{
+ if (devinfo->ver >= 71) {
+ assert(add_instr->sig.small_imm_a +
+ add_instr->sig.small_imm_b <= 1);
+ assert(add_instr->sig.small_imm_c +
+ add_instr->sig.small_imm_d == 0);
+ assert(mul_instr->sig.small_imm_a +
+ mul_instr->sig.small_imm_b == 0);
+ assert(mul_instr->sig.small_imm_c +
+ mul_instr->sig.small_imm_d <= 1);
+
+ result->sig.small_imm_a = add_instr->sig.small_imm_a;
+ result->sig.small_imm_b = add_instr->sig.small_imm_b;
+ result->sig.small_imm_c = mul_instr->sig.small_imm_c;
+ result->sig.small_imm_d = mul_instr->sig.small_imm_d;
+
+ return (result->sig.small_imm_a +
+ result->sig.small_imm_b +
+ result->sig.small_imm_c +
+ result->sig.small_imm_d) <= 1;
+ }
+
+ assert(devinfo->ver <= 42);
+
uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
int naddrs = util_bitcount64(raddrs_used);
if (naddrs > 2)
return false;
- if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
+ if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) {
if (naddrs > 1)
return false;
- if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
+ if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b)
if (add_instr->raddr_b != mul_instr->raddr_b)
return false;
- result->sig.small_imm = true;
- result->raddr_b = add_instr->sig.small_imm ?
+ result->sig.small_imm_b = true;
+ result->raddr_b = add_instr->sig.small_imm_b ?
add_instr->raddr_b : mul_instr->raddr_b;
}
@@ -836,23 +1012,23 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
raddrs_used &= ~(1ll << raddr_a);
result->raddr_a = raddr_a;
- if (!result->sig.small_imm) {
+ if (!result->sig.small_imm_b) {
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
raddr_a == add_instr->raddr_b) {
- if (add_instr->alu.add.a == V3D_QPU_MUX_B)
- result->alu.add.a = V3D_QPU_MUX_A;
- if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B)
+ result->alu.add.a.mux = V3D_QPU_MUX_A;
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B &&
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
- result->alu.add.b = V3D_QPU_MUX_A;
+ result->alu.add.b.mux = V3D_QPU_MUX_A;
}
}
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
raddr_a == mul_instr->raddr_b) {
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
- result->alu.mul.a = V3D_QPU_MUX_A;
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B)
+ result->alu.mul.a.mux = V3D_QPU_MUX_A;
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B &&
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
- result->alu.mul.b = V3D_QPU_MUX_A;
+ result->alu.mul.b.mux = V3D_QPU_MUX_A;
}
}
}
@@ -863,20 +1039,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
result->raddr_b = raddr_b;
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
raddr_b == add_instr->raddr_a) {
- if (add_instr->alu.add.a == V3D_QPU_MUX_A)
- result->alu.add.a = V3D_QPU_MUX_B;
- if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A)
+ result->alu.add.a.mux = V3D_QPU_MUX_B;
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A &&
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
- result->alu.add.b = V3D_QPU_MUX_B;
+ result->alu.add.b.mux = V3D_QPU_MUX_B;
}
}
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
raddr_b == mul_instr->raddr_a) {
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
- result->alu.mul.a = V3D_QPU_MUX_B;
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A)
+ result->alu.mul.a.mux = V3D_QPU_MUX_B;
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A &&
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
- result->alu.mul.b = V3D_QPU_MUX_B;
+ result->alu.mul.b.mux = V3D_QPU_MUX_B;
}
}
@@ -909,7 +1085,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op)
}
static void
-qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
+qpu_convert_add_to_mul(const struct v3d_device_info *devinfo,
+ struct v3d_qpu_instr *inst)
{
STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
assert(inst->alu.add.op != V3D_QPU_A_NOP);
@@ -927,11 +1104,85 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
inst->flags.auf = V3D_QPU_UF_NONE;
inst->alu.mul.output_pack = inst->alu.add.output_pack;
- inst->alu.mul.a_unpack = inst->alu.add.a_unpack;
- inst->alu.mul.b_unpack = inst->alu.add.b_unpack;
+
+ inst->alu.mul.a.unpack = inst->alu.add.a.unpack;
+ inst->alu.mul.b.unpack = inst->alu.add.b.unpack;
inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
- inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
- inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ if (devinfo->ver >= 71) {
+ assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d);
+ assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1);
+ if (inst->sig.small_imm_a) {
+ inst->sig.small_imm_c = true;
+ inst->sig.small_imm_a = false;
+ } else if (inst->sig.small_imm_b) {
+ inst->sig.small_imm_d = true;
+ inst->sig.small_imm_b = false;
+ }
+ }
+}
+
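+/* On V3D 7.x each small immediate signal is tied to one ALU input:
+ * small_imm_a/b feed the add ALU's A and B inputs and small_imm_c/d the
+ * mul ALU's A and B inputs, which is why the conversions in this file
+ * migrate the signal (a <-> c, b <-> d) together with the operation.
+ */
+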
+static bool
+can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op)
+{
+ switch (op) {
+ case V3D_QPU_M_MOV:
+ case V3D_QPU_M_FMOV:
+ return devinfo->ver >= 71;
+ default:
+ return false;
+ }
+}
+
+static enum v3d_qpu_mul_op
+mul_op_as_add_op(enum v3d_qpu_mul_op op)
+{
+ switch (op) {
+ case V3D_QPU_M_MOV:
+ return V3D_QPU_A_MOV;
+ case V3D_QPU_M_FMOV:
+ return V3D_QPU_A_FMOV;
+ default:
+ unreachable("unexpected mov opcode");
+ }
+}
+
+static void
+qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
+{
+ STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul));
+ assert(inst->alu.mul.op != V3D_QPU_M_NOP);
+ assert(inst->alu.add.op == V3D_QPU_A_NOP);
+
+ memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add));
+ inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op);
+ inst->alu.mul.op = V3D_QPU_M_NOP;
+
+ inst->flags.ac = inst->flags.mc;
+ inst->flags.apf = inst->flags.mpf;
+ inst->flags.auf = inst->flags.muf;
+ inst->flags.mc = V3D_QPU_COND_NONE;
+ inst->flags.mpf = V3D_QPU_PF_NONE;
+ inst->flags.muf = V3D_QPU_UF_NONE;
+
+ inst->alu.add.output_pack = inst->alu.mul.output_pack;
+ inst->alu.add.a.unpack = inst->alu.mul.a.unpack;
+ inst->alu.add.b.unpack = inst->alu.mul.b.unpack;
+ inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+ inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b);
+ assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1);
+ if (inst->sig.small_imm_c) {
+ inst->sig.small_imm_a = true;
+ inst->sig.small_imm_c = false;
+ } else if (inst->sig.small_imm_d) {
+ inst->sig.small_imm_b = true;
+ inst->sig.small_imm_d = false;
+ }
}
static bool
@@ -970,20 +1221,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(b->alu.add.op)) {
mul_inst = *b;
- qpu_convert_add_to_mul(&mul_inst);
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
merge.alu.mul = mul_inst.alu.mul;
- merge.flags.mc = b->flags.ac;
- merge.flags.mpf = b->flags.apf;
- merge.flags.muf = b->flags.auf;
+ merge.flags.mc = mul_inst.flags.mc;
+ merge.flags.mpf = mul_inst.flags.mpf;
+ merge.flags.muf = mul_inst.flags.muf;
add_instr = a;
mul_instr = &mul_inst;
} else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(a->alu.add.op)) {
mul_inst = *a;
- qpu_convert_add_to_mul(&mul_inst);
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
merge = mul_inst;
merge.alu.add = b->alu.add;
@@ -999,22 +1250,62 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
}
}
+ struct v3d_qpu_instr add_inst;
if (b->alu.mul.op != V3D_QPU_M_NOP) {
- if (a->alu.mul.op != V3D_QPU_M_NOP)
- return false;
- merge.alu.mul = b->alu.mul;
+ if (a->alu.mul.op == V3D_QPU_M_NOP) {
+ merge.alu.mul = b->alu.mul;
+
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
+
+ mul_instr = b;
+ add_instr = a;
+ }
+ /* If a's mul op is used but its add op is not, then see if we
+ * can convert either a's mul op or b's mul op to an add op
+ * so we can merge.
+ */
+ else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, b->alu.mul.op)) {
+ add_inst = *b;
+ qpu_convert_mul_to_add(&add_inst);
- merge.flags.mc = b->flags.mc;
- merge.flags.mpf = b->flags.mpf;
- merge.flags.muf = b->flags.muf;
+ merge.alu.add = add_inst.alu.add;
- mul_instr = b;
- add_instr = a;
+ merge.flags.ac = add_inst.flags.ac;
+ merge.flags.apf = add_inst.flags.apf;
+ merge.flags.auf = add_inst.flags.auf;
+
+ mul_instr = a;
+ add_instr = &add_inst;
+ } else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, a->alu.mul.op)) {
+ add_inst = *a;
+ qpu_convert_mul_to_add(&add_inst);
+
+ merge = add_inst;
+ merge.alu.mul = b->alu.mul;
+
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
+
+ mul_instr = b;
+ add_instr = &add_inst;
+ } else {
+ return false;
+ }
}
+ /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
+ * they have restrictions on the number of raddrs that can be addressed
+ * in a single instruction. In V3D 7.x, we don't have that restriction,
+ * but we are still limited to a single small immediate per instruction.
+ */
if (add_instr && mul_instr &&
- !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
- return false;
+ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
+ return false;
}
merge.sig.thrsw |= b->sig.thrsw;
@@ -1025,7 +1316,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
merge.sig.ldtmu |= b->sig.ldtmu;
merge.sig.ldvary |= b->sig.ldvary;
merge.sig.ldvpm |= b->sig.ldvpm;
- merge.sig.small_imm |= b->sig.small_imm;
merge.sig.ldtlb |= b->sig.ldtlb;
merge.sig.ldtlbu |= b->sig.ldtlbu;
merge.sig.ucb |= b->sig.ucb;
@@ -1108,7 +1398,7 @@ retry:
* regfile A or B that was written to by the previous
* instruction."
*/
- if (reads_too_soon_after_write(scoreboard, n->inst))
+ if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
continue;
if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
@@ -1122,10 +1412,11 @@ retry:
if (pixel_scoreboard_too_soon(c, scoreboard, inst))
continue;
- /* ldunif and ldvary both write r5, but ldunif does so a tick
- * sooner. If the ldvary's r5 wasn't used, then ldunif might
+ /* ldunif and ldvary both write the same register (r5 for v42
+ * and below, rf0 for v71), but ldunif does so a tick sooner.
+ * If the ldvary's register wasn't used, then ldunif might
* otherwise get scheduled so ldunif and ldvary try to update
- * r5 in the same tick.
+ * the register in the same tick.
*/
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
@@ -1204,11 +1495,20 @@ retry:
* ldvary now if the follow-up fixup would place
* it in the delay slots of a thrsw, which is not
* allowed and would prevent the fixup from being
- * successful.
+ * successful. In V3D 7.x we can allow this to happen
+ * as long as it is not the last delay slot.
*/
- if (inst->sig.ldvary &&
- scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
- continue;
+ if (inst->sig.ldvary) {
+ if (c->devinfo->ver <= 42 &&
+ scoreboard->last_thrsw_tick + 2 >=
+ scoreboard->tick - 1) {
+ continue;
+ }
+ if (c->devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 ==
+ scoreboard->tick - 1) {
+ continue;
+ }
}
/* We can emit a new tmu lookup with a previous ldtmu
@@ -1243,7 +1543,7 @@ retry:
int prio = get_instruction_priority(c->devinfo, inst);
- if (mux_read_stalls(scoreboard, inst)) {
+ if (read_stalls(c->devinfo, scoreboard, inst)) {
/* Don't merge an instruction that stalls */
if (prev_inst)
continue;
@@ -1340,6 +1640,62 @@ update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
}
}
+static void
+set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst,
+ const struct v3d_device_info *devinfo)
+{
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick &&
+ v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
+ !inst->sig_magic) {
+ scoreboard->has_rf0_flops_conflict = true;
+ }
+}
+
+static void
+update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst,
+ const struct v3d_device_info *devinfo)
+{
+ if (devinfo->ver < 71)
+ return;
+
+ /* Thread switch restrictions:
+ *
+ * At the point of a thread switch or thread end (when the actual
+ * thread switch or thread end happens, not when the signalling
+ * instruction is processed):
+ *
+ * - If the most recent write to rf0 was from a ldunif, ldunifa, or
+ * ldvary instruction in which another signal also wrote to the
+ * register file, and the final instruction of the thread section
+ * contained a signal which wrote to the register file, then the
+ * value of rf0 is undefined at the start of the new section
+ *
+ * Here we use the scoreboard to track if our last rf0 implicit write
+ * happens at the same time that another signal writes the register
+ * file (has_rf0_flops_conflict). We will use that information when
+ * scheduling thrsw instructions to avoid putting anything in their
+ * last delay slot which has a signal that writes to the register file.
+ */
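+
+ /* Sketch of the tracked hazard (illustrative only):
+ *
+ *   nop ; ldvary          <- implicit rf0 write lands next tick
+ *   nop ; ldunifrf.rf3    <- RF-writing signal on that same tick:
+ *                            has_rf0_flops_conflict = true
+ *   thrsw
+ *   ...
+ *   nop ; ldunifrf.rf4    <- an RF-writing signal in the last delay
+ *                            slot would now leave rf0 undefined, so
+ *                            we must avoid scheduling it there.
+ */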
+
+ /* Reset tracking if we have an explicit rf0 write or we are starting
+ * a new thread section.
+ */
+ if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
+ scoreboard->tick - scoreboard->last_thrsw_tick == 3) {
+ scoreboard->last_implicit_rf0_write_tick = -10;
+ scoreboard->has_rf0_flops_conflict = false;
+ }
+
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) {
+ scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ?
+ scoreboard->tick + 1 : scoreboard->tick;
+ }
+
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+}
+
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
const struct qinst *qinst,
@@ -1383,6 +1739,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
if (inst->sig.ldvary)
scoreboard->last_ldvary_tick = scoreboard->tick;
+ update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo);
+
update_scoreboard_tmu_tracking(scoreboard, qinst);
}
@@ -1580,7 +1938,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
if (slot > 0 && qinst->uniform != ~0)
return false;
- if (v3d_qpu_waits_vpm(inst))
+ if (c->devinfo->ver <= 42 && v3d_qpu_waits_vpm(inst))
return false;
if (inst->sig.ldvary)
@@ -1588,35 +1946,67 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
/* GFXH-1625: TMUWT not allowed in the final instruction. */
- if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
+ if (c->devinfo->ver <= 42 && slot == 2 &&
+ inst->alu.add.op == V3D_QPU_A_TMUWT) {
return false;
+ }
- /* No writing physical registers at the end. */
- bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
- bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
- if ((!add_is_nop && !inst->alu.add.magic_write) ||
- (!mul_is_nop && !inst->alu.mul.magic_write)) {
- return false;
+ if (c->devinfo->ver <= 42) {
+ /* No writing physical registers at the end. */
+ bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
+ bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
+ if ((!add_is_nop && !inst->alu.add.magic_write) ||
+ (!mul_is_nop && !inst->alu.mul.magic_write)) {
+ return false;
+ }
+
+ if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
+ !inst->sig_magic) {
+ return false;
+ }
}
- if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
- !inst->sig_magic) {
- return false;
+ if (c->devinfo->ver >= 71) {
+ /* The thread end instruction must not write to the
+ * register file via the add/mul ALUs.
+ */
+ if (slot == 0 &&
+ (!inst->alu.add.magic_write ||
+ !inst->alu.mul.magic_write)) {
+ return false;
+ }
}
if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
return false;
- /* RF0-2 might be overwritten during the delay slots by
- * fragment shader setup.
- */
- if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
- return false;
+ if (c->devinfo->ver <= 42) {
+ /* RF0-2 might be overwritten during the delay slots by
+ * fragment shader setup.
+ */
+ if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
+ return false;
- if (inst->raddr_b < 3 &&
- !inst->sig.small_imm &&
- v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
- return false;
+ if (inst->raddr_b < 3 &&
+ !inst->sig.small_imm_b &&
+ v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
+ return false;
+ }
+ }
+
+ if (c->devinfo->ver >= 71) {
+ /* RF2-3 might be overwritten during the delay slots by
+ * fragment shader setup.
+ */
+ if (v3d71_qpu_reads_raddr(inst, 2) ||
+ v3d71_qpu_reads_raddr(inst, 3)) {
+ return false;
+ }
+
+ if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) ||
+ v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) {
+ return false;
+ }
}
}
@@ -1632,6 +2022,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
*/
static bool
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
+ struct choose_scoreboard *scoreboard,
const struct qinst *qinst,
uint32_t slot)
{
@@ -1642,8 +2033,12 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
if (slot > 0 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu))
return false;
- if (slot > 0 && qinst->qpu.sig.ldvary)
- return false;
+ if (qinst->qpu.sig.ldvary) {
+ if (c->devinfo->ver <= 42 && slot > 0)
+ return false;
+ if (c->devinfo->ver >= 71 && slot == 2)
+ return false;
+ }
/* unifa and the following 3 instructions can't overlap a
* thread switch/end. The docs further clarify that this means
* the cycle at which the actual thread switch/end happens,
* not the cycle at which the thrsw instruction is processed.
*/
@@ -1662,6 +2057,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
return false;
+ /* See comment when we set has_rf0_flops_conflict for details */
+ if (c->devinfo->ver >= 71 &&
+ slot == 2 &&
+ v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) &&
+ !qinst->qpu.sig_magic) {
+ if (scoreboard->has_rf0_flops_conflict)
+ return false;
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick)
+ return false;
+ }
+
return true;
}
@@ -1694,7 +2100,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
* also apply to instructions scheduled after the thrsw that we want
* to place in its delay slots.
*/
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot))
return false;
/* TLB access is disallowed until scoreboard wait is executed, which
@@ -1767,8 +2173,10 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard
bool is_thrend)
{
for (int slot = 0; slot < instructions_in_sequence; slot++) {
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard,
+ qinst, slot)) {
return false;
+ }
if (is_thrend &&
!qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
@@ -1969,10 +2377,11 @@ emit_branch(struct v3d_compile *c,
assert(scoreboard->last_branch_tick + 3 < branch_tick);
assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
- /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
+ /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after
* setmsf.
*/
bool is_safe_msf_branch =
+ c->devinfo->ver >= 71 ||
inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
@@ -2056,46 +2465,72 @@ emit_branch(struct v3d_compile *c,
}
static bool
-alu_reads_register(struct v3d_qpu_instr *inst,
+alu_reads_register(const struct v3d_device_info *devinfo,
+ struct v3d_qpu_instr *inst,
bool add, bool magic, uint32_t index)
{
uint32_t num_src;
- enum v3d_qpu_mux mux_a, mux_b;
-
- if (add) {
+ if (add)
num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
- mux_a = inst->alu.add.a;
- mux_b = inst->alu.add.b;
- } else {
+ else
num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
- mux_a = inst->alu.mul.a;
- mux_b = inst->alu.mul.b;
- }
- for (int i = 0; i < num_src; i++) {
- if (magic) {
- if (i == 0 && mux_a == index)
- return true;
- if (i == 1 && mux_b == index)
- return true;
+ if (devinfo->ver <= 42) {
+ enum v3d_qpu_mux mux_a, mux_b;
+ if (add) {
+ mux_a = inst->alu.add.a.mux;
+ mux_b = inst->alu.add.b.mux;
} else {
- if (i == 0 && mux_a == V3D_QPU_MUX_A &&
- inst->raddr_a == index) {
- return true;
- }
- if (i == 0 && mux_a == V3D_QPU_MUX_B &&
- inst->raddr_b == index) {
- return true;
- }
- if (i == 1 && mux_b == V3D_QPU_MUX_A &&
- inst->raddr_a == index) {
- return true;
- }
- if (i == 1 && mux_b == V3D_QPU_MUX_B &&
- inst->raddr_b == index) {
- return true;
+ mux_a = inst->alu.mul.a.mux;
+ mux_b = inst->alu.mul.b.mux;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (magic) {
+ if (i == 0 && mux_a == index)
+ return true;
+ if (i == 1 && mux_b == index)
+ return true;
+ } else {
+ if (i == 0 && mux_a == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 0 && mux_a == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
}
}
+
+ return false;
+ }
+
+ assert(devinfo->ver >= 71);
+ assert(!magic);
+
+ uint32_t raddr_a, raddr_b;
+ if (add) {
+ raddr_a = inst->alu.add.a.raddr;
+ raddr_b = inst->alu.add.b.raddr;
+ } else {
+ raddr_a = inst->alu.mul.a.raddr;
+ raddr_b = inst->alu.mul.b.raddr;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (i == 0 && raddr_a == index)
+ return true;
+ if (i == 1 && raddr_b == index)
+ return true;
}
return false;
@@ -2130,6 +2565,8 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
struct qblock *block,
struct v3d_qpu_instr *inst)
{
+ const struct v3d_device_info *devinfo = c->devinfo;
+
/* We only call this if we have successfully merged an ldvary into a
* previous instruction.
*/
@@ -2142,9 +2579,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
* the ldvary destination, if it does, then moving the ldvary before
* it would overwrite it.
*/
- if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
+ if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
return false;
- if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
+ if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
return false;
/* The implicit ldvary destination may not be written to by a signal
@@ -2180,13 +2617,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
}
/* The previous instruction cannot have a conflicting signal */
- if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
+ if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
return false;
uint32_t sig;
struct v3d_qpu_sig new_sig = prev->qpu.sig;
new_sig.ldvary = true;
- if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig))
+ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
return false;
/* The previous instruction cannot use flags since ldvary uses the
@@ -2199,9 +2636,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
/* We can't put an ldvary in the delay slots of a thrsw. We should've
* prevented this when pairing up the ldvary with another instruction
- * and flagging it for a fixup.
+ * and flagging it for a fixup. In V3D 7.x this is limited only to the
+ * second delay slot.
*/
- assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
+ assert((devinfo->ver <= 42 &&
+ scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
+ (devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
/* Move the ldvary to the previous instruction and remove it from the
* current one.
@@ -2215,14 +2656,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
inst->sig_magic = false;
inst->sig_addr = 0;
- /* By moving ldvary to the previous instruction we make it update
- * r5 in the current one, so nothing else in it should write r5.
- * This should've been prevented by our dependency tracking, which
+ /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
+ if (devinfo->ver >= 71) {
+ scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+ }
+
+ /* By moving ldvary to the previous instruction we make it update r5
+ * (rf0 for ver >= 71) in the current one, so nothing else in it
+ * should write this register.
+ *
+ * This should've been prevented by our dependency tracking, which
* would not allow ldvary to be paired up with an instruction that
- * writes r5 (since our dependency tracking doesn't know that the
- * ldvary write r5 happens in the next instruction).
+ * writes r5/rf0 (since our dependency tracking doesn't know that the
+ * ldvary write to r5/rf0 happens in the next instruction).
*/
- assert(!v3d_qpu_writes_r5(c->devinfo, inst));
+ assert(!v3d_qpu_writes_r5(devinfo, inst));
+ assert(devinfo->ver <= 42 ||
+ (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+ !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));
return true;
}
@@ -2313,7 +2765,7 @@ schedule_instructions(struct v3d_compile *c,
}
}
}
- if (mux_read_stalls(scoreboard, inst))
+ if (read_stalls(c->devinfo, scoreboard, inst))
c->qpu_inst_stalled_count++;
}
@@ -2538,6 +2990,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
scoreboard.last_setmsf_tick = -10;
scoreboard.last_stallable_sfu_tick = -10;
scoreboard.first_ldtmu_after_thrsw = true;
+ scoreboard.last_implicit_rf0_write_tick = -10;
if (debug) {
fprintf(stderr, "Pre-schedule instructions\n");
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
index 2cc7a0eb0ae..0466ee5d0b6 100644
--- a/src/broadcom/compiler/qpu_validate.c
+++ b/src/broadcom/compiler/qpu_validate.c
@@ -41,6 +41,7 @@ struct v3d_qpu_validate_state {
int last_sfu_write;
int last_branch_ip;
int last_thrsw_ip;
+ int first_tlb_z_write;
/* Set when we've found the last-THRSW signal, or if we were started
* in single-segment mode.
@@ -110,11 +111,58 @@ static void
qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
{
const struct v3d_device_info *devinfo = state->c->devinfo;
+
+ if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write)
+ state->first_tlb_z_write = state->ip;
+
const struct v3d_qpu_instr *inst = &qinst->qpu;
+ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
+ state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write &&
+ inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
+ fail_instr(state, "Implicit branch MSF read after TLB Z write");
+ }
+
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return;
+ if (inst->alu.add.op == V3D_QPU_A_SETMSF &&
+ state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write) {
+ fail_instr(state, "SETMSF after TLB Z write");
+ }
+
+ if (state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write &&
+ inst->alu.add.op == V3D_QPU_A_MSF) {
+ fail_instr(state, "MSF read after TLB Z write");
+ }
+
+ if (devinfo->ver < 71) {
+ if (inst->sig.small_imm_a || inst->sig.small_imm_c ||
+ inst->sig.small_imm_d) {
+ fail_instr(state, "small imm a/c/d added after V3D 7.1");
+ }
+ } else {
+ if ((inst->sig.small_imm_a || inst->sig.small_imm_b) &&
+ !vir_is_add(qinst)) {
+ fail_instr(state, "small imm a/b used but no ADD inst");
+ }
+ if ((inst->sig.small_imm_c || inst->sig.small_imm_d) &&
+ !vir_is_mul(qinst)) {
+ fail_instr(state, "small imm c/d used but no MUL inst");
+ }
+ if (inst->sig.small_imm_a + inst->sig.small_imm_b +
+ inst->sig.small_imm_c + inst->sig.small_imm_d > 1) {
+ fail_instr(state, "only one small immediate can be "
+ "enabled per instruction");
+ }
+ }
+
/* LDVARY writes r5 two instructions later and LDUNIF writes
* r5 one instruction later, which is illegal to have
* together.
@@ -194,8 +242,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
"SFU write started during THRSW delay slots ");
}
- if (inst->sig.ldvary)
- fail_instr(state, "LDVARY during THRSW delay slots");
+ if (inst->sig.ldvary) {
+ if (devinfo->ver <= 42)
+ fail_instr(state, "LDVARY during THRSW delay slots");
+ if (devinfo->ver >= 71 &&
+ state->ip - state->last_thrsw_ip == 2) {
+ fail_instr(state, "LDVARY in 2nd THRSW delay slot");
+ }
+ }
}
(void)qpu_magic_waddr_matches; /* XXX */
@@ -222,7 +276,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
vpm_writes +
tlb_writes +
tsy_writes +
- inst->sig.ldtmu +
+ (devinfo->ver <= 42 ? inst->sig.ldtmu : 0) +
inst->sig.ldtlb +
inst->sig.ldvpm +
inst->sig.ldtlbu > 1) {
@@ -262,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if ((inst->alu.add.op != V3D_QPU_A_NOP &&
!inst->alu.add.magic_write)) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver <= 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71) {
+ if (state->last_thrsw_ip - state->ip == 0) {
+ fail_instr(state,
+ "ADD RF write at THREND");
+ }
+ if (inst->alu.add.waddr == 2 ||
+ inst->alu.add.waddr == 3) {
+ fail_instr(state,
+ "RF2-3 write after THREND");
+ }
+ }
}
if ((inst->alu.mul.op != V3D_QPU_M_NOP &&
!inst->alu.mul.magic_write)) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver <= 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71) {
+ if (state->last_thrsw_ip - state->ip == 0) {
+ fail_instr(state,
+ "MUL RF write at THREND");
+ }
+
+ if (inst->alu.mul.waddr == 2 ||
+ inst->alu.mul.waddr == 3) {
+ fail_instr(state,
+ "RF2-3 write after THREND");
+ }
+ }
}
if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
!inst->sig_magic) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver <= 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71 &&
+ (inst->sig_addr == 2 ||
+ inst->sig_addr == 3)) {
+ fail_instr(state, "RF2-3 write after THREND");
+ }
}
/* GFXH-1625: No TMUWT in the last instruction */
@@ -321,6 +406,7 @@ qpu_validate(struct v3d_compile *c)
.last_sfu_write = -10,
.last_thrsw_ip = -10,
.last_branch_ip = -10,
+ .first_tlb_z_write = INT_MAX,
.ip = 0,
.last_thrsw_found = !c->last_thrsw,
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 9f4129870e1..b437b5f5168 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -613,6 +613,11 @@ struct v3d_ra_node_info {
struct {
uint32_t priority;
uint8_t class_bits;
+ bool is_program_end;
+ bool unused;
+
+ /* V3D 7.x */
+ bool is_ldunif_dst;
} *info;
uint32_t alloc_count;
};
@@ -1150,8 +1155,8 @@ bool vir_is_raw_mov(struct qinst *inst);
bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst);
bool vir_is_add(struct qinst *inst);
bool vir_is_mul(struct qinst *inst);
-bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
-bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
+bool vir_writes_r3_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst);
+bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst);
struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
uint8_t vir_channels_written(struct qinst *inst);
struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
@@ -1184,7 +1189,7 @@ bool v3d_nir_lower_line_smooth(nir_shader *shader);
bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
bool v3d_nir_lower_scratch(nir_shader *s);
bool v3d_nir_lower_txf_ms(nir_shader *s);
-bool v3d_nir_lower_image_load_store(nir_shader *s);
+bool v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c);
bool v3d_nir_lower_load_store_bitsize(nir_shader *s);
void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components);
@@ -1425,6 +1430,20 @@ VIR_SFU(LOG)
VIR_SFU(SIN)
VIR_SFU(RSQRT2)
+VIR_A_ALU2(VPACK)
+VIR_A_ALU2(V8PACK)
+VIR_A_ALU2(V10PACK)
+VIR_A_ALU2(V11FPACK)
+
+VIR_M_ALU1(FTOUNORM16)
+VIR_M_ALU1(FTOSNORM16)
+
+VIR_M_ALU1(VFTOUNORM8)
+VIR_M_ALU1(VFTOSNORM8)
+
+VIR_M_ALU1(VFTOUNORM10LO)
+VIR_M_ALU1(VFTOUNORM10HI)
+
static inline struct qinst *
vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
struct qreg dest, struct qreg src)
diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
index 2900a29817f..bbb55be4a14 100644
--- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
+++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
@@ -40,6 +40,10 @@
* calculations and load/store using the TMU general memory access path.
*/
+static const unsigned bits_8[4] = {8, 8, 8, 8};
+static const unsigned bits_16[4] = {16, 16, 16, 16};
+static const unsigned bits_1010102[4] = {10, 10, 10, 2};
+
bool
v3d_gl_format_is_return_32(enum pipe_format format)
{
@@ -59,6 +63,8 @@ v3d_gl_format_is_return_32(enum pipe_format format)
/* Packs a 32-bit vector of colors in the range [0, (1 << bits[i]) - 1] to a
* 32-bit SSA value, with as many channels as necessary to store all the bits
+ *
+ * This is the generic helper, which uses only common nir operations.
*/
static nir_ssa_def *
pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
@@ -91,8 +97,185 @@ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
return nir_vec(b, results, DIV_ROUND_UP(offset, 32));
}
+/* Utility wrapper: half_2x16_split is mapped to vfpack, and sometimes it is
+ * just easier to read vfpack in the code, especially when using the PRM as
+ * a reference.
+ */
+static nir_ssa_def *
+nir_vfpack(nir_builder *b, nir_ssa_def *p1, nir_ssa_def *p2)
+{
+ return nir_pack_half_2x16_split(b, p1, p2);
+}
+
+static inline nir_ssa_def *
+pack_11f11f10f(nir_builder *b, nir_ssa_def *color)
+{
+ nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ /* FIXME: we noted that we could just use p2 again as the second
+ * element to pack, and CTS tests still works. Just using undef as is
+ * slightly more correct
+ */
+ nir_ssa_def *undef = nir_ssa_undef(b, 1, color->bit_size);
+ nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef);
+
+ return nir_v11fpack_v3d(b, p1, p2);
+}
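+
+/* Resulting layout (sketch, assuming the usual R11G11B10F packing): bits
+ * 0..10 hold R as an unsigned 11-bit float, bits 11..21 hold G, and bits
+ * 22..31 hold B as a 10-bit float; v11fpack takes the (R,G) and (B,undef)
+ * half-float pairs and produces that single dword.
+ */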
+
+static inline nir_ssa_def *
+pack_r10g10b10a2_uint(nir_builder *b, nir_ssa_def *color)
+{
+ nir_ssa_def *p1 = nir_vpack_v3d(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ nir_ssa_def *p2 = nir_vpack_v3d(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+
+ return nir_v10pack_v3d(b, p1, p2);
+}
+
+static inline nir_ssa_def *
+pack_r10g10b10a2_unorm(nir_builder *b, nir_ssa_def *color)
+{
+ nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ p1 = nir_vftounorm10lo_v3d(b, p1);
+
+ nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ p2 = nir_vftounorm10hi_v3d(b, p2);
+
+ return nir_v10pack_v3d(b, p1, p2);
+}
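+
+/* Sketch of the unorm path above: vfpack first builds the (x,y) and (z,w)
+ * f16 pairs, vftounorm10lo/hi convert them to the 10-bit (and, for alpha,
+ * 2-bit) unorm ranges, and v10pack merges both halves into the final
+ * 2:10:10:10 dword.
+ */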
+
+enum hw_conversion {
+ NONE,
+ TO_SNORM,
+ TO_UNORM
+};
+
+static inline nir_ssa_def *
+pack_8bit(nir_builder *b, nir_ssa_def *color,
+ unsigned num_components,
+ enum hw_conversion conversion)
+{
+ /* Note that usually you should not use this method (which relies on
+ * custom packing) for 1 component if we are not doing any
+ * conversion. But we also support that case, and let the caller
+ * decide which method to use.
+ */
+ nir_ssa_def *p1;
+ nir_ssa_def *p2;
+
+ if (conversion == NONE) {
+ p1 = nir_vpack_v3d(b, nir_channel(b, color, 0),
+ nir_channel(b, color, num_components == 1 ? 0 : 1));
+ } else {
+ p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, num_components == 1 ? 0 : 1));
+ p1 = (conversion == TO_UNORM) ?
+ nir_vftounorm8_v3d(b, p1) : nir_vftosnorm8_v3d(b, p1);
+ }
+ if (num_components == 4) {
+ if (conversion == NONE) {
+ p2 = nir_vpack_v3d(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ } else {
+ p2 = nir_vfpack(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ p2 = (conversion == TO_UNORM) ?
+ nir_vftounorm8_v3d(b, p2) : nir_vftosnorm8_v3d(b, p2);
+ }
+ } else {
+ /* As mentioned in the comment above, using an undef here
+ * would be more correct. But in this case we get worse
+ * values, and in fact even a somewhat worse instruction count
+ * in some CTS tests, so we just reuse the first packing.
+ */
+ p2 = p1;
+ }
+
+ return nir_v8pack_v3d(b, p1, p2);
+}
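+
+/* For example (sketch): a vec4 store to an rgba8 unorm image becomes
+ * vfpack(r,g) -> vftounorm8 -> p1 and vfpack(b,a) -> vftounorm8 -> p2,
+ * and v8pack(p1, p2) then yields the four bytes in a single dword.
+ */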
+
+static inline nir_ssa_def *
+pack_16bit(nir_builder *b, nir_ssa_def *color,
+ unsigned num_components,
+ enum hw_conversion conversion)
+{
+ nir_ssa_def *results[2];
+ nir_ssa_def *channels[4];
+
+ /* Note that usually you should not use this method (which relies on
+ * custom packing) if we are not doing any conversion. But we also
+ * support that case, and let the caller decide which method to use.
+ */
+
+ for (unsigned i = 0; i < num_components; i++) {
+ channels[i] = nir_channel(b, color, i);
+ switch (conversion) {
+ case TO_SNORM:
+ channels[i] = nir_ftosnorm16_v3d(b, channels[i]);
+ break;
+ case TO_UNORM:
+ channels[i] = nir_ftounorm16_v3d(b, channels[i]);
+ break;
+ default:
+ break;
+ }
+ }
+
+ switch (num_components) {
+ case 1:
+ results[0] = channels[0];
+ break;
+ case 4:
+ results[1] = nir_vpack_v3d(b, channels[2], channels[3]);
+ FALLTHROUGH;
+ case 2:
+ results[0] = nir_vpack_v3d(b, channels[0], channels[1]);
+ break;
+ }
+
+ return nir_vec(b, results, DIV_ROUND_UP(num_components, 2));
+}
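+
+/* E.g. (sketch): a vec4 rgba16 store becomes two dwords, vpack(x,y) and
+ * vpack(z,w), with an optional per-channel ftounorm16/ftosnorm16
+ * conversion beforehand; a vec2 store needs only the first dword.
+ */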
+
+static inline nir_ssa_def *
+pack_xbit(nir_builder *b, nir_ssa_def *color,
+ unsigned num_components,
+ const struct util_format_channel_description *r_chan)
+{
+ bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED);
+ enum hw_conversion conversion = NONE;
+ if (r_chan->normalized) {
+ conversion =
+ (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM;
+ }
+
+ switch (r_chan->size) {
+ case 8:
+ if (conversion == NONE && num_components < 2)
+ return pack_bits(b, color, bits_8, num_components, pack_mask);
+ else
+ return pack_8bit(b, color, num_components, conversion);
+ break;
+ case 16:
+ /* pack_mask implies that the generic packing method would
+ * need to include extra operations to handle negative values,
+ * so in that case, even without a conversion, it is better to
+ * use the packing based on the custom hw operations.
+ */
+ if (conversion == NONE && !pack_mask)
+ return pack_bits(b, color, bits_16, num_components, pack_mask);
+ else
+ return pack_16bit(b, color, num_components, conversion);
+ break;
+ default:
+ unreachable("unrecognized bits");
+ }
+}
+
static bool
-v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
+v3d_nir_lower_image_store_v42(nir_builder *b, nir_intrinsic_instr *instr)
{
enum pipe_format format = nir_intrinsic_format(instr);
assert(format != PIPE_FORMAT_NONE);
@@ -118,9 +301,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
*/
formatted = color;
} else {
- static const unsigned bits_8[4] = {8, 8, 8, 8};
- static const unsigned bits_16[4] = {16, 16, 16, 16};
- static const unsigned bits_1010102[4] = {10, 10, 10, 2};
const unsigned *bits;
switch (r_chan->size) {
@@ -171,6 +351,52 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
return true;
}
+
+static bool
+v3d_nir_lower_image_store_v71(nir_builder *b, nir_intrinsic_instr *instr)
+{
+ enum pipe_format format = nir_intrinsic_format(instr);
+ assert(format != PIPE_FORMAT_NONE);
+ const struct util_format_description *desc =
+ util_format_description(format);
+ const struct util_format_channel_description *r_chan = &desc->channel[0];
+ unsigned num_components = util_format_get_nr_components(format);
+ b->cursor = nir_before_instr(&instr->instr);
+
+ nir_ssa_def *color = nir_channels(b,
+ nir_ssa_for_src(b, instr->src[3], 4),
+ (1 << num_components) - 1);
+ nir_ssa_def *formatted = NULL;
+ if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+ formatted = nir_format_pack_r9g9b9e5(b, color);
+ } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
+ formatted = pack_11f11f10f(b, color);
+ } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) {
+ formatted = pack_r10g10b10a2_uint(b, color);
+ } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) {
+ formatted = pack_r10g10b10a2_unorm(b, color);
+ } else if (r_chan->size == 32) {
+ /* For 32-bit formats, we just have to move the vector
+ * across (possibly reducing the number of channels).
+ */
+ formatted = color;
+ } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) {
+ assert(r_chan->size == 16);
+ formatted = nir_format_float_to_half(b, color);
+ formatted = pack_bits(b, formatted, bits_16, num_components,
+ false);
+ } else {
+ assert(r_chan->size == 8 || r_chan->size == 16);
+ formatted = pack_xbit(b, color, num_components, r_chan);
+ }
+
+ nir_instr_rewrite_src(&instr->instr, &instr->src[3],
+ nir_src_for_ssa(formatted));
+ instr->num_components = formatted->num_components;
+
+ return true;
+}
+
static bool
v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr)
{
@@ -215,11 +441,17 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b,
nir_intrinsic_instr *intr =
nir_instr_as_intrinsic(instr);
+ struct v3d_compile *c = (struct v3d_compile *) _state;
+
switch (intr->intrinsic) {
case nir_intrinsic_image_load:
return v3d_nir_lower_image_load(b, intr);
case nir_intrinsic_image_store:
- return v3d_nir_lower_image_store(b, intr);
+ if (c->devinfo->ver >= 71)
+ return v3d_nir_lower_image_store_v71(b, intr);
+ else
+ return v3d_nir_lower_image_store_v42(b, intr);
+ break;
default:
return false;
}
@@ -228,9 +460,9 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b,
}
bool
-v3d_nir_lower_image_load_store(nir_shader *s)
+v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c)
{
return nir_shader_instructions_pass(s, v3d_nir_lower_image_load_store_cb,
nir_metadata_block_index |
- nir_metadata_dominance, NULL);
+ nir_metadata_dominance, c);
}
diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c
index 69929a145aa..8a314c8b5a9 100644
--- a/src/broadcom/compiler/v3d_nir_lower_io.c
+++ b/src/broadcom/compiler/v3d_nir_lower_io.c
@@ -600,9 +600,13 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
* The correct fix for this as recommended by Broadcom
* is to convert to .8 fixed-point with ffloor().
*/
- pos = nir_f2i32(b, nir_ffloor(b, pos));
- v3d_nir_store_output(b, state->vp_vpm_offset + i,
- offset_reg, pos);
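+ /* From 7.x the conversion uses round-to-nearest-even
+ * instead of the ffloor() convention described above.
+ */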
+ if (c->devinfo->ver <= 42)
+ pos = nir_f2i32(b, nir_ffloor(b, pos));
+ else
+ pos = nir_f2i32(b, nir_fround_even(b, pos));
+
+ v3d_nir_store_output(b, state->vp_vpm_offset + i,
+ offset_reg, pos);
}
}
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 499215454c0..192872f368c 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -113,10 +113,10 @@ vir_is_raw_mov(struct qinst *inst)
return false;
}
- if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
+ if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) {
return false;
}
@@ -156,8 +156,12 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
}
bool
-vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
+vir_writes_r3_implicitly(const struct v3d_device_info *devinfo,
+ struct qinst *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
for (int i = 0; i < vir_get_nsrc(inst); i++) {
switch (inst->src[i].file) {
case QFILE_VPM:
@@ -178,8 +182,12 @@ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
}
bool
-vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
+vir_writes_r4_implicitly(const struct v3d_device_info *devinfo,
+ struct qinst *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
switch (inst->dst.file) {
case QFILE_MAGIC:
switch (inst->dst.index) {
@@ -209,15 +217,15 @@ vir_set_unpack(struct qinst *inst, int src,
if (vir_is_add(inst)) {
if (src == 0)
- inst->qpu.alu.add.a_unpack = unpack;
+ inst->qpu.alu.add.a.unpack = unpack;
else
- inst->qpu.alu.add.b_unpack = unpack;
+ inst->qpu.alu.add.b.unpack = unpack;
} else {
assert(vir_is_mul(inst));
if (src == 0)
- inst->qpu.alu.mul.a_unpack = unpack;
+ inst->qpu.alu.mul.a.unpack = unpack;
else
- inst->qpu.alu.mul.b_unpack = unpack;
+ inst->qpu.alu.mul.b.unpack = unpack;
}
}
@@ -737,6 +745,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c,
/* Set us up for shared input/output segments. This is apparently
* necessary for our VCM setup to avoid varying corruption.
+ *
+ * FIXME: initial testing on V3D 7.1 seems to work fine with
+ * separate segments, so we could reevaluate this in the future
+ * if separate segments turn out to offer any advantage.
*/
prog_data->separate_segments = false;
prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
@@ -1572,7 +1584,7 @@ v3d_attempt_compile(struct v3d_compile *c)
NIR_PASS(_, c->s, v3d_nir_lower_io, c);
NIR_PASS(_, c->s, v3d_nir_lower_txf_ms);
- NIR_PASS(_, c->s, v3d_nir_lower_image_load_store);
+ NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c);
NIR_PASS(_, c->s, nir_opt_idiv_const, 8);
nir_lower_idiv_options idiv_options = {
diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c
index 5c47bbdc1b0..ab5d4043039 100644
--- a/src/broadcom/compiler/vir_dump.c
+++ b/src/broadcom/compiler/vir_dump.c
@@ -270,8 +270,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
vir_print_reg(c, inst, inst->dst);
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack));
- unpack[0] = instr->alu.add.a_unpack;
- unpack[1] = instr->alu.add.b_unpack;
+ unpack[0] = instr->alu.add.a.unpack;
+ unpack[1] = instr->alu.add.b.unpack;
} else {
fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op));
fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc));
@@ -282,8 +282,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
vir_print_reg(c, inst, inst->dst);
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack));
- unpack[0] = instr->alu.mul.a_unpack;
- unpack[1] = instr->alu.mul.b_unpack;
+ unpack[0] = instr->alu.mul.a.unpack;
+ unpack[1] = instr->alu.mul.b.unpack;
}
for (int i = 0; i < nsrc; i++) {
diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c
index 2fd6430a0f4..2907de9049f 100644
--- a/src/broadcom/compiler/vir_live_variables.c
+++ b/src/broadcom/compiler/vir_live_variables.c
@@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c)
flags_inst = NULL;
}
- /* Payload registers: r0/1/2 contain W, centroid W,
- * and Z at program start. Register allocation will
- * force their nodes to R0/1/2.
+ /* Payload registers: for fragment shaders, W,
+ * centroid W, and Z are initialized in r0/1/2
+ * until v42, and in r1/r2/r3 from v71.
+ *
+ * For compute shaders, the payload lives in r0/r2
+ * until v42, and in r3/r2 from v71.
+ *
+ * Register allocation will force their nodes to those
+ * registers.
*/
if (inst->src[0].file == QFILE_REG) {
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
+ uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0;
+ uint32_t max_payload_r = c->devinfo->ver >= 71 ? 3 : 2;
+ if (inst->src[0].index >= min_payload_r &&
+ inst->src[0].index <= max_payload_r) {
c->temp_start[inst->dst.index] = 0;
- break;
}
}
diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c
index da121c2a5bd..1260838ca05 100644
--- a/src/broadcom/compiler/vir_opt_copy_propagate.c
+++ b/src/broadcom/compiler/vir_opt_copy_propagate.c
@@ -35,7 +35,7 @@
#include "v3d_compiler.h"
static bool
-is_copy_mov(struct qinst *inst)
+is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst)
{
if (!inst)
return false;
@@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst)
return false;
}
- switch (inst->src[0].file) {
- case QFILE_MAGIC:
- /* No copy propagating from R3/R4/R5 -- the MOVs from those
- * are there to register allocate values produced into R3/4/5
- * to other regs (though hopefully r3/4/5).
- */
- switch (inst->src[0].index) {
- case V3D_QPU_WADDR_R3:
- case V3D_QPU_WADDR_R4:
- case V3D_QPU_WADDR_R5:
- return false;
+ if (devinfo->ver <= 42) {
+ switch (inst->src[0].file) {
+ case QFILE_MAGIC:
+ /* No copy propagating from R3/R4/R5 -- the MOVs from
+ * those are there to register allocate values produced
+ * into R3/4/5 to other regs (though hopefully r3/4/5).
+ */
+ switch (inst->src[0].index) {
+ case V3D_QPU_WADDR_R3:
+ case V3D_QPU_WADDR_R4:
+ case V3D_QPU_WADDR_R5:
+ return false;
+ default:
+ break;
+ }
+ break;
+
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ case 0:
+ case 1:
+ case 2:
+ /* MOVs from rf0/1/2 are only to track the live
+ * intervals for W/centroid W/Z.
+ */
+ return false;
+ }
+ break;
+
default:
break;
}
- break;
-
- case QFILE_REG:
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
- /* MOVs from rf0/1/2 are only to track the live
+ } else {
+ assert(devinfo->ver >= 71);
+ switch (inst->src[0].file) {
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ /* MOVs from rf1/2/3 are only to track the live
* intervals for W/centroid W/Z.
+ *
+ * Note: rf0 can be implicitly written by ldvary
+ * (no temp involved), so it is not an SSA value and
+ * could clash with writes to other temps that are
+ * also allocated to rf0. In theory, that would mean
+ * that we can't copy propagate from it, but we handle
+ * this at register allocation time, preventing temps
+ * from being allocated to rf0 while the rf0 value from
+ * ldvary is still live.
*/
- return false;
- }
- break;
+ case 1:
+ case 2:
+ case 3:
+ return false;
+ }
+ break;
- default:
- break;
+ default:
+ break;
+ }
}
return true;
@@ -104,14 +133,14 @@ vir_has_unpack(struct qinst *inst, int chan)
if (vir_is_add(inst)) {
if (chan == 0)
- return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE;
else
- return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE;
} else {
if (chan == 0)
- return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE;
else
- return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE;
}
}
@@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
*/
struct qinst *mov = movs[inst->src[i].index];
if (!mov) {
- if (!is_copy_mov(c->defs[inst->src[i].index]))
+ if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index]))
continue;
mov = c->defs[inst->src[i].index];
@@ -161,7 +190,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
continue;
/* these ops can't represent abs. */
- if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) {
+ if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) {
switch (inst->qpu.alu.add.op) {
case V3D_QPU_A_VFPACK:
case V3D_QPU_A_FROUND:
@@ -189,7 +218,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
inst->src[i] = mov->src[0];
if (vir_has_unpack(mov, 0)) {
- enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack;
+ enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack;
vir_set_unpack(inst, i, unpack);
}
@@ -245,7 +274,7 @@ vir_opt_copy_propagate(struct v3d_compile *c)
apply_kills(c, movs, inst);
- if (is_copy_mov(inst))
+ if (is_copy_mov(c->devinfo, inst))
movs[inst->dst.index] = inst;
}
}
diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c
index c7896d57f2b..6b61ed6a39a 100644
--- a/src/broadcom/compiler/vir_opt_redundant_flags.c
+++ b/src/broadcom/compiler/vir_opt_redundant_flags.c
@@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b)
a->qpu.flags.mpf != b->qpu.flags.mpf ||
a->qpu.alu.add.op != b->qpu.alu.add.op ||
a->qpu.alu.mul.op != b->qpu.alu.mul.op ||
- a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack ||
- a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack ||
+ a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack ||
+ a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack ||
a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack ||
- a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack ||
- a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack ||
+ a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack ||
+ a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack ||
a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) {
return false;
}
diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c
index 47d7722968d..ed5bc011964 100644
--- a/src/broadcom/compiler/vir_opt_small_immediates.c
+++ b/src/broadcom/compiler/vir_opt_small_immediates.c
@@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c)
/* The small immediate value sits in the raddr B field, so we
* can't have 2 small immediates in one instruction (unless
* they're the same value, but that should be optimized away
- * elsewhere).
+ * elsewhere). Since 7.x we can encode small immediates in
+ * any raddr field, but each instruction can still only use
+ * one.
*/
bool uses_small_imm = false;
for (int i = 0; i < vir_get_nsrc(inst); i++) {
@@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c)
*/
struct v3d_qpu_sig new_sig = inst->qpu.sig;
uint32_t sig_packed;
- new_sig.small_imm = true;
+ if (c->devinfo->ver <= 42) {
+ new_sig.small_imm_b = true;
+ } else {
+ if (vir_is_add(inst)) {
+ if (i == 0)
+ new_sig.small_imm_a = true;
+ else
+ new_sig.small_imm_b = true;
+ } else {
+ if (i == 0)
+ new_sig.small_imm_c = true;
+ else
+ new_sig.small_imm_d = true;
+ }
+ }
+
if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed))
continue;
@@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c)
vir_dump_inst(c, inst);
fprintf(stderr, "\n");
}
- inst->qpu.sig.small_imm = true;
+ inst->qpu.sig.small_imm_a = new_sig.small_imm_a;
+ inst->qpu.sig.small_imm_b = new_sig.small_imm_b;
+ inst->qpu.sig.small_imm_c = new_sig.small_imm_c;
+ inst->qpu.sig.small_imm_d = new_sig.small_imm_d;
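+ /* Note that even on 7.x the packed immediate is kept in
+ * raddr_b here; v3d71_set_src() in vir_to_qpu.c later moves
+ * it into the raddr selected by the small_imm_* signal.
+ */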
inst->qpu.raddr_b = packed;
inst->src[i].file = QFILE_SMALL_IMM;
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index b22f915d1df..8eac2b75bd7 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -28,41 +28,73 @@
#define ACC_INDEX 0
#define ACC_COUNT 6
-#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
-#define PHYS_COUNT 64
+/* RA nodes used to track RF registers with implicit writes */
+#define IMPLICIT_RF_COUNT 1
+
+#define PHYS_COUNT 64
+
+static uint8_t
+get_phys_index(const struct v3d_device_info *devinfo)
+{
+ if (devinfo->has_accumulators)
+ return ACC_INDEX + ACC_COUNT;
+ else
+ return 0;
+}
+
+/* ACC refers to the accumulator registers. */
#define CLASS_BITS_PHYS (1 << 0)
#define CLASS_BITS_ACC (1 << 1)
#define CLASS_BITS_R5 (1 << 4)
-#define CLASS_BITS_ANY (CLASS_BITS_PHYS | \
- CLASS_BITS_ACC | \
- CLASS_BITS_R5)
+
+static uint8_t
+get_class_bit_any(const struct v3d_device_info *devinfo)
+{
+ if (devinfo->has_accumulators)
+ return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5);
+ else
+ return CLASS_BITS_PHYS;
+}
+
+static uint8_t
+filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
+{
+ if (!devinfo->has_accumulators) {
+ assert(class_bits & CLASS_BITS_PHYS);
+ class_bits = CLASS_BITS_PHYS;
+ }
+ return class_bits;
+}
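+
+/* RA node layout: the fixed nodes come first (ACC_COUNT accumulator
+ * nodes with accumulators, IMPLICIT_RF_COUNT implicit-RF nodes
+ * without), followed by one node per temp.
+ */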
static inline uint32_t
-temp_to_node(uint32_t temp)
+temp_to_node(struct v3d_compile *c, uint32_t temp)
{
- return temp + ACC_COUNT;
+ return temp + (c->devinfo->has_accumulators ? ACC_COUNT :
+ IMPLICIT_RF_COUNT);
}
static inline uint32_t
-node_to_temp(uint32_t node)
+node_to_temp(struct v3d_compile *c, uint32_t node)
{
- assert(node >= ACC_COUNT);
- return node - ACC_COUNT;
+ assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
+ (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT));
+ return node - (c->devinfo->has_accumulators ? ACC_COUNT :
+ IMPLICIT_RF_COUNT);
}
static inline uint8_t
-get_temp_class_bits(struct v3d_ra_node_info *nodes,
+get_temp_class_bits(struct v3d_compile *c,
uint32_t temp)
{
- return nodes->info[temp_to_node(temp)].class_bits;
+ return c->nodes.info[temp_to_node(c, temp)].class_bits;
}
static inline void
-set_temp_class_bits(struct v3d_ra_node_info *nodes,
+set_temp_class_bits(struct v3d_compile *c,
uint32_t temp, uint8_t class_bits)
{
- nodes->info[temp_to_node(temp)].class_bits = class_bits;
+ c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits;
}
static struct ra_class *
@@ -71,11 +103,13 @@ choose_reg_class(struct v3d_compile *c, uint8_t class_bits)
if (class_bits == CLASS_BITS_PHYS) {
return c->compiler->reg_class_phys[c->thread_index];
} else if (class_bits == (CLASS_BITS_R5)) {
+ assert(c->devinfo->has_accumulators);
return c->compiler->reg_class_r5[c->thread_index];
} else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) {
+ assert(c->devinfo->has_accumulators);
return c->compiler->reg_class_phys_or_acc[c->thread_index];
} else {
- assert(class_bits == CLASS_BITS_ANY);
+ assert(class_bits == get_class_bit_any(c->devinfo));
return c->compiler->reg_class_any[c->thread_index];
}
}
@@ -84,7 +118,7 @@ static inline struct ra_class *
choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp)
{
assert(temp < c->num_temps && temp < c->nodes.alloc_count);
- return choose_reg_class(c, get_temp_class_bits(&c->nodes, temp));
+ return choose_reg_class(c, get_temp_class_bits(c, temp));
}
static inline bool
@@ -313,7 +347,7 @@ v3d_choose_spill_node(struct v3d_compile *c)
for (unsigned i = 0; i < c->num_temps; i++) {
if (BITSET_TEST(c->spillable, i)) {
- ra_set_node_spill_cost(c->g, temp_to_node(i),
+ ra_set_node_spill_cost(c->g, temp_to_node(c, i),
spill_costs[i]);
}
}
@@ -331,7 +365,8 @@ ensure_nodes(struct v3d_compile *c)
c->nodes.info = reralloc_array_size(c,
c->nodes.info,
sizeof(c->nodes.info[0]),
- c->nodes.alloc_count + ACC_COUNT);
+ c->nodes.alloc_count +
+ MAX2(ACC_COUNT, IMPLICIT_RF_COUNT));
}
/* Creates the interference node for a new temp. We use this to keep the node
@@ -343,11 +378,15 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
ensure_nodes(c);
int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
- assert(node == temp + ACC_COUNT);
+ assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT :
+ node == temp + IMPLICIT_RF_COUNT);
/* We fill the node priority after we are done inserting spills */
c->nodes.info[node].class_bits = class_bits;
c->nodes.info[node].priority = 0;
+ c->nodes.info[node].is_ldunif_dst = false;
+ c->nodes.info[node].is_program_end = false;
+ c->nodes.info[node].unused = false;
}
/* The spill offset for this thread takes a bit of setup, so do it once at
@@ -395,8 +434,10 @@ v3d_setup_spill_base(struct v3d_compile *c)
*/
if (c->spilling) {
int temp_class = CLASS_BITS_PHYS;
- if (i != c->spill_base.index)
+ if (c->devinfo->has_accumulators &&
+ i != c->spill_base.index) {
temp_class |= CLASS_BITS_ACC;
+ }
add_node(c, i, temp_class);
}
}
@@ -436,7 +477,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
*/
assert(c->disable_ldunif_opt);
struct qreg offset = vir_uniform_ui(c, spill_offset);
- add_node(c, offset.index, CLASS_BITS_ANY);
+ add_node(c, offset.index, get_class_bit_any(c->devinfo));
/* We always enable per-quad on spills/fills to ensure we spill
* any channels involved with helper invocations.
@@ -455,14 +496,16 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
* temp will be used immediately so just like the uniform above we
* can allow accumulators.
*/
+ int temp_class =
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC);
if (!fill_dst) {
struct qreg dst = vir_TMUWT(c);
assert(dst.file == QFILE_TEMP);
- add_node(c, dst.index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
+ add_node(c, dst.index, temp_class);
} else {
*fill_dst = vir_LDTMU(c);
assert(fill_dst->file == QFILE_TEMP);
- add_node(c, fill_dst->index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
+ add_node(c, fill_dst->index, temp_class);
}
/* Temps across the thread switch we injected can't be assigned to
@@ -482,7 +525,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
c->temp_start[i] < ip && c->temp_end[i] >= ip :
c->temp_start[i] <= ip && c->temp_end[i] > ip;
if (thrsw_cross) {
- ra_set_node_class(c->g, temp_to_node(i),
+ ra_set_node_class(c->g, temp_to_node(c, i),
choose_reg_class(c, CLASS_BITS_PHYS));
}
}
@@ -509,8 +552,7 @@ v3d_emit_tmu_spill(struct v3d_compile *c,
* same register class bits as the original.
*/
if (inst == position) {
- uint8_t class_bits = get_temp_class_bits(&c->nodes,
- inst->dst.index);
+ uint8_t class_bits = get_temp_class_bits(c, inst->dst.index);
inst->dst = vir_get_temp(c);
add_node(c, inst->dst.index, class_bits);
} else {
@@ -542,7 +584,8 @@ interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end)
}
static void
-v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
+v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes,
+ int spill_temp)
{
c->spill_start_num_temps = c->num_temps;
c->spilling = true;
@@ -554,8 +597,20 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
spill_offset = c->spill_size;
c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
- if (spill_offset == 0)
+ if (spill_offset == 0) {
v3d_setup_spill_base(c);
+
+ /* Don't allocate our spill base to rf0 to avoid
+ * conflicts with instructions doing implicit writes
+ * to that register.
+ */
+ if (!c->devinfo->has_accumulators) {
+ ra_add_node_interference(
+ c->g,
+ temp_to_node(c, c->spill_base.index),
+ implicit_rf_nodes[0]);
+ }
+ }
}
struct qinst *last_thrsw = c->last_thrsw;
@@ -574,7 +629,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
reconstruct_op = orig_def->qpu.alu.add.op;
}
- uint32_t spill_node = temp_to_node(spill_temp);
+ uint32_t spill_node = temp_to_node(c, spill_temp);
/* We must disable the ldunif optimization if we are spilling uniforms */
bool had_disable_ldunif_opt = c->disable_ldunif_opt;
@@ -635,7 +690,8 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
* instruction immediately after, so
* we can use any register class for it.
*/
- add_node(c, unif.index, CLASS_BITS_ANY);
+ add_node(c, unif.index,
+ get_class_bit_any(c->devinfo));
} else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
struct qreg temp =
reconstruct_temp(c, reconstruct_op);
@@ -644,8 +700,10 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
* instruction immediately after so we
* can use ACC.
*/
- add_node(c, temp.index, CLASS_BITS_PHYS |
- CLASS_BITS_ACC);
+ int temp_class =
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
+ CLASS_BITS_ACC);
+ add_node(c, temp.index, temp_class);
} else {
/* If we have a postponed spill, we
* don't need a fill as the temp would
@@ -739,12 +797,12 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
* update node priorities based one new liveness data.
*/
uint32_t sb_temp =c->spill_base.index;
- uint32_t sb_node = temp_to_node(sb_temp);
+ uint32_t sb_node = temp_to_node(c, sb_temp);
for (uint32_t i = 0; i < c->num_temps; i++) {
if (c->temp_end[i] == -1)
continue;
- uint32_t node_i = temp_to_node(i);
+ uint32_t node_i = temp_to_node(c, i);
c->nodes.info[node_i].priority =
c->temp_end[i] - c->temp_start[i];
@@ -752,7 +810,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
j < c->num_temps; j++) {
if (interferes(c->temp_start[i], c->temp_end[i],
c->temp_start[j], c->temp_end[j])) {
- uint32_t node_j = temp_to_node(j);
+ uint32_t node_j = temp_to_node(c, j);
ra_add_node_interference(c->g, node_i, node_j);
}
}
@@ -771,9 +829,11 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
}
struct v3d_ra_select_callback_data {
+ uint32_t phys_index;
uint32_t next_acc;
uint32_t next_phys;
struct v3d_ra_node_info *nodes;
+ const struct v3d_device_info *devinfo;
};
/* Choosing accumulators improves chances of merging QPU instructions
@@ -785,6 +845,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
BITSET_WORD *regs,
int priority)
{
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
/* Favor accumulators if we have fewer than this number of physical
* registers. Accumulators have more restrictions (like being
* invalidated through thrsw), so running out of physical registers
@@ -794,7 +857,7 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
static const int available_rf_threshold = 5;
int available_rf = 0;
for (int i = 0; i < PHYS_COUNT; i++) {
- if (BITSET_TEST(regs, PHYS_INDEX + i))
+ if (BITSET_TEST(regs, v3d_ra->phys_index + i))
available_rf++;
if (available_rf >= available_rf_threshold)
break;
@@ -820,6 +883,9 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
BITSET_WORD *regs,
unsigned int *out)
{
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
/* Choose r5 for our ldunifs if possible (nobody else can load to that
* reg, and it keeps the QPU cond field free from being occupied by
* ldunifrf).
@@ -849,12 +915,47 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
static bool
v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
+ unsigned int node,
BITSET_WORD *regs,
unsigned int *out)
{
+ /* If this node is for an unused temp, ignore. */
+ if (v3d_ra->nodes->info[node].unused) {
+ *out = 0;
+ return true;
+ }
+
+ /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
+ * so we can avoid turning them into ldunifrf (which uses the
+ * cond field to encode the dst and would prevent merge with
+ * instructions that use cond flags).
+ */
+ if (v3d_ra->nodes->info[node].is_ldunif_dst &&
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
+ assert(v3d_ra->devinfo->ver >= 71);
+ *out = v3d_ra->phys_index;
+ return true;
+ }
+
+ /* The last 3 instructions in a shader can't use some specific registers
+ * (usually early rf registers, depends on v3d version) so try to
+ * avoid allocating these to registers used by the last instructions
+ * in the shader.
+ */
+ const uint32_t safe_rf_start = v3d_ra->devinfo->ver <= 42 ? 3 : 4;
+ if (v3d_ra->nodes->info[node].is_program_end &&
+ v3d_ra->next_phys < safe_rf_start) {
+ v3d_ra->next_phys = safe_rf_start;
+ }
+
for (int i = 0; i < PHYS_COUNT; i++) {
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
- int phys = PHYS_INDEX + phys_off;
+
+ /* Try to keep rf0 available for ldunif in 7.x (see above). */
+ if (v3d_ra->devinfo->ver >= 71 && phys_off == 0)
+ continue;
+
+ int phys = v3d_ra->phys_index + phys_off;
if (BITSET_TEST(regs, phys)) {
v3d_ra->next_phys = phys_off + 1;
@@ -863,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
}
}
+ /* If we couldn't allocate, do try to assign rf0 if it is available. */
+ if (v3d_ra->devinfo->ver >= 71 &&
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
+ v3d_ra->next_phys = 1;
+ *out = v3d_ra->phys_index;
+ return true;
+ }
+
return false;
}
@@ -877,7 +986,7 @@ v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
return reg;
}
- if (v3d_ra_select_rf(v3d_ra, regs, &reg))
+ if (v3d_ra_select_rf(v3d_ra, n, regs, &reg))
return reg;
/* If we ran out of physical registers try to assign an accumulator
@@ -896,8 +1005,9 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
* register file can be divided up for fragment shader threading.
*/
int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);
+ uint8_t phys_index = get_phys_index(compiler->devinfo);
- compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
+ compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT,
false);
if (!compiler->regs)
return false;
@@ -905,31 +1015,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
for (int threads = 0; threads < max_thread_index; threads++) {
compiler->reg_class_any[threads] =
ra_alloc_contig_reg_class(compiler->regs, 1);
- compiler->reg_class_r5[threads] =
- ra_alloc_contig_reg_class(compiler->regs, 1);
- compiler->reg_class_phys_or_acc[threads] =
- ra_alloc_contig_reg_class(compiler->regs, 1);
+ if (compiler->devinfo->has_accumulators) {
+ compiler->reg_class_r5[threads] =
+ ra_alloc_contig_reg_class(compiler->regs, 1);
+ compiler->reg_class_phys_or_acc[threads] =
+ ra_alloc_contig_reg_class(compiler->regs, 1);
+ }
compiler->reg_class_phys[threads] =
ra_alloc_contig_reg_class(compiler->regs, 1);
- for (int i = PHYS_INDEX;
- i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+ /* Init physical regs */
+ for (int i = phys_index;
+ i < phys_index + (PHYS_COUNT >> threads); i++) {
+ if (compiler->devinfo->has_accumulators)
+ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
ra_class_add_reg(compiler->reg_class_phys[threads], i);
ra_class_add_reg(compiler->reg_class_any[threads], i);
}
- for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
- ra_class_add_reg(compiler->reg_class_any[threads], i);
+ /* Init accumulator regs */
+ if (compiler->devinfo->has_accumulators) {
+ for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
+ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+ ra_class_add_reg(compiler->reg_class_any[threads], i);
+ }
+ /* r5 can only store a single 32-bit value, so not much can
+ * use it.
+ */
+ ra_class_add_reg(compiler->reg_class_r5[threads],
+ ACC_INDEX + 5);
+ ra_class_add_reg(compiler->reg_class_any[threads],
+ ACC_INDEX + 5);
}
- /* r5 can only store a single 32-bit value, so not much can
- * use it.
- */
- ra_class_add_reg(compiler->reg_class_r5[threads],
- ACC_INDEX + 5);
- ra_class_add_reg(compiler->reg_class_any[threads],
- ACC_INDEX + 5);
}
ra_set_finalize(compiler->regs, NULL);
@@ -944,7 +1061,10 @@ tmu_spilling_allowed(struct v3d_compile *c)
}
static void
-update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
+ int *acc_nodes,
+ int *implicit_rf_nodes,
+ int last_ldvary_ip,
struct qinst *inst)
{
int32_t ip = inst->ip;
@@ -954,26 +1074,39 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* result to a temp), nothing else can be stored in r3/r4 across
* it.
*/
- if (vir_writes_r3(c->devinfo, inst)) {
+ if (vir_writes_r3_implicitly(c->devinfo, inst)) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
ra_add_node_interference(c->g,
- temp_to_node(i),
+ temp_to_node(c, i),
acc_nodes[3]);
}
}
}
- if (vir_writes_r4(c->devinfo, inst)) {
+ if (vir_writes_r4_implicitly(c->devinfo, inst)) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
ra_add_node_interference(c->g,
- temp_to_node(i),
+ temp_to_node(c, i),
acc_nodes[4]);
}
}
}
+ /* If any instruction writes to a physical register implicitly
+ * nothing else can write the same register across it.
+ */
+ if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ implicit_rf_nodes[0]);
+ }
+ }
+ }
+
if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
switch (inst->qpu.alu.add.op) {
case V3D_QPU_A_LDVPMV_IN:
@@ -987,7 +1120,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* decides whether the LDVPM is in or out)
*/
assert(inst->dst.file == QFILE_TEMP);
- set_temp_class_bits(&c->nodes, inst->dst.index,
+ set_temp_class_bits(c, inst->dst.index,
CLASS_BITS_PHYS);
break;
}
@@ -1002,7 +1135,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* phys regfile.
*/
assert(inst->dst.file == QFILE_TEMP);
- set_temp_class_bits(&c->nodes, inst->dst.index,
+ set_temp_class_bits(c, inst->dst.index,
CLASS_BITS_PHYS);
break;
}
@@ -1015,6 +1148,11 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
if (inst->src[0].file == QFILE_REG) {
switch (inst->src[0].index) {
case 0:
+ /* V3D 7.x doesn't use rf0 for thread payload */
+ if (c->devinfo->ver >= 71)
+ break;
+ else
+ FALLTHROUGH;
case 1:
case 2:
case 3: {
@@ -1024,14 +1162,34 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
*/
assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
assert(inst->dst.file == QFILE_TEMP);
- uint32_t node = temp_to_node(inst->dst.index);
+ uint32_t node = temp_to_node(c, inst->dst.index);
ra_set_node_reg(c->g, node,
- PHYS_INDEX + inst->src[0].index);
+ get_phys_index(c->devinfo) +
+ inst->src[0].index);
break;
}
}
}
+ /* Don't allocate rf0 to temps that cross ranges where we have
+ * live implicit rf0 writes from ldvary. We can identify these
+ * by tracking the last ldvary instruction and explicit reads
+ * of rf0.
+ */
+ if (c->devinfo->ver >= 71 &&
+ ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) ||
+ (vir_get_nsrc(inst) > 1 &&
+ inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip &&
+ c->temp_end[i] > last_ldvary_ip) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ implicit_rf_nodes[0]);
+ }
+ }
+ }
+
if (inst->dst.file == QFILE_TEMP) {
/* Only a ldunif gets to write to R5, which only has a
* single 32-bit channel of storage.
@@ -1041,36 +1199,95 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* because ldunif has usually a shorter lifespan, allowing for
* more accumulator reuse and QPU merges.
*/
- if (!inst->qpu.sig.ldunif) {
- uint8_t class_bits =
- get_temp_class_bits(&c->nodes, inst->dst.index) &
- ~CLASS_BITS_R5;
- set_temp_class_bits(&c->nodes, inst->dst.index,
- class_bits);
-
+ if (c->devinfo->has_accumulators) {
+ if (!inst->qpu.sig.ldunif) {
+ uint8_t class_bits =
+ get_temp_class_bits(c, inst->dst.index) &
+ ~CLASS_BITS_R5;
+ set_temp_class_bits(c, inst->dst.index,
+ class_bits);
+
+ } else {
+ /* Until V3D 4.x, we could only load a uniform
+ * to r5, so we'll need to spill if uniform
+ * loads interfere with each other.
+ */
+ if (c->devinfo->ver < 40) {
+ set_temp_class_bits(c, inst->dst.index,
+ CLASS_BITS_R5);
+ }
+ }
} else {
- /* Until V3D 4.x, we could only load a uniform
- * to r5, so we'll need to spill if uniform
- * loads interfere with each other.
+ /* Make sure we don't allocate the ldvary's
+ * destination to rf0, since it would clash
+ * with its implicit write to that register.
+ */
+ if (inst->qpu.sig.ldvary) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, inst->dst.index),
+ implicit_rf_nodes[0]);
+ }
+ /* Flag dst temps from ldunif(a) instructions
+ * so we can try to assign rf0 to them and avoid
+ * converting these to ldunif(a)rf.
*/
- if (c->devinfo->ver < 40) {
- set_temp_class_bits(&c->nodes, inst->dst.index,
- CLASS_BITS_R5);
+ if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) {
+ const uint32_t dst_n =
+ temp_to_node(c, inst->dst.index);
+ c->nodes.info[dst_n].is_ldunif_dst = true;
}
}
}
/* All accumulators are invalidated across a thread switch. */
- if (inst->qpu.sig.thrsw) {
+ if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
- set_temp_class_bits(&c->nodes, i,
+ set_temp_class_bits(c, i,
CLASS_BITS_PHYS);
}
}
}
}
+static void
+flag_program_end_nodes(struct v3d_compile *c)
+{
+ /* Only look for registers used in this many instructions */
+ uint32_t last_set_count = 6;
+
+ struct qblock *last_block = vir_exit_block(c);
+ list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) {
+ if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+ continue;
+
+ int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
+ for (int i = 0; i < num_src; i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->src[i].index);
+ c->nodes.info[node].is_program_end = true;
+ }
+ }
+
+ num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
+ for (int i = 0; i < num_src; i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->src[i].index);
+ c->nodes.info[node].is_program_end = true;
+ }
+ }
+
+ if (inst->dst.file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->dst.index);
+ c->nodes.info[node].is_program_end = true;
+ }
+
+ if (--last_set_count == 0)
+ break;
+ }
+}
+
/**
* Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
*
@@ -1080,19 +1297,32 @@ struct qpu_reg *
v3d_register_allocate(struct v3d_compile *c)
{
int acc_nodes[ACC_COUNT];
+ int implicit_rf_nodes[IMPLICIT_RF_COUNT];
+
+ unsigned num_ra_nodes = c->num_temps;
+ if (c->devinfo->has_accumulators)
+ num_ra_nodes += ARRAY_SIZE(acc_nodes);
+ else
+ num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes);
+
c->nodes = (struct v3d_ra_node_info) {
.alloc_count = c->num_temps,
.info = ralloc_array_size(c, sizeof(c->nodes.info[0]),
- c->num_temps + ACC_COUNT),
+ num_ra_nodes),
};
+ uint32_t phys_index = get_phys_index(c->devinfo);
+
struct v3d_ra_select_callback_data callback_data = {
+ .phys_index = phys_index,
.next_acc = 0,
/* Start at RF3, to try to keep the TLB writes from using
- * RF0-2.
+ * RF0-2. Start at RF4 in 7.x to prevent TLB writes from
+ * using RF2-3.
*/
- .next_phys = 3,
+ .next_phys = c->devinfo->ver <= 42 ? 3 : 4,
.nodes = &c->nodes,
+ .devinfo = c->devinfo,
};
vir_calculate_live_intervals(c);
@@ -1108,27 +1338,35 @@ v3d_register_allocate(struct v3d_compile *c)
c->thread_index--;
}
- c->g = ra_alloc_interference_graph(c->compiler->regs,
- c->num_temps + ARRAY_SIZE(acc_nodes));
+ c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
/* Make some fixed nodes for the accumulators, which we will need to
* interfere with when ops have implied r3/r4 writes or for the thread
* switches. We could represent these as classes for the nodes to
* live in, but the classes take up a lot of memory to set up, so we
- * don't want to make too many.
+ * don't want to make too many. We use the same mechanism on platforms
+ * without accumulators that can have implicit writes to phys regs.
*/
- for (uint32_t i = 0; i < ACC_COUNT + c->num_temps; i++) {
- if (i < ACC_COUNT) {
+ for (uint32_t i = 0; i < num_ra_nodes; i++) {
+ c->nodes.info[i].is_ldunif_dst = false;
+ c->nodes.info[i].is_program_end = false;
+ c->nodes.info[i].unused = false;
+ c->nodes.info[i].priority = 0;
+ c->nodes.info[i].class_bits = 0;
+ if (c->devinfo->has_accumulators && i < ACC_COUNT) {
acc_nodes[i] = i;
ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
- c->nodes.info[i].priority = 0;
- c->nodes.info[i].class_bits = 0;
+ } else if (!c->devinfo->has_accumulators &&
+ i < ARRAY_SIZE(implicit_rf_nodes)) {
+ implicit_rf_nodes[i] = i;
+ ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
} else {
- uint32_t t = node_to_temp(i);
+ uint32_t t = node_to_temp(c, i);
c->nodes.info[i].priority =
c->temp_end[t] - c->temp_start[t];
- c->nodes.info[i].class_bits = CLASS_BITS_ANY;
+ c->nodes.info[i].class_bits =
+ get_class_bit_any(c->devinfo);
}
}
@@ -1136,25 +1374,61 @@ v3d_register_allocate(struct v3d_compile *c)
* interferences.
*/
int ip = 0;
+ int last_ldvary_ip = -1;
vir_for_each_inst_inorder(inst, c) {
inst->ip = ip++;
- update_graph_and_reg_classes_for_inst(c, acc_nodes, inst);
+
+ /* ldunif(a) always write to a temporary, so we have
+ * liveness info available to decide if rf0 is
+ * available for them, however, ldvary is different:
+ * it always writes to rf0 directly so we don't have
+ * liveness information for its implicit rf0 write.
+ *
+ * That means the allocator may assign rf0 to a temp
+ * that is defined while an implicit rf0 write from
+ * ldvary is still live. We fix that by manually
+ * tracking rf0 live ranges from ldvary instructions.
+ */
+ if (inst->qpu.sig.ldvary)
+ last_ldvary_ip = ip;
+
+ update_graph_and_reg_classes_for_inst(c, acc_nodes,
+ implicit_rf_nodes,
+ last_ldvary_ip, inst);
}
+ /* Flag the nodes that are used in the last instructions of the program
+ * (there are some registers that cannot be used in the last 3
+ * instructions). We only do this for fragment shaders, because the idea
+ * is that by avoiding this conflict we may be able to emit the last
+ * thread switch earlier in some cases, however, in non-fragment shaders
+ * this won't happen because the last instructions are always VPM stores
+ * with a small immediate, which conflicts with other signals,
+ * preventing us from ever moving the thrsw earlier.
+ */
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT)
+ flag_program_end_nodes(c);
+
/* Set the register classes for all our temporaries in the graph */
for (uint32_t i = 0; i < c->num_temps; i++) {
- ra_set_node_class(c->g, temp_to_node(i),
+ ra_set_node_class(c->g, temp_to_node(c, i),
choose_reg_class_for_temp(c, i));
}
/* Add register interferences based on liveness data */
for (uint32_t i = 0; i < c->num_temps; i++) {
+ /* And while we are here, let's also flag nodes for
+ * unused temps.
+ */
+ if (c->temp_start[i] > c->temp_end[i])
+ c->nodes.info[temp_to_node(c, i)].unused = true;
+
for (uint32_t j = i + 1; j < c->num_temps; j++) {
if (interferes(c->temp_start[i], c->temp_end[i],
c->temp_start[j], c->temp_end[j])) {
ra_add_node_interference(c->g,
- temp_to_node(i),
- temp_to_node(j));
+ temp_to_node(c, i),
+ temp_to_node(c, j));
}
}
}
@@ -1171,9 +1445,9 @@ v3d_register_allocate(struct v3d_compile *c)
if (c->spill_size <
V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
int node = v3d_choose_spill_node(c);
- uint32_t temp = node_to_temp(node);
+ uint32_t temp = node_to_temp(c, node);
if (node != -1) {
- v3d_spill_reg(c, acc_nodes, temp);
+ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
continue;
}
}
@@ -1186,11 +1460,11 @@ v3d_register_allocate(struct v3d_compile *c)
if (node == -1)
goto spill_fail;
- uint32_t temp = node_to_temp(node);
+ uint32_t temp = node_to_temp(c, node);
enum temp_spill_type spill_type =
get_spill_type_for_temp(c, temp);
if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
- v3d_spill_reg(c, acc_nodes, temp);
+ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
if (c->spills + c->fills > c->max_tmu_spills)
goto spill_fail;
} else {
@@ -1201,14 +1475,14 @@ v3d_register_allocate(struct v3d_compile *c)
/* Allocation was successful, build the 'temp -> reg' map */
temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
for (uint32_t i = 0; i < c->num_temps; i++) {
- int ra_reg = ra_get_node_reg(c->g, temp_to_node(i));
- if (ra_reg < PHYS_INDEX) {
+ int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
+ if (ra_reg < phys_index) {
temp_registers[i].magic = true;
temp_registers[i].index = (V3D_QPU_WADDR_R0 +
ra_reg - ACC_INDEX);
} else {
temp_registers[i].magic = false;
- temp_registers[i].index = ra_reg - PHYS_INDEX;
+ temp_registers[i].index = ra_reg - phys_index;
}
}
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
index 45e6bfa1470..4ed184cbbcb 100644
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -86,15 +86,32 @@ new_qpu_nop_before(struct qinst *inst)
return q;
}
+static void
+v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src)
+{
+ /* If we have a small immediate, move it from inst->raddr_b to the
+ * corresponding raddr.
+ */
+ if (src.smimm) {
+ assert(instr->sig.small_imm_a || instr->sig.small_imm_b ||
+ instr->sig.small_imm_c || instr->sig.small_imm_d);
+ *raddr = instr->raddr_b;
+ return;
+ }
+
+ assert(!src.magic);
+ *raddr = src.index;
+}
+
/**
* Allocates the src register (accumulator or register file) into the RADDR
* fields of the instruction.
*/
static void
-set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
+v3d33_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
{
if (src.smimm) {
- assert(instr->sig.small_imm);
+ assert(instr->sig.small_imm_b);
*mux = V3D_QPU_MUX_B;
return;
}
@@ -106,20 +123,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
return;
}
- if (instr->alu.add.a != V3D_QPU_MUX_A &&
- instr->alu.add.b != V3D_QPU_MUX_A &&
- instr->alu.mul.a != V3D_QPU_MUX_A &&
- instr->alu.mul.b != V3D_QPU_MUX_A) {
+ if (instr->alu.add.a.mux != V3D_QPU_MUX_A &&
+ instr->alu.add.b.mux != V3D_QPU_MUX_A &&
+ instr->alu.mul.a.mux != V3D_QPU_MUX_A &&
+ instr->alu.mul.b.mux != V3D_QPU_MUX_A) {
instr->raddr_a = src.index;
*mux = V3D_QPU_MUX_A;
} else {
if (instr->raddr_a == src.index) {
*mux = V3D_QPU_MUX_A;
} else {
- assert(!(instr->alu.add.a == V3D_QPU_MUX_B &&
- instr->alu.add.b == V3D_QPU_MUX_B &&
- instr->alu.mul.a == V3D_QPU_MUX_B &&
- instr->alu.mul.b == V3D_QPU_MUX_B) ||
+ assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B &&
+ instr->alu.add.b.mux == V3D_QPU_MUX_B &&
+ instr->alu.mul.a.mux == V3D_QPU_MUX_B &&
+ instr->alu.mul.b.mux == V3D_QPU_MUX_B) ||
src.index == instr->raddr_b);
instr->raddr_b = src.index;
@@ -128,33 +145,40 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
}
}
-static bool
-is_no_op_mov(struct qinst *qinst)
+/*
+ * The main purpose of the following wrapper is to make calling set_src
+ * cleaner: it receives both mux and raddr pointers, and only the one
+ * relevant to the device version is filled in.
+ */
+static void
+set_src(struct v3d_qpu_instr *instr,
+ enum v3d_qpu_mux *mux,
+ uint8_t *raddr,
+ struct qpu_reg src,
+ const struct v3d_device_info *devinfo)
{
- static const struct v3d_qpu_sig no_sig = {0};
-
- /* Make sure it's just a lone MOV. */
- if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
- qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
- qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
- memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
- return false;
- }
+ if (devinfo->ver < 71)
+ return v3d33_set_src(instr, mux, src);
+ else
+ return v3d71_set_src(instr, raddr, src);
+}
- /* Check if it's a MOV from a register to itself. */
+static bool
+v3d33_mov_src_and_dst_equal(struct qinst *qinst)
+{
enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
if (qinst->qpu.alu.mul.magic_write) {
if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
return false;
- if (qinst->qpu.alu.mul.a !=
+ if (qinst->qpu.alu.mul.a.mux !=
V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) {
return false;
}
} else {
int raddr;
- switch (qinst->qpu.alu.mul.a) {
+ switch (qinst->qpu.alu.mul.a.mux) {
case V3D_QPU_MUX_A:
raddr = qinst->qpu.raddr_a;
break;
@@ -168,10 +192,61 @@ is_no_op_mov(struct qinst *qinst)
return false;
}
+ return true;
+}
+
+static bool
+v3d71_mov_src_and_dst_equal(struct qinst *qinst)
+{
+ if (qinst->qpu.alu.mul.magic_write)
+ return false;
+
+ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
+ int raddr;
+
+ raddr = qinst->qpu.alu.mul.a.raddr;
+ if (raddr != waddr)
+ return false;
+
+ return true;
+}
+
+static bool
+mov_src_and_dst_equal(struct qinst *qinst,
+ const struct v3d_device_info *devinfo)
+{
+ if (devinfo->ver < 71)
+ return v3d33_mov_src_and_dst_equal(qinst);
+ else
+ return v3d71_mov_src_and_dst_equal(qinst);
+}
+
+
+static bool
+is_no_op_mov(struct qinst *qinst,
+ const struct v3d_device_info *devinfo)
+{
+ static const struct v3d_qpu_sig no_sig = {0};
+
+ /* Make sure it's just a lone MOV. We only check for M_MOV. Although
+ * for V3D 7.x there is also A_MOV, we don't need to check for it as
+ * we always emit using M_MOV. We could switch to A_MOV later in the
+ * scheduler to improve performance.
+ */
+ if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
+ qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
+ qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
+ memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
+ return false;
+ }
+
+ if (!mov_src_and_dst_equal(qinst, devinfo))
+ return false;
+
/* No packing or flags updates, or we need to execute the
* instruction.
*/
- if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
+ if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE ||
qinst->qpu.flags.mc != V3D_QPU_COND_NONE ||
qinst->qpu.flags.mpf != V3D_QPU_PF_NONE ||
@@ -277,8 +352,15 @@ v3d_generate_code_block(struct v3d_compile *c,
assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
- if (!dst.magic ||
- dst.index != V3D_QPU_WADDR_R5) {
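+ /* Without accumulators (7.x), ldunif's implicit
+ * destination is rf0 rather than r5, so any other
+ * destination needs the rf-addressed variant
+ * (ldunifrf/ldunifarf).
+ */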
+ bool use_rf;
+ if (c->devinfo->has_accumulators) {
+ use_rf = !dst.magic ||
+ dst.index != V3D_QPU_WADDR_R5;
+ } else {
+ use_rf = dst.magic || dst.index != 0;
+ }
+
+ if (use_rf) {
assert(c->devinfo->ver >= 40);
if (qinst->qpu.sig.ldunif) {
@@ -300,13 +382,18 @@ v3d_generate_code_block(struct v3d_compile *c,
qinst->qpu.sig_magic = dst.magic;
} else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) {
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
+
if (nsrc >= 1) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.add.a, src[0]);
+ &qinst->qpu.alu.add.a.mux,
+ &qinst->qpu.alu.add.a.raddr,
+ src[0], c->devinfo);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.add.b, src[1]);
+ &qinst->qpu.alu.add.b.mux,
+ &qinst->qpu.alu.add.b.raddr,
+ src[1], c->devinfo);
}
qinst->qpu.alu.add.waddr = dst.index;
@@ -314,17 +401,21 @@ v3d_generate_code_block(struct v3d_compile *c,
} else {
if (nsrc >= 1) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.mul.a, src[0]);
+ &qinst->qpu.alu.mul.a.mux,
+ &qinst->qpu.alu.mul.a.raddr,
+ src[0], c->devinfo);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.mul.b, src[1]);
+ &qinst->qpu.alu.mul.b.mux,
+ &qinst->qpu.alu.mul.b.raddr,
+ src[1], c->devinfo);
}
qinst->qpu.alu.mul.waddr = dst.index;
qinst->qpu.alu.mul.magic_write = dst.magic;
- if (is_no_op_mov(qinst)) {
+ if (is_no_op_mov(qinst, c->devinfo)) {
vir_remove_instruction(c, qinst);
continue;
}
diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build
index 2c10e46b188..73cb7aa0575 100644
--- a/src/broadcom/meson.build
+++ b/src/broadcom/meson.build
@@ -22,7 +22,7 @@ inc_broadcom = include_directories('.', 'cle')
subdir('cle')
-v3d_versions = ['33', '41', '42']
+v3d_versions = ['33', '41', '42', '71']
v3d_libs = []
if with_gallium_v3d or with_broadcom_vk
diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
index 28fb2357b97..c1590a760de 100644
--- a/src/broadcom/qpu/qpu_disasm.c
+++ b/src/broadcom/qpu/qpu_disasm.c
@@ -56,13 +56,14 @@ pad_to(struct disasm_state *disasm, int n)
static void
-v3d_qpu_disasm_raddr(struct disasm_state *disasm,
- const struct v3d_qpu_instr *instr, uint8_t mux)
+v3d33_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ enum v3d_qpu_mux mux)
{
if (mux == V3D_QPU_MUX_A) {
append(disasm, "rf%d", instr->raddr_a);
} else if (mux == V3D_QPU_MUX_B) {
- if (instr->sig.small_imm) {
+ if (instr->sig.small_imm_b) {
uint32_t val;
ASSERTED bool ok =
v3d_qpu_small_imm_unpack(disasm->devinfo,
@@ -82,6 +83,64 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm,
}
}
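+/* Identifies which of the four 7.x raddr fields (and matching
+ * small_imm_* signal) an ALU input comes from.
+ */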
+enum v3d_qpu_input_class {
+ V3D_QPU_ADD_A,
+ V3D_QPU_ADD_B,
+ V3D_QPU_MUL_A,
+ V3D_QPU_MUL_B
+};
+
+static void
+v3d71_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ uint8_t raddr,
+ enum v3d_qpu_input_class input_class)
+{
+ bool is_small_imm = false;
+ switch(input_class) {
+ case V3D_QPU_ADD_A:
+ is_small_imm = instr->sig.small_imm_a;
+ break;
+ case V3D_QPU_ADD_B:
+ is_small_imm = instr->sig.small_imm_b;
+ break;
+ case V3D_QPU_MUL_A:
+ is_small_imm = instr->sig.small_imm_c;
+ break;
+ case V3D_QPU_MUL_B:
+ is_small_imm = instr->sig.small_imm_d;
+ break;
+ }
+
+ if (is_small_imm) {
+ uint32_t val;
+ ASSERTED bool ok =
+ v3d_qpu_small_imm_unpack(disasm->devinfo,
+ raddr,
+ &val);
+
+ if ((int)val >= -16 && (int)val <= 15)
+ append(disasm, "%d", val);
+ else
+ append(disasm, "0x%08x", val);
+ assert(ok);
+ } else {
+ append(disasm, "rf%d", raddr);
+ }
+}
+
+static void
+v3d_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ const struct v3d_qpu_input *input,
+ enum v3d_qpu_input_class input_class)
+{
+ if (disasm->devinfo->ver < 71)
+ v3d33_qpu_disasm_raddr(disasm, instr, input->mux);
+ else
+ v3d71_qpu_disasm_raddr(disasm, instr, input->raddr, input_class);
+}
+
static void
v3d_qpu_disasm_waddr(struct disasm_state *disasm, uint32_t waddr, bool magic)
{
@@ -121,16 +180,16 @@ v3d_qpu_disasm_add(struct disasm_state *disasm,
if (num_src >= 1) {
if (has_dst)
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.a, V3D_QPU_ADD_A);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.add.a_unpack));
+ v3d_qpu_unpack_name(instr->alu.add.a.unpack));
}
if (num_src >= 2) {
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.b, V3D_QPU_ADD_B);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.add.b_unpack));
+ v3d_qpu_unpack_name(instr->alu.add.b.unpack));
}
}
@@ -164,16 +223,16 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm,
if (num_src >= 1) {
if (has_dst)
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.a, V3D_QPU_MUL_A);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.mul.a_unpack));
+ v3d_qpu_unpack_name(instr->alu.mul.a.unpack));
}
if (num_src >= 2) {
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.b, V3D_QPU_MUL_B);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.mul.b_unpack));
+ v3d_qpu_unpack_name(instr->alu.mul.b.unpack));
}
}
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index 60dabf74e8e..44f20618a5a 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -35,6 +35,14 @@ v3d_qpu_magic_waddr_name(const struct v3d_device_info *devinfo,
if (devinfo->ver < 40 && waddr == V3D_QPU_WADDR_TMU)
return "tmu";
+ /* On V3D 7.x, QUAD and REP alias R5 and R5REP in the table below.
+ */
+ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_QUAD)
+ return "quad";
+
+ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_REP)
+ return "rep";
+
static const char *waddr_magic[] = {
[V3D_QPU_WADDR_R0] = "r0",
[V3D_QPU_WADDR_R1] = "r1",
@@ -169,6 +177,12 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op)
[V3D_QPU_A_ITOF] = "itof",
[V3D_QPU_A_CLZ] = "clz",
[V3D_QPU_A_UTOF] = "utof",
+ [V3D_QPU_A_MOV] = "mov",
+ [V3D_QPU_A_FMOV] = "fmov",
+ [V3D_QPU_A_VPACK] = "vpack",
+ [V3D_QPU_A_V8PACK] = "v8pack",
+ [V3D_QPU_A_V10PACK] = "v10pack",
+ [V3D_QPU_A_V11FPACK] = "v11fpack",
};
if (op >= ARRAY_SIZE(op_names))
@@ -191,6 +205,12 @@ v3d_qpu_mul_op_name(enum v3d_qpu_mul_op op)
[V3D_QPU_M_MOV] = "mov",
[V3D_QPU_M_NOP] = "nop",
[V3D_QPU_M_FMUL] = "fmul",
+ [V3D_QPU_M_FTOUNORM16] = "ftounorm16",
+ [V3D_QPU_M_FTOSNORM16] = "ftosnorm16",
+ [V3D_QPU_M_VFTOUNORM8] = "vftounorm8",
+ [V3D_QPU_M_VFTOSNORM8] = "vftosnorm8",
+ [V3D_QPU_M_VFTOUNORM10LO] = "vftounorm10lo",
+ [V3D_QPU_M_VFTOUNORM10HI] = "vftounorm10hi",
};
if (op >= ARRAY_SIZE(op_names))
@@ -450,6 +470,13 @@ static const uint8_t add_op_args[] = {
[V3D_QPU_A_ITOF] = D | A,
[V3D_QPU_A_CLZ] = D | A,
[V3D_QPU_A_UTOF] = D | A,
+
+ [V3D_QPU_A_MOV] = D | A,
+ [V3D_QPU_A_FMOV] = D | A,
+ [V3D_QPU_A_VPACK] = D | A | B,
+ [V3D_QPU_A_V8PACK] = D | A | B,
+ [V3D_QPU_A_V10PACK] = D | A | B,
+ [V3D_QPU_A_V11FPACK] = D | A | B,
};
static const uint8_t mul_op_args[] = {
@@ -463,6 +490,12 @@ static const uint8_t mul_op_args[] = {
[V3D_QPU_M_NOP] = 0,
[V3D_QPU_M_MOV] = D | A,
[V3D_QPU_M_FMUL] = D | A | B,
+ [V3D_QPU_M_FTOUNORM16] = D | A,
+ [V3D_QPU_M_FTOSNORM16] = D | A,
+ [V3D_QPU_M_VFTOUNORM8] = D | A,
+ [V3D_QPU_M_VFTOSNORM8] = D | A,
+ [V3D_QPU_M_VFTOUNORM10LO] = D | A,
+ [V3D_QPU_M_VFTOUNORM10HI] = D | A,
};
bool
@@ -636,12 +669,14 @@ v3d_qpu_add_op_writes_vpm(enum v3d_qpu_add_op op)
}
bool
-v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
+v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst)
{
- if (inst->sig.ldtlb ||
- inst->sig.ldtlbu)
- return true;
+ return inst->sig.ldtlb || inst->sig.ldtlbu;
+}
+bool
+v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst)
+{
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
inst->alu.add.magic_write &&
@@ -659,6 +694,12 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
return false;
}
+bool
+v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
+{
+ return v3d_qpu_writes_tlb(inst) || v3d_qpu_reads_tlb(inst);
+}
+
bool
v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
{
@@ -846,6 +887,9 @@ bool
v3d_qpu_writes_r3(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R3))
return true;
@@ -856,6 +900,9 @@ bool
v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
inst->alu.add.magic_write &&
@@ -886,6 +933,9 @@ bool
v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R5))
return true;
@@ -896,6 +946,9 @@ bool
v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (v3d_qpu_writes_r5(devinfo, inst))
return true;
if (v3d_qpu_writes_r4(devinfo, inst))
@@ -912,16 +965,68 @@ v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
return false;
}
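+/* On 7.x, ldvary/ldunif/ldunifa write rf0 implicitly; register
+ * allocation uses this to add interference against its fixed rf0
+ * node.
+ */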
+bool
+v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst)
+{
+ if (devinfo->ver >= 71 &&
+ (inst->sig.ldvary || inst->sig.ldunif || inst->sig.ldunifa)) {
+ return true;
+ }
+
+ return false;
+}
+
bool
v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
{
int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
- return ((add_nsrc > 0 && inst->alu.add.a == mux) ||
- (add_nsrc > 1 && inst->alu.add.b == mux) ||
- (mul_nsrc > 0 && inst->alu.mul.a == mux) ||
- (mul_nsrc > 1 && inst->alu.mul.b == mux));
+ return ((add_nsrc > 0 && inst->alu.add.a.mux == mux) ||
+ (add_nsrc > 1 && inst->alu.add.b.mux == mux) ||
+ (mul_nsrc > 0 && inst->alu.mul.a.mux == mux) ||
+ (mul_nsrc > 1 && inst->alu.mul.b.mux == mux));
+}
+
+bool
+v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr)
+{
+ int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
+ int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
+
+ return (add_nsrc > 0 && !inst->sig.small_imm_a && inst->alu.add.a.raddr == raddr) ||
+ (add_nsrc > 1 && !inst->sig.small_imm_b && inst->alu.add.b.raddr == raddr) ||
+ (mul_nsrc > 0 && !inst->sig.small_imm_c && inst->alu.mul.a.raddr == raddr) ||
+ (mul_nsrc > 1 && !inst->sig.small_imm_d && inst->alu.mul.b.raddr == raddr);
+}
+
+bool
+v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
+ uint8_t waddr)
+{
+ if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+
+ if (v3d_qpu_add_op_has_dst(inst->alu.add.op) &&
+ !inst->alu.add.magic_write &&
+ inst->alu.add.waddr == waddr) {
+ return true;
+ }
+
+ if (v3d_qpu_mul_op_has_dst(inst->alu.mul.op) &&
+ !inst->alu.mul.magic_write &&
+ inst->alu.mul.waddr == waddr) {
+ return true;
+ }
+
+ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
+ !inst->sig_magic && inst->sig_addr == waddr) {
+ return true;
+ }
+
+ return false;
}
bool
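Aside, illustration only (not part of the patch): the predicates added above are the 7.x replacements for accumulator reasoning, since the accumulator checks now bail out via has_accumulators and ldvary/ldunif/ldunifa land their results in rf0 instead of r5. A hedged sketch of how a dependency check could combine them; raw_hazard is an invented name, not a function in this series:

    /* Sketch: does 'later' read a regfile address that 'earlier' writes,
     * either explicitly through its waddr or implicitly via a signal that
     * lands in rf0 on V3D 7.x? Regfile addresses are 6 bits, hence 64.
     */
    static bool
    raw_hazard(const struct v3d_device_info *devinfo,
               const struct v3d_qpu_instr *earlier,
               const struct v3d_qpu_instr *later)
    {
            if (v3d_qpu_writes_rf0_implicitly(devinfo, earlier) &&
                v3d71_qpu_reads_raddr(later, 0))
                    return true;

            for (uint8_t rf = 0; rf < 64; rf++) {
                    if (v3d71_qpu_writes_waddr_explicitly(devinfo, earlier, rf) &&
                        v3d71_qpu_reads_raddr(later, rf))
                            return true;
            }
            return false;
    }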
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index 2e133472698..56eee9f9cac 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -50,10 +50,13 @@ struct v3d_qpu_sig {
bool ldvpm:1;
bool ldtlb:1;
bool ldtlbu:1;
- bool small_imm:1;
bool ucb:1;
bool rotate:1;
bool wrtmuc:1;
+ bool small_imm_a:1; /* raddr_a (add a), since V3D 7.x */
+ bool small_imm_b:1; /* raddr_b (add b) */
+ bool small_imm_c:1; /* raddr_c (mul a), since V3D 7.x */
+ bool small_imm_d:1; /* raddr_d (mul b), since V3D 7.x */
};
enum v3d_qpu_cond {
@@ -88,12 +91,13 @@ enum v3d_qpu_uf {
};
enum v3d_qpu_waddr {
- V3D_QPU_WADDR_R0 = 0,
- V3D_QPU_WADDR_R1 = 1,
- V3D_QPU_WADDR_R2 = 2,
- V3D_QPU_WADDR_R3 = 3,
- V3D_QPU_WADDR_R4 = 4,
- V3D_QPU_WADDR_R5 = 5,
+ V3D_QPU_WADDR_R0 = 0, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R1 = 1, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R5 = 5, /* V3D 4.x */
+ V3D_QPU_WADDR_QUAD = 5, /* V3D 7.x */
V3D_QPU_WADDR_NOP = 6,
V3D_QPU_WADDR_TLB = 7,
V3D_QPU_WADDR_TLBU = 8,
@@ -108,12 +112,12 @@ enum v3d_qpu_waddr {
V3D_QPU_WADDR_SYNC = 16,
V3D_QPU_WADDR_SYNCU = 17,
V3D_QPU_WADDR_SYNCB = 18,
- V3D_QPU_WADDR_RECIP = 19,
- V3D_QPU_WADDR_RSQRT = 20,
- V3D_QPU_WADDR_EXP = 21,
- V3D_QPU_WADDR_LOG = 22,
- V3D_QPU_WADDR_SIN = 23,
- V3D_QPU_WADDR_RSQRT2 = 24,
+ V3D_QPU_WADDR_RECIP = 19, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_RSQRT = 20, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_EXP = 21, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_LOG = 22, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_SIN = 23, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_RSQRT2 = 24, /* Reserved on V3D 7.x */
V3D_QPU_WADDR_TMUC = 32,
V3D_QPU_WADDR_TMUS = 33,
V3D_QPU_WADDR_TMUT = 34,
@@ -129,7 +133,8 @@ enum v3d_qpu_waddr {
V3D_QPU_WADDR_TMUHSCM = 44,
V3D_QPU_WADDR_TMUHSF = 45,
V3D_QPU_WADDR_TMUHSLOD = 46,
- V3D_QPU_WADDR_R5REP = 55,
+ V3D_QPU_WADDR_R5REP = 55, /* V3D 4.x */
+ V3D_QPU_WADDR_REP = 55, /* V3D 7.x */
};
struct v3d_qpu_flags {
@@ -222,6 +227,14 @@ enum v3d_qpu_add_op {
V3D_QPU_A_ITOF,
V3D_QPU_A_CLZ,
V3D_QPU_A_UTOF,
+
+ /* V3D 7.x */
+ V3D_QPU_A_FMOV,
+ V3D_QPU_A_MOV,
+ V3D_QPU_A_VPACK,
+ V3D_QPU_A_V8PACK,
+ V3D_QPU_A_V10PACK,
+ V3D_QPU_A_V11FPACK,
};
enum v3d_qpu_mul_op {
@@ -235,6 +248,14 @@ enum v3d_qpu_mul_op {
V3D_QPU_M_MOV,
V3D_QPU_M_NOP,
V3D_QPU_M_FMUL,
+
+ /* V3D 7.x */
+ V3D_QPU_M_FTOUNORM16,
+ V3D_QPU_M_FTOSNORM16,
+ V3D_QPU_M_VFTOUNORM8,
+ V3D_QPU_M_VFTOSNORM8,
+ V3D_QPU_M_VFTOUNORM10LO,
+ V3D_QPU_M_VFTOUNORM10HI,
};
enum v3d_qpu_output_pack {
@@ -276,6 +297,15 @@ enum v3d_qpu_input_unpack {
/** Swap high and low 16 bits */
V3D_QPU_UNPACK_SWAP_16,
+
+ /** Convert low 16 bits from 16-bit integer to unsigned 32-bit int */
+ V3D_QPU_UNPACK_UL,
+ /** Convert high 16 bits from 16-bit integer to unsigned 32-bit int */
+ V3D_QPU_UNPACK_UH,
+ /** Convert low 16 bits from 16-bit integer to signed 32-bit int */
+ V3D_QPU_UNPACK_IL,
+ /** Convert high 16 bits from 16-bit integer to signed 32-bit int */
+ V3D_QPU_UNPACK_IH,
};
enum v3d_qpu_mux {
@@ -289,25 +319,29 @@ enum v3d_qpu_mux {
V3D_QPU_MUX_B,
};
+struct v3d_qpu_input {
+ union {
+ enum v3d_qpu_mux mux; /* V3D 4.x */
+ uint8_t raddr; /* V3D 7.x */
+ };
+ enum v3d_qpu_input_unpack unpack;
+};
+
struct v3d_qpu_alu_instr {
struct {
enum v3d_qpu_add_op op;
- enum v3d_qpu_mux a, b;
+ struct v3d_qpu_input a, b;
uint8_t waddr;
bool magic_write;
enum v3d_qpu_output_pack output_pack;
- enum v3d_qpu_input_unpack a_unpack;
- enum v3d_qpu_input_unpack b_unpack;
} add;
struct {
enum v3d_qpu_mul_op op;
- enum v3d_qpu_mux a, b;
+ struct v3d_qpu_input a, b;
uint8_t waddr;
bool magic_write;
enum v3d_qpu_output_pack output_pack;
- enum v3d_qpu_input_unpack a_unpack;
- enum v3d_qpu_input_unpack b_unpack;
} mul;
};
@@ -379,8 +413,8 @@ struct v3d_qpu_instr {
struct v3d_qpu_sig sig;
uint8_t sig_addr;
bool sig_magic; /* If the signal writes to a magic address */
- uint8_t raddr_a;
- uint8_t raddr_b;
+ uint8_t raddr_a; /* V3D 4.x */
+ uint8_t raddr_b; /* V3D 4.x (holds packed small immediate in 7.x too) */
struct v3d_qpu_flags flags;
union {
@@ -450,6 +484,8 @@ bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
+bool v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
@@ -464,6 +500,8 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
+bool v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
@@ -483,4 +521,9 @@ bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+
+bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr);
+bool v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
+ uint8_t waddr);
#endif
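Aside, illustration only: because struct v3d_qpu_input overlays mux (4.x) and raddr (7.x) in one union, consumers must pick the member by devinfo->ver; reading the wrong one compiles fine and silently yields the other encoding. A minimal sketch of version-aware operand printing, with an invented helper name:

    /* Sketch: format one ALU source operand for a disassembly line. */
    static void
    sprint_alu_src(const struct v3d_device_info *devinfo,
                   const struct v3d_qpu_input *src, char *buf, size_t len)
    {
            if (devinfo->ver >= 71)
                    snprintf(buf, len, "rf%d", src->raddr);
            else
                    snprintf(buf, len, "mux%d", (int)src->mux);
    }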
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index 94629aff4fc..4e3c3da8866 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -84,6 +84,9 @@
#define V3D_QPU_MUL_A_SHIFT 18
#define V3D_QPU_MUL_A_MASK QPU_MASK(20, 18)
+#define V3D_QPU_RADDR_C_SHIFT 18
+#define V3D_QPU_RADDR_C_MASK QPU_MASK(23, 18)
+
#define V3D_QPU_ADD_B_SHIFT 15
#define V3D_QPU_ADD_B_MASK QPU_MASK(17, 15)
@@ -98,6 +101,9 @@
#define V3D_QPU_BRANCH_BDI_SHIFT 12
#define V3D_QPU_BRANCH_BDI_MASK QPU_MASK(13, 12)
+#define V3D_QPU_RADDR_D_SHIFT 12
+#define V3D_QPU_RADDR_D_MASK QPU_MASK(17, 12)
+
#define V3D_QPU_RADDR_A_SHIFT 6
#define V3D_QPU_RADDR_A_MASK QPU_MASK(11, 6)
@@ -112,12 +118,15 @@
#define LDTMU .ldtmu = true
#define LDVARY .ldvary = true
#define LDVPM .ldvpm = true
-#define SMIMM .small_imm = true
#define LDTLB .ldtlb = true
#define LDTLBU .ldtlbu = true
#define UCB .ucb = true
#define ROT .rotate = true
#define WRTMUC .wrtmuc = true
+#define SMIMM_A .small_imm_a = true
+#define SMIMM_B .small_imm_b = true
+#define SMIMM_C .small_imm_c = true
+#define SMIMM_D .small_imm_d = true
static const struct v3d_qpu_sig v33_sig_map[] = {
/* MISC R3 R4 R5 */
@@ -135,8 +144,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = {
[11] = { THRSW, LDVARY, LDUNIF },
[12] = { LDVARY, LDTMU, },
[13] = { THRSW, LDVARY, LDTMU, },
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY, },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
/* 18-21 reserved */
@@ -148,8 +157,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = {
[27] = { THRSW, LDVPM, LDUNIF },
[28] = { LDVPM, LDTMU, },
[29] = { THRSW, LDVPM, LDTMU, },
- [30] = { SMIMM, LDVPM, },
- [31] = { SMIMM, },
+ [30] = { SMIMM_B, LDVPM, },
+ [31] = { SMIMM_B, },
};
static const struct v3d_qpu_sig v40_sig_map[] = {
@@ -167,8 +176,8 @@ static const struct v3d_qpu_sig v40_sig_map[] = {
[10] = { LDVARY, LDUNIF },
[11] = { THRSW, LDVARY, LDUNIF },
/* 12-13 reserved */
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY, },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
[18] = { WRTMUC },
@@ -178,7 +187,7 @@ static const struct v3d_qpu_sig v40_sig_map[] = {
[22] = { UCB, },
[23] = { ROT, },
/* 24-30 reserved */
- [31] = { SMIMM, LDTMU, },
+ [31] = { SMIMM_B, LDTMU, },
};
static const struct v3d_qpu_sig v41_sig_map[] = {
@@ -197,8 +206,8 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
[11] = { THRSW, LDVARY, LDUNIF },
[12] = { LDUNIFRF },
[13] = { THRSW, LDUNIFRF },
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
[18] = { WRTMUC },
@@ -210,7 +219,41 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
[24] = { LDUNIFA},
[25] = { LDUNIFARF },
/* 26-30 reserved */
- [31] = { SMIMM, LDTMU, },
+ [31] = { SMIMM_B, LDTMU, },
+};
+
+
+static const struct v3d_qpu_sig v71_sig_map[] = {
+ /* MISC phys RF0 */
+ [0] = { },
+ [1] = { THRSW, },
+ [2] = { LDUNIF },
+ [3] = { THRSW, LDUNIF },
+ [4] = { LDTMU, },
+ [5] = { THRSW, LDTMU, },
+ [6] = { LDTMU, LDUNIF },
+ [7] = { THRSW, LDTMU, LDUNIF },
+ [8] = { LDVARY, },
+ [9] = { THRSW, LDVARY, },
+ [10] = { LDVARY, LDUNIF },
+ [11] = { THRSW, LDVARY, LDUNIF },
+ [12] = { LDUNIFRF },
+ [13] = { THRSW, LDUNIFRF },
+ [14] = { SMIMM_A, },
+ [15] = { SMIMM_B, },
+ [16] = { LDTLB, },
+ [17] = { LDTLBU, },
+ [18] = { WRTMUC },
+ [19] = { THRSW, WRTMUC },
+ [20] = { LDVARY, WRTMUC },
+ [21] = { THRSW, LDVARY, WRTMUC },
+ [22] = { UCB, },
+ /* 23 reserved */
+ [24] = { LDUNIFA},
+ [25] = { LDUNIFARF },
+ /* 26-29 reserved */
+ [30] = { SMIMM_C, },
+ [31] = { SMIMM_D, },
};
bool
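Aside, a hand check of the new signal map, since the small-immediate change is easy to miss: 4.x had a single SMIMM signal that always meant raddr_b, while 7.x spends four packed values, one per ALU source slot:

    /* v71_sig_map, decoded by hand:
     *   packed_sig 14 -> small_imm_a  (immediate in raddr_a, add ALU src A)
     *   packed_sig 15 -> small_imm_b  (immediate in raddr_b, add ALU src B)
     *   packed_sig 30 -> small_imm_c  (immediate in raddr_c, mul ALU src A)
     *   packed_sig 31 -> small_imm_d  (immediate in raddr_d, mul ALU src B)
     * For comparison, v41_sig_map decodes 14 as SMIMM_B + LDVARY and 15 as
     * plain SMIMM_B.
     */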
@@ -221,7 +264,9 @@ v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo,
if (packed_sig >= ARRAY_SIZE(v33_sig_map))
return false;
- if (devinfo->ver >= 41)
+ if (devinfo->ver >= 71)
+ *sig = v71_sig_map[packed_sig];
+ else if (devinfo->ver >= 41)
*sig = v41_sig_map[packed_sig];
else if (devinfo->ver == 40)
*sig = v40_sig_map[packed_sig];
@@ -240,7 +285,9 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo,
{
static const struct v3d_qpu_sig *map;
- if (devinfo->ver >= 41)
+ if (devinfo->ver >= 71)
+ map = v71_sig_map;
+ else if (devinfo->ver >= 41)
map = v41_sig_map;
else if (devinfo->ver == 40)
map = v40_sig_map;
@@ -443,16 +490,26 @@ v3d_qpu_flags_pack(const struct v3d_device_info *devinfo,
/* Make a mapping of the table of opcodes in the spec. The opcode is
* determined by a combination of the opcode field, and in the case of 0 or
- * 1-arg opcodes, the mux_b field as well.
+ * 1-arg opcodes, the mux (version <= 42) or raddr (version >= 71) field as
+ * well.
*/
-#define MUX_MASK(bot, top) (((1 << (top + 1)) - 1) - ((1 << (bot)) - 1))
-#define ANYMUX MUX_MASK(0, 7)
+#define OP_MASK(val) BITFIELD64_BIT(val)
+#define OP_RANGE(bot, top) BITFIELD64_RANGE(bot, top - bot + 1)
+#define ANYMUX OP_RANGE(0, 7)
+#define ANYOPMASK OP_RANGE(0, 63)
struct opcode_desc {
uint8_t opcode_first;
uint8_t opcode_last;
- uint8_t mux_b_mask;
- uint8_t mux_a_mask;
+
+ union {
+ struct {
+ uint8_t b_mask;
+ uint8_t a_mask;
+ } mux;
+ uint64_t raddr_mask;
+ };
+
uint8_t op;
/* first_ver == 0 if it's the same across all V3D versions.
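Aside, illustration only: the new mask helpers are thin wrappers over Mesa's 64-bit bitfield macros, with the one subtlety that OP_RANGE takes an inclusive (bottom, top) pair while BITFIELD64_RANGE takes (start, count). A few expansions, assuming the util/macros.h definitions of those macros:

    /* OP_MASK(3)      == BITFIELD64_BIT(3)       == 0x08
     * OP_RANGE(0, 7)  == BITFIELD64_RANGE(0, 8)  == 0xff    (ANYMUX)
     * OP_RANGE(0, 63) == BITFIELD64_RANGE(0, 64) == ~0ull   (ANYOPMASK)
     */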
@@ -465,122 +522,320 @@ struct opcode_desc {
uint8_t last_ver;
};
-static const struct opcode_desc add_ops[] = {
+static const struct opcode_desc add_ops_v33[] = {
/* FADD is FADDNF depending on the order of the mux_a/mux_b. */
- { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADD },
- { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADDNF },
- { 53, 55, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
- { 56, 56, ANYMUX, ANYMUX, V3D_QPU_A_ADD },
- { 57, 59, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
- { 60, 60, ANYMUX, ANYMUX, V3D_QPU_A_SUB },
- { 61, 63, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
- { 64, 111, ANYMUX, ANYMUX, V3D_QPU_A_FSUB },
- { 120, 120, ANYMUX, ANYMUX, V3D_QPU_A_MIN },
- { 121, 121, ANYMUX, ANYMUX, V3D_QPU_A_MAX },
- { 122, 122, ANYMUX, ANYMUX, V3D_QPU_A_UMIN },
- { 123, 123, ANYMUX, ANYMUX, V3D_QPU_A_UMAX },
- { 124, 124, ANYMUX, ANYMUX, V3D_QPU_A_SHL },
- { 125, 125, ANYMUX, ANYMUX, V3D_QPU_A_SHR },
- { 126, 126, ANYMUX, ANYMUX, V3D_QPU_A_ASR },
- { 127, 127, ANYMUX, ANYMUX, V3D_QPU_A_ROR },
+ { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADD },
+ { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADDNF },
+ { 53, 55, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
+ { 56, 56, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ADD },
+ { 57, 59, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
+ { 60, 60, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SUB },
+ { 61, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
+ { 64, 111, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FSUB },
+ { 120, 120, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MIN },
+ { 121, 121, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MAX },
+ { 122, 122, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMIN },
+ { 123, 123, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMAX },
+ { 124, 124, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHL },
+ { 125, 125, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHR },
+ { 126, 126, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ASR },
+ { 127, 127, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ROR },
/* FMIN is instead FMAX depending on the order of the mux_a/mux_b. */
- { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMIN },
- { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMAX },
- { 176, 180, ANYMUX, ANYMUX, V3D_QPU_A_VFMIN },
-
- { 181, 181, ANYMUX, ANYMUX, V3D_QPU_A_AND },
- { 182, 182, ANYMUX, ANYMUX, V3D_QPU_A_OR },
- { 183, 183, ANYMUX, ANYMUX, V3D_QPU_A_XOR },
-
- { 184, 184, ANYMUX, ANYMUX, V3D_QPU_A_VADD },
- { 185, 185, ANYMUX, ANYMUX, V3D_QPU_A_VSUB },
- { 186, 186, 1 << 0, ANYMUX, V3D_QPU_A_NOT },
- { 186, 186, 1 << 1, ANYMUX, V3D_QPU_A_NEG },
- { 186, 186, 1 << 2, ANYMUX, V3D_QPU_A_FLAPUSH },
- { 186, 186, 1 << 3, ANYMUX, V3D_QPU_A_FLBPUSH },
- { 186, 186, 1 << 4, ANYMUX, V3D_QPU_A_FLPOP },
- { 186, 186, 1 << 5, ANYMUX, V3D_QPU_A_RECIP },
- { 186, 186, 1 << 6, ANYMUX, V3D_QPU_A_SETMSF },
- { 186, 186, 1 << 7, ANYMUX, V3D_QPU_A_SETREVF },
- { 187, 187, 1 << 0, 1 << 0, V3D_QPU_A_NOP, 0 },
- { 187, 187, 1 << 0, 1 << 1, V3D_QPU_A_TIDX },
- { 187, 187, 1 << 0, 1 << 2, V3D_QPU_A_EIDX },
- { 187, 187, 1 << 0, 1 << 3, V3D_QPU_A_LR },
- { 187, 187, 1 << 0, 1 << 4, V3D_QPU_A_VFLA },
- { 187, 187, 1 << 0, 1 << 5, V3D_QPU_A_VFLNA },
- { 187, 187, 1 << 0, 1 << 6, V3D_QPU_A_VFLB },
- { 187, 187, 1 << 0, 1 << 7, V3D_QPU_A_VFLNB },
-
- { 187, 187, 1 << 1, MUX_MASK(0, 2), V3D_QPU_A_FXCD },
- { 187, 187, 1 << 1, 1 << 3, V3D_QPU_A_XCD },
- { 187, 187, 1 << 1, MUX_MASK(4, 6), V3D_QPU_A_FYCD },
- { 187, 187, 1 << 1, 1 << 7, V3D_QPU_A_YCD },
-
- { 187, 187, 1 << 2, 1 << 0, V3D_QPU_A_MSF },
- { 187, 187, 1 << 2, 1 << 1, V3D_QPU_A_REVF },
- { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_VDWWT, 33 },
- { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_IID, 40 },
- { 187, 187, 1 << 2, 1 << 3, V3D_QPU_A_SAMPID, 40 },
- { 187, 187, 1 << 2, 1 << 4, V3D_QPU_A_BARRIERID, 40 },
- { 187, 187, 1 << 2, 1 << 5, V3D_QPU_A_TMUWT },
- { 187, 187, 1 << 2, 1 << 6, V3D_QPU_A_VPMWT },
- { 187, 187, 1 << 2, 1 << 7, V3D_QPU_A_FLAFIRST, 41 },
- { 187, 187, 1 << 3, 1 << 0, V3D_QPU_A_FLNAFIRST, 41 },
- { 187, 187, 1 << 3, ANYMUX, V3D_QPU_A_VPMSETUP, 33 },
-
- { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 },
- { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 },
- { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 },
- { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 },
- { 188, 188, 1 << 2, ANYMUX, V3D_QPU_A_LDVPMP, 40 },
- { 188, 188, 1 << 3, ANYMUX, V3D_QPU_A_RSQRT, 41 },
- { 188, 188, 1 << 4, ANYMUX, V3D_QPU_A_EXP, 41 },
- { 188, 188, 1 << 5, ANYMUX, V3D_QPU_A_LOG, 41 },
- { 188, 188, 1 << 6, ANYMUX, V3D_QPU_A_SIN, 41 },
- { 188, 188, 1 << 7, ANYMUX, V3D_QPU_A_RSQRT2, 41 },
- { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 },
- { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 },
+ { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMIN },
+ { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMAX },
+ { 176, 180, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMIN },
+
+ { 181, 181, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_AND },
+ { 182, 182, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_OR },
+ { 183, 183, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_XOR },
+
+ { 184, 184, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VADD },
+ { 185, 185, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VSUB },
+ { 186, 186, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_NOT },
+ { 186, 186, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_NEG },
+ { 186, 186, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_FLAPUSH },
+ { 186, 186, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FLBPUSH },
+ { 186, 186, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_FLPOP },
+ { 186, 186, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_RECIP },
+ { 186, 186, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SETMSF },
+ { 186, 186, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_SETREVF },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(1), V3D_QPU_A_TIDX },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(2), V3D_QPU_A_EIDX },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(3), V3D_QPU_A_LR },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(4), V3D_QPU_A_VFLA },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(5), V3D_QPU_A_VFLNA },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VFLB },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(7), V3D_QPU_A_VFLNB },
+
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(0, 2), V3D_QPU_A_FXCD },
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(3), V3D_QPU_A_XCD },
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(4, 6), V3D_QPU_A_FYCD },
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(7), V3D_QPU_A_YCD },
+
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(0), V3D_QPU_A_MSF },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(1), V3D_QPU_A_REVF },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_VDWWT, 33 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_IID, 40 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(3), V3D_QPU_A_SAMPID, 40 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(4), V3D_QPU_A_BARRIERID, 40 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(5), V3D_QPU_A_TMUWT },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VPMWT },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(7), V3D_QPU_A_FLAFIRST, 41 },
+ { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = OP_MASK(0), V3D_QPU_A_FLNAFIRST, 41 },
+ { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_VPMSETUP, 33 },
+
+ { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMP, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_EXP, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_LOG, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SIN, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT2, 41 },
+ { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 },
+ { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 },
/* FIXME: MORE COMPLICATED */
- /* { 190, 191, ANYMUX, ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */
+ /* { 190, 191, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */
- { 192, 239, ANYMUX, ANYMUX, V3D_QPU_A_FCMP },
- { 240, 244, ANYMUX, ANYMUX, V3D_QPU_A_VFMAX },
+ { 192, 239, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FCMP },
+ { 240, 244, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMAX },
- { 245, 245, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FROUND },
- { 245, 245, 1 << 3, ANYMUX, V3D_QPU_A_FTOIN },
- { 245, 245, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FTRUNC },
- { 245, 245, 1 << 7, ANYMUX, V3D_QPU_A_FTOIZ },
- { 246, 246, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FFLOOR },
- { 246, 246, 1 << 3, ANYMUX, V3D_QPU_A_FTOUZ },
- { 246, 246, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FCEIL },
- { 246, 246, 1 << 7, ANYMUX, V3D_QPU_A_FTOC },
+ { 245, 245, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FROUND },
+ { 245, 245, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIN },
+ { 245, 245, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FTRUNC },
+ { 245, 245, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIZ },
+ { 246, 246, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FFLOOR },
+ { 246, 246, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOUZ },
+ { 246, 246, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FCEIL },
+ { 246, 246, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOC },
- { 247, 247, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FDX },
- { 247, 247, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FDY },
+ { 247, 247, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FDX },
+ { 247, 247, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FDY },
/* The stvpms are distinguished by the waddr field. */
- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMV },
- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMD },
- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMP },
+ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMV },
+ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMD },
+ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMP },
+
+ { 252, 252, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_ITOF },
+ { 252, 252, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_CLZ },
+ { 252, 252, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_UTOF },
+};
+
+static const struct opcode_desc mul_ops_v33[] = {
+ { 1, 1, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_ADD },
+ { 2, 2, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SUB },
+ { 3, 3, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_UMUL24 },
+ { 4, 8, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_VFMUL },
+ { 9, 9, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SMUL24 },
+ { 10, 10, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_MULTOP },
+ { 14, 14, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 },
+ { 15, 15, .mux.b_mask = OP_RANGE(0, 3), .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 },
+ { 15, 15, .mux.b_mask = OP_MASK(4), .mux.a_mask = OP_MASK(0), V3D_QPU_M_NOP, 33, 42 },
+ { 15, 15, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_M_MOV, 33, 42 },
+
+ { 16, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMUL },
+};
+
+/* Note that all the add/mul opcodes could have been defined in a single
+ * table, using first_ver/last_ver to select the right entries. But v71
+ * changed so many of the encodings that separate tables are tidier. Also,
+ * right now we do a linear search on those tables, so separate tables keep
+ * each search shorter.
+ *
+ * Just in case the tables are ever merged, first_ver is set to 71 for the
+ * opcodes that changed on v71.
+ */
+static const struct opcode_desc add_ops_v71[] = {
+ /* FADD is FADDNF depending on the order of the raddr_a/raddr_b. */
+ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD },
+ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADDNF },
+ { 53, 55, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
+ { 56, 56, .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD },
+ { 57, 59, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
+ { 60, 60, .raddr_mask = ANYOPMASK, V3D_QPU_A_SUB },
+ { 61, 63, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
+ { 64, 111, .raddr_mask = ANYOPMASK, V3D_QPU_A_FSUB },
+ { 120, 120, .raddr_mask = ANYOPMASK, V3D_QPU_A_MIN },
+ { 121, 121, .raddr_mask = ANYOPMASK, V3D_QPU_A_MAX },
+ { 122, 122, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMIN },
+ { 123, 123, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMAX },
+ { 124, 124, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHL },
+ { 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR },
+ { 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR },
+ { 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR },
+ /* FMIN is instead FMAX depending on the raddr_a/b order. */
+ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMIN },
+ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMAX },
+ { 176, 180, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFMIN },
+
+ { 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND },
+ { 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR },
+ { 183, 183, .raddr_mask = ANYOPMASK, V3D_QPU_A_XOR },
+ { 184, 184, .raddr_mask = ANYOPMASK, V3D_QPU_A_VADD },
+ { 185, 185, .raddr_mask = ANYOPMASK, V3D_QPU_A_VSUB },
+
+ { 186, 186, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOT },
+ { 186, 186, .raddr_mask = OP_MASK(1), V3D_QPU_A_NEG },
+ { 186, 186, .raddr_mask = OP_MASK(2), V3D_QPU_A_FLAPUSH },
+ { 186, 186, .raddr_mask = OP_MASK(3), V3D_QPU_A_FLBPUSH },
+ { 186, 186, .raddr_mask = OP_MASK(4), V3D_QPU_A_FLPOP },
+ { 186, 186, .raddr_mask = OP_MASK(5), V3D_QPU_A_CLZ },
+ { 186, 186, .raddr_mask = OP_MASK(6), V3D_QPU_A_SETMSF },
+ { 186, 186, .raddr_mask = OP_MASK(7), V3D_QPU_A_SETREVF },
+
+ { 187, 187, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 },
+ { 187, 187, .raddr_mask = OP_MASK(1), V3D_QPU_A_TIDX },
+ { 187, 187, .raddr_mask = OP_MASK(2), V3D_QPU_A_EIDX },
+ { 187, 187, .raddr_mask = OP_MASK(3), V3D_QPU_A_LR },
+ { 187, 187, .raddr_mask = OP_MASK(4), V3D_QPU_A_VFLA },
+ { 187, 187, .raddr_mask = OP_MASK(5), V3D_QPU_A_VFLNA },
+ { 187, 187, .raddr_mask = OP_MASK(6), V3D_QPU_A_VFLB },
+ { 187, 187, .raddr_mask = OP_MASK(7), V3D_QPU_A_VFLNB },
+ { 187, 187, .raddr_mask = OP_MASK(8), V3D_QPU_A_XCD },
+ { 187, 187, .raddr_mask = OP_MASK(9), V3D_QPU_A_YCD },
+ { 187, 187, .raddr_mask = OP_MASK(10), V3D_QPU_A_MSF },
+ { 187, 187, .raddr_mask = OP_MASK(11), V3D_QPU_A_REVF },
+ { 187, 187, .raddr_mask = OP_MASK(12), V3D_QPU_A_IID },
+ { 187, 187, .raddr_mask = OP_MASK(13), V3D_QPU_A_SAMPID },
+ { 187, 187, .raddr_mask = OP_MASK(14), V3D_QPU_A_BARRIERID },
+ { 187, 187, .raddr_mask = OP_MASK(15), V3D_QPU_A_TMUWT },
+ { 187, 187, .raddr_mask = OP_MASK(16), V3D_QPU_A_VPMWT },
+ { 187, 187, .raddr_mask = OP_MASK(17), V3D_QPU_A_FLAFIRST },
+ { 187, 187, .raddr_mask = OP_MASK(18), V3D_QPU_A_FLNAFIRST },
+
+ { 187, 187, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FXCD },
+ { 187, 187, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FYCD },
+
+ { 188, 188, .raddr_mask = OP_MASK(0), V3D_QPU_A_LDVPMV_IN, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(1), V3D_QPU_A_LDVPMD_IN, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(2), V3D_QPU_A_LDVPMP, 71 },
+
+ { 188, 188, .raddr_mask = OP_MASK(32), V3D_QPU_A_RECIP, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(33), V3D_QPU_A_RSQRT, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(34), V3D_QPU_A_EXP, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(35), V3D_QPU_A_LOG, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(36), V3D_QPU_A_SIN, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(37), V3D_QPU_A_RSQRT2, 71 },
+
+ { 189, 189, .raddr_mask = ANYOPMASK, V3D_QPU_A_LDVPMG_IN, 71 },
- { 252, 252, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_ITOF },
- { 252, 252, 1 << 3, ANYMUX, V3D_QPU_A_CLZ },
- { 252, 252, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_UTOF },
+ /* The stvpms are distinguished by the waddr field. */
+ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMV, 71 },
+ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMD, 71 },
+ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMP, 71 },
+
+ { 192, 207, .raddr_mask = ANYOPMASK, V3D_QPU_A_FCMP, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FROUND, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FROUND, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FROUND, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FROUND, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(3), V3D_QPU_A_FTOIN, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(7), V3D_QPU_A_FTOIN, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(11), V3D_QPU_A_FTOIN, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(15), V3D_QPU_A_FTOIN, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FTRUNC, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FTRUNC, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FTRUNC, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FTRUNC, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(19), V3D_QPU_A_FTOIZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(23), V3D_QPU_A_FTOIZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(27), V3D_QPU_A_FTOIZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(31), V3D_QPU_A_FTOIZ, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FFLOOR, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FFLOOR, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(40, 42), V3D_QPU_A_FFLOOR, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(44, 46), V3D_QPU_A_FFLOOR, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(35), V3D_QPU_A_FTOUZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(39), V3D_QPU_A_FTOUZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(43), V3D_QPU_A_FTOUZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(47), V3D_QPU_A_FTOUZ, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(48, 50), V3D_QPU_A_FCEIL, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(52, 54), V3D_QPU_A_FCEIL, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(56, 58), V3D_QPU_A_FCEIL, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(60, 62), V3D_QPU_A_FCEIL, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(51), V3D_QPU_A_FTOC, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(55), V3D_QPU_A_FTOC, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(59), V3D_QPU_A_FTOC, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(63), V3D_QPU_A_FTOC, 71 },
+
+ { 246, 246, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FDY, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FDY, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FDY, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FDY, 71 },
+
+ { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 },
+
+ { 247, 247, .raddr_mask = ANYOPMASK, V3D_QPU_A_VPACK, 71 },
+ { 248, 248, .raddr_mask = ANYOPMASK, V3D_QPU_A_V8PACK, 71 },
+
+ { 249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FMOV, 71 },
+
+ { 249, 249, .raddr_mask = OP_MASK(3), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(7), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(11), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 },
+
+ { 250, 250, .raddr_mask = ANYOPMASK, V3D_QPU_A_V10PACK, 71 },
+ { 251, 251, .raddr_mask = ANYOPMASK, V3D_QPU_A_V11FPACK, 71 },
};
-static const struct opcode_desc mul_ops[] = {
- { 1, 1, ANYMUX, ANYMUX, V3D_QPU_M_ADD },
- { 2, 2, ANYMUX, ANYMUX, V3D_QPU_M_SUB },
- { 3, 3, ANYMUX, ANYMUX, V3D_QPU_M_UMUL24 },
- { 4, 8, ANYMUX, ANYMUX, V3D_QPU_M_VFMUL },
- { 9, 9, ANYMUX, ANYMUX, V3D_QPU_M_SMUL24 },
- { 10, 10, ANYMUX, ANYMUX, V3D_QPU_M_MULTOP },
- { 14, 14, ANYMUX, ANYMUX, V3D_QPU_M_FMOV },
- { 15, 15, MUX_MASK(0, 3), ANYMUX, V3D_QPU_M_FMOV },
- { 15, 15, 1 << 4, 1 << 0, V3D_QPU_M_NOP, 0 },
- { 15, 15, 1 << 7, ANYMUX, V3D_QPU_M_MOV },
- { 16, 63, ANYMUX, ANYMUX, V3D_QPU_M_FMUL },
+static const struct opcode_desc mul_ops_v71[] = {
+ /* For V3D 7.1, raddr_mask is the only mask field used */
+ { 1, 1, .raddr_mask = ANYOPMASK, V3D_QPU_M_ADD, 71 },
+ { 2, 2, .raddr_mask = ANYOPMASK, V3D_QPU_M_SUB, 71 },
+ { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 },
+ { 4, 8, .raddr_mask = ANYOPMASK, V3D_QPU_M_VFMUL, 71 },
+ { 9, 9, .raddr_mask = ANYOPMASK, V3D_QPU_M_SMUL24, 71 },
+ { 10, 10, .raddr_mask = ANYOPMASK, V3D_QPU_M_MULTOP, 71 },
+
+ { 14, 14, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_M_FMOV, 71 },
+
+ { 14, 14, .raddr_mask = OP_MASK(3), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(7), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(11), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 },
+
+ { 14, 14, .raddr_mask = OP_MASK(32), V3D_QPU_M_FTOUNORM16, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(33), V3D_QPU_M_FTOSNORM16, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(34), V3D_QPU_M_VFTOUNORM8, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(35), V3D_QPU_M_VFTOSNORM8, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(48), V3D_QPU_M_VFTOUNORM10LO, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(49), V3D_QPU_M_VFTOUNORM10HI, 71 },
+
+ { 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 },
+
+ { 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL, 71 },
};
/* Returns true if op_desc should be filtered out based on devinfo->ver
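Aside, a worked decode to make the dense 245/246 rows above less opaque. For the unary float ops, the v71 encoding moves everything into raddr_b: bits [1:0] carry the output pack for the rounding op (the value 3 in that slot selects the companion conversion op instead), bits [3:2] carry the input unpack, and the higher bits pick the group. That is exactly why the table lists FTRUNC at OP_RANGE(20, 22) and FTOIZ at OP_MASK(23):

    /* opcode 245, raddr_b = 22 = 0b010110:
     *   bits [5:4] = 01 -> FTRUNC/FTOIZ group
     *   bits [1:0] = 10 -> not 3, so FTRUNC with output pack 2
     *   bits [3:2] = 01 -> float32 input unpack code 1
     */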
@@ -589,17 +845,23 @@ static const struct opcode_desc mul_ops[] = {
*/
static bool
opcode_invalid_in_version(const struct v3d_device_info *devinfo,
- const struct opcode_desc *op_desc)
+ const uint8_t first_ver,
+ const uint8_t last_ver)
{
- return (op_desc->first_ver != 0 && devinfo->ver < op_desc->first_ver) ||
- (op_desc->last_ver != 0 && devinfo->ver > op_desc->last_ver);
+ return (first_ver != 0 && devinfo->ver < first_ver) ||
+ (last_ver != 0 && devinfo->ver > last_ver);
}
+/* Note that we pass mux_a, mux_b and raddr as parameters even though,
+ * depending on devinfo->ver, some of them are ignored. This avoids having
+ * two nearly identical lookup_opcode methods.
+ */
static const struct opcode_desc *
lookup_opcode_from_packed(const struct v3d_device_info *devinfo,
const struct opcode_desc *opcodes,
size_t num_opcodes, uint32_t opcode,
- uint32_t mux_a, uint32_t mux_b)
+ uint32_t mux_a, uint32_t mux_b,
+ uint32_t raddr)
{
for (int i = 0; i < num_opcodes; i++) {
const struct opcode_desc *op_desc = &opcodes[i];
@@ -608,14 +870,19 @@ lookup_opcode_from_packed(const struct v3d_device_info *devinfo,
opcode > op_desc->opcode_last)
continue;
- if (opcode_invalid_in_version(devinfo, op_desc))
+ if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver))
continue;
- if (!(op_desc->mux_b_mask & (1 << mux_b)))
- continue;
+ if (devinfo->ver < 71) {
+ if (!(op_desc->mux.b_mask & (1 << mux_b)))
+ continue;
- if (!(op_desc->mux_a_mask & (1 << mux_a)))
- continue;
+ if (!(op_desc->mux.a_mask & (1 << mux_a)))
+ continue;
+ } else {
+ if (!(op_desc->raddr_mask & ((uint64_t) 1 << raddr)))
+ continue;
+ }
return op_desc;
}
@@ -667,6 +934,56 @@ v3d_qpu_float32_unpack_pack(enum v3d_qpu_input_unpack unpacked,
}
}
+static bool
+v3d_qpu_int32_unpack_unpack(uint32_t packed,
+ enum v3d_qpu_input_unpack *unpacked)
+{
+ switch (packed) {
+ case 0:
+ *unpacked = V3D_QPU_UNPACK_NONE;
+ return true;
+ case 1:
+ *unpacked = V3D_QPU_UNPACK_UL;
+ return true;
+ case 2:
+ *unpacked = V3D_QPU_UNPACK_UH;
+ return true;
+ case 3:
+ *unpacked = V3D_QPU_UNPACK_IL;
+ return true;
+ case 4:
+ *unpacked = V3D_QPU_UNPACK_IH;
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool
+v3d_qpu_int32_unpack_pack(enum v3d_qpu_input_unpack unpacked,
+ uint32_t *packed)
+{
+ switch (unpacked) {
+ case V3D_QPU_UNPACK_NONE:
+ *packed = 0;
+ return true;
+ case V3D_QPU_UNPACK_UL:
+ *packed = 1;
+ return true;
+ case V3D_QPU_UNPACK_UH:
+ *packed = 2;
+ return true;
+ case V3D_QPU_UNPACK_IL:
+ *packed = 3;
+ return true;
+ case V3D_QPU_UNPACK_IH:
+ *packed = 4;
+ return true;
+ default:
+ return false;
+ }
+}
+
static bool
v3d_qpu_float16_unpack_unpack(uint32_t packed,
enum v3d_qpu_input_unpack *unpacked)
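Aside, illustration only: the int32 pack/unpack pair added above is an exact inverse over the five valid codes, which is what lets instructions round-trip through disassembly. A self-check sketch that could sit next to these static helpers; the function name is invented:

    /* Sketch: pack(unpack(x)) == x for every valid int32 unpack code. */
    static void
    check_int32_unpack_roundtrip(void)
    {
            for (uint32_t code = 0; code <= 4; code++) {
                    enum v3d_qpu_input_unpack u;
                    uint32_t back;
                    assert(v3d_qpu_int32_unpack_unpack(code, &u));
                    assert(v3d_qpu_int32_unpack_pack(u, &back));
                    assert(back == code);
            }
    }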
@@ -737,8 +1054,8 @@ v3d_qpu_float32_pack_pack(enum v3d_qpu_input_unpack unpacked,
}
static bool
-v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
- struct v3d_qpu_instr *instr)
+v3d33_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
{
uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD);
uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_ADD_A);
@@ -755,8 +1072,9 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
map_op = (map_op - 253 + 245);
const struct opcode_desc *desc =
- lookup_opcode_from_packed(devinfo, add_ops, ARRAY_SIZE(add_ops),
- map_op, mux_a, mux_b);
+ lookup_opcode_from_packed(devinfo, add_ops_v33,
+ ARRAY_SIZE(add_ops_v33),
+ map_op, mux_a, mux_b, 0);
if (!desc)
return false;
@@ -812,12 +1130,12 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
- &instr->alu.add.b_unpack)) {
+ &instr->alu.add.b.unpack)) {
return false;
}
break;
@@ -831,7 +1149,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = mux_b & 0x3;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
break;
@@ -843,7 +1161,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
break;
@@ -851,23 +1169,23 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
case V3D_QPU_A_VFMIN:
case V3D_QPU_A_VFMAX:
if (!v3d_qpu_float16_unpack_unpack(op & 0x7,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
break;
default:
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
break;
}
- instr->alu.add.a = mux_a;
- instr->alu.add.b = mux_b;
+ instr->alu.add.a.mux = mux_a;
+ instr->alu.add.b.mux = mux_b;
instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
instr->alu.add.magic_write = false;
@@ -892,8 +1210,194 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
}
static bool
-v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
+{
+ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD);
+ uint32_t raddr_a = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_A);
+ uint32_t raddr_b = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_B);
+ uint32_t waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
+ uint32_t map_op = op;
+
+ const struct opcode_desc *desc =
+ lookup_opcode_from_packed(devinfo,
+ add_ops_v71,
+ ARRAY_SIZE(add_ops_v71),
+ map_op, 0, 0,
+ raddr_b);
+ if (!desc)
+ return false;
+
+ instr->alu.add.op = desc->op;
+
+ /* FADD/FADDNF and FMIN/FMAX are determined by the order of the
+ * operands.
+ */
+ if (instr->sig.small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a >
+ instr->sig.small_imm_b * 256 + (op & 3) * 64 + raddr_b) {
+ if (instr->alu.add.op == V3D_QPU_A_FMIN)
+ instr->alu.add.op = V3D_QPU_A_FMAX;
+ if (instr->alu.add.op == V3D_QPU_A_FADD)
+ instr->alu.add.op = V3D_QPU_A_FADDNF;
+ }
+
+ /* Some QPU ops require a bit more than the basic opcode and raddr
+ * comparisons to distinguish them.
+ */
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_STVPMV:
+ case V3D_QPU_A_STVPMD:
+ case V3D_QPU_A_STVPMP:
+ switch (waddr) {
+ case 0:
+ instr->alu.add.op = V3D_QPU_A_STVPMV;
+ break;
+ case 1:
+ instr->alu.add.op = V3D_QPU_A_STVPMD;
+ break;
+ case 2:
+ instr->alu.add.op = V3D_QPU_A_STVPMP;
+ break;
+ default:
+ return false;
+ }
+ break;
+ default:
+ break;
+ }
+
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_FADD:
+ case V3D_QPU_A_FADDNF:
+ case V3D_QPU_A_FSUB:
+ case V3D_QPU_A_FMIN:
+ case V3D_QPU_A_FMAX:
+ case V3D_QPU_A_FCMP:
+ case V3D_QPU_A_VFPACK:
+ if (instr->alu.add.op != V3D_QPU_A_VFPACK &&
+ instr->alu.add.op != V3D_QPU_A_FCMP) {
+ instr->alu.add.output_pack = (op >> 4) & 0x3;
+ } else {
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ }
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
+ &instr->alu.add.b.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_FFLOOR:
+ case V3D_QPU_A_FROUND:
+ case V3D_QPU_A_FTRUNC:
+ case V3D_QPU_A_FCEIL:
+ case V3D_QPU_A_FDX:
+ case V3D_QPU_A_FDY:
+ instr->alu.add.output_pack = raddr_b & 0x3;
+
+ if (!v3d_qpu_float32_unpack_unpack((raddr_b >> 2) & 0x3,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_FTOIN:
+ case V3D_QPU_A_FTOIZ:
+ case V3D_QPU_A_FTOUZ:
+ case V3D_QPU_A_FTOC:
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_float32_unpack_unpack((raddr_b >> 2) & 0x3,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_VFMIN:
+ case V3D_QPU_A_VFMAX:
+ unreachable("pending v71 update");
+ if (!v3d_qpu_float16_unpack_unpack(op & 0x7,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+ break;
+
+ case V3D_QPU_A_MOV:
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_int32_unpack_unpack((raddr_b >> 2) & 0x7,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_FMOV:
+ instr->alu.add.output_pack = raddr_b & 0x3;
+
+ /* Mul alu FMOV has one additional variant */
+ int32_t unpack = (raddr_b >> 2) & 0x7;
+ if (unpack == 7)
+ return false;
+
+ if (!v3d_qpu_float32_unpack_unpack(unpack,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ default:
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+ break;
+ }
+
+ instr->alu.add.a.raddr = raddr_a;
+ instr->alu.add.b.raddr = raddr_b;
+ instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
+
+ instr->alu.add.magic_write = false;
+ if (packed_inst & V3D_QPU_MA) {
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_LDVPMV_IN:
+ instr->alu.add.op = V3D_QPU_A_LDVPMV_OUT;
+ break;
+ case V3D_QPU_A_LDVPMD_IN:
+ instr->alu.add.op = V3D_QPU_A_LDVPMD_OUT;
+ break;
+ case V3D_QPU_A_LDVPMG_IN:
+ instr->alu.add.op = V3D_QPU_A_LDVPMG_OUT;
+ break;
+ default:
+ instr->alu.add.magic_write = true;
+ break;
+ }
+ }
+
+ return true;
+}
+
+static bool
+v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
struct v3d_qpu_instr *instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_add_unpack(devinfo, packed_inst, instr);
+ else
+ return v3d71_qpu_add_unpack(devinfo, packed_inst, instr);
+}
+
+static bool
+v3d33_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
{
uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL);
uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_A);
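Aside: the ordering test in v3d71_qpu_add_unpack above is the 7.x form of a long-standing V3D trick worth spelling out. FADD/FADDNF and FMIN/FMAX share opcode ranges, and the hardware tells them apart purely by how the two sources compare as (small_imm flag, unpack, raddr) keys:

    /* fmin rf1, rf2: key_a = 0*256 + unpack_a*64 + 1
     *                key_b = 0*256 + unpack_b*64 + 2
     * With equal unpacks key_a < key_b, so the bits decode as FMIN; swap
     * the operands and the same opcode decodes as FMAX. The v71 packer
     * further below performs the inverse: it swaps raddr_a/raddr_b, the
     * unpack fields and small_imm_a/small_imm_b to force whichever
     * ordering encodes the op it wants.
     */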
@@ -901,9 +1405,10 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
{
const struct opcode_desc *desc =
- lookup_opcode_from_packed(devinfo, mul_ops,
- ARRAY_SIZE(mul_ops),
- op, mux_a, mux_b);
+ lookup_opcode_from_packed(devinfo,
+ mul_ops_v33,
+ ARRAY_SIZE(mul_ops_v33),
+ op, mux_a, mux_b, 0);
if (!desc)
return false;
@@ -915,12 +1420,12 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
- &instr->alu.mul.b_unpack)) {
+ &instr->alu.mul.b.unpack)) {
return false;
}
@@ -931,7 +1436,7 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
((mux_b >> 2) & 1));
if (!v3d_qpu_float32_unpack_unpack(mux_b & 0x3,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
@@ -941,74 +1446,169 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
break;
default:
instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.mul.a_unpack = V3D_QPU_UNPACK_NONE;
- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
break;
}
- instr->alu.mul.a = mux_a;
- instr->alu.mul.b = mux_b;
+ instr->alu.mul.a.mux = mux_a;
+ instr->alu.mul.b.mux = mux_b;
instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M);
instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM;
return true;
}
-static const struct opcode_desc *
-lookup_opcode_from_instr(const struct v3d_device_info *devinfo,
- const struct opcode_desc *opcodes, size_t num_opcodes,
- uint8_t op)
+static bool
+v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
{
- for (int i = 0; i < num_opcodes; i++) {
- const struct opcode_desc *op_desc = &opcodes[i];
-
- if (op_desc->op != op)
- continue;
+ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL);
+ uint32_t raddr_c = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_C);
+ uint32_t raddr_d = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_D);
- if (opcode_invalid_in_version(devinfo, op_desc))
- continue;
+ {
+ const struct opcode_desc *desc =
+ lookup_opcode_from_packed(devinfo,
+ mul_ops_v71,
+ ARRAY_SIZE(mul_ops_v71),
+ op, 0, 0,
+ raddr_d);
+ if (!desc)
+ return false;
- return op_desc;
+ instr->alu.mul.op = desc->op;
}
- return NULL;
-}
-
+ switch (instr->alu.mul.op) {
+ case V3D_QPU_M_FMUL:
+ instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1;
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
+ &instr->alu.mul.b.unpack)) {
+ return false;
+ }
+
+ break;
+
+ case V3D_QPU_M_FMOV:
+ instr->alu.mul.output_pack = raddr_d & 0x3;
+
+ if (!v3d_qpu_float32_unpack_unpack((raddr_d >> 2) & 0x7,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+
+ break;
+
+ case V3D_QPU_M_VFMUL:
+ unreachable("pending v71 update");
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ break;
+
+ case V3D_QPU_M_MOV:
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_int32_unpack_unpack((raddr_d >> 2) & 0x7,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+ break;
+
+ default:
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+ break;
+ }
+
+ instr->alu.mul.a.raddr = raddr_c;
+ instr->alu.mul.b.raddr = raddr_d;
+ instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M);
+ instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM;
+
+ return true;
+}
+
static bool
-v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
- const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_mul_unpack(devinfo, packed_inst, instr);
+ else
+ return v3d71_qpu_mul_unpack(devinfo, packed_inst, instr);
+}
+
+static const struct opcode_desc *
+lookup_opcode_from_instr(const struct v3d_device_info *devinfo,
+ const struct opcode_desc *opcodes, size_t num_opcodes,
+ uint8_t op)
+{
+ for (int i = 0; i < num_opcodes; i++) {
+ const struct opcode_desc *op_desc = &opcodes[i];
+
+ if (op_desc->op != op)
+ continue;
+
+ if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver))
+ continue;
+
+ return op_desc;
+ }
+
+ return NULL;
+}
+
+static bool
+v3d33_qpu_add_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
{
uint32_t waddr = instr->alu.add.waddr;
- uint32_t mux_a = instr->alu.add.a;
- uint32_t mux_b = instr->alu.add.b;
+ uint32_t mux_a = instr->alu.add.a.mux;
+ uint32_t mux_b = instr->alu.add.b.mux;
int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op);
const struct opcode_desc *desc =
- lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops),
+ lookup_opcode_from_instr(devinfo, add_ops_v33,
+ ARRAY_SIZE(add_ops_v33),
instr->alu.add.op);
if (!desc)
return false;
- uint32_t opcode = desc->opcode_first;
+ uint32_t opcode = desc->opcode_first;
/* If an operation doesn't use an arg, its mux values may be used to
* identify the operation type.
*/
if (nsrc < 2)
- mux_b = ffs(desc->mux_b_mask) - 1;
+ mux_b = ffs(desc->mux.b_mask) - 1;
if (nsrc < 1)
- mux_a = ffs(desc->mux_a_mask) - 1;
+ mux_a = ffs(desc->mux.a_mask) - 1;
bool no_magic_write = false;
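Aside, a hand decode of the overloaded mul opcode 14 on 7.x, which mul_ops_v71 above spreads across raddr_d: FMOV keeps its pack/unpack in the low bits, MOV takes the |3 slots, the six new norm-conversion ops sit at 32-35 and 48-49, and NOP is parked at 63:

    /* opcode 14, raddr_d = 6 = 0b000110 -> FMOV, output pack 2,
     *                                      float32 unpack code 1
     * opcode 14, raddr_d = 34            -> VFTOUNORM8
     * opcode 14, raddr_d = 63            -> NOP
     */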
@@ -1061,12 +1661,12 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
opcode |= output_pack << 4;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&a_unpack)) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
&b_unpack)) {
return false;
}
@@ -1100,23 +1700,23 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
uint32_t a_unpack;
uint32_t b_unpack;
- if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS ||
- instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) {
+ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS ||
+ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&a_unpack)) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
&b_unpack)) {
return false;
}
- opcode = (opcode & ~(1 << 2)) | (a_unpack << 2);
- opcode = (opcode & ~(1 << 0)) | (b_unpack << 0);
+ opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2);
+ opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0);
break;
}
@@ -1135,13 +1735,13 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
mux_b |= packed;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
if (packed == 0)
return false;
- opcode = (opcode & ~(1 << 2)) | packed << 2;
+ opcode = (opcode & ~(0x3 << 2)) | packed << 2;
break;
}
@@ -1153,7 +1753,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
return false;
uint32_t packed;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
@@ -1166,11 +1766,11 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
case V3D_QPU_A_VFMIN:
case V3D_QPU_A_VFMAX:
if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE) {
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) {
return false;
}
- if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
@@ -1180,8 +1780,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
default:
if (instr->alu.add.op != V3D_QPU_A_NOP &&
(instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
- instr->alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE)) {
+ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) {
return false;
}
break;
@@ -1198,15 +1798,280 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
static bool
-v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
- const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
{
- uint32_t mux_a = instr->alu.mul.a;
- uint32_t mux_b = instr->alu.mul.b;
+ uint32_t waddr = instr->alu.add.waddr;
+ uint32_t raddr_a = instr->alu.add.a.raddr;
+ uint32_t raddr_b = instr->alu.add.b.raddr;
+
+ int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op);
+ const struct opcode_desc *desc =
+ lookup_opcode_from_instr(devinfo, add_ops_v71,
+ ARRAY_SIZE(add_ops_v71),
+ instr->alu.add.op);
+ if (!desc)
+ return false;
+
+ uint32_t opcode = desc->opcode_first;
+
+ /* If an operation doesn't use an arg, its raddr values may be used to
+ * identify the operation type.
+ */
+ if (nsrc < 2)
+ raddr_b = ffsll(desc->raddr_mask) - 1;
+
+ bool no_magic_write = false;
+
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_STVPMV:
+ waddr = 0;
+ no_magic_write = true;
+ break;
+ case V3D_QPU_A_STVPMD:
+ waddr = 1;
+ no_magic_write = true;
+ break;
+ case V3D_QPU_A_STVPMP:
+ waddr = 2;
+ no_magic_write = true;
+ break;
+
+ case V3D_QPU_A_LDVPMV_IN:
+ case V3D_QPU_A_LDVPMD_IN:
+ case V3D_QPU_A_LDVPMP:
+ case V3D_QPU_A_LDVPMG_IN:
+ assert(!instr->alu.add.magic_write);
+ break;
+
+ case V3D_QPU_A_LDVPMV_OUT:
+ case V3D_QPU_A_LDVPMD_OUT:
+ case V3D_QPU_A_LDVPMG_OUT:
+ assert(!instr->alu.add.magic_write);
+ *packed_instr |= V3D_QPU_MA;
+ break;
+
+ default:
+ break;
+ }
+
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_FADD:
+ case V3D_QPU_A_FADDNF:
+ case V3D_QPU_A_FSUB:
+ case V3D_QPU_A_FMIN:
+ case V3D_QPU_A_FMAX:
+ case V3D_QPU_A_FCMP: {
+ uint32_t output_pack;
+ uint32_t a_unpack;
+ uint32_t b_unpack;
+
+ if (instr->alu.add.op != V3D_QPU_A_FCMP) {
+ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+ &output_pack)) {
+ return false;
+ }
+ opcode |= output_pack << 4;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &a_unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
+ &b_unpack)) {
+ return false;
+ }
+
+ /* These operations with commutative operands are
+ * distinguished by which order their operands come in.
+ */
+ bool ordering =
+ instr->sig.small_imm_a * 256 + a_unpack * 64 + raddr_a >
+ instr->sig.small_imm_b * 256 + b_unpack * 64 + raddr_b;
+ if (((instr->alu.add.op == V3D_QPU_A_FMIN ||
+ instr->alu.add.op == V3D_QPU_A_FADD) && ordering) ||
+ ((instr->alu.add.op == V3D_QPU_A_FMAX ||
+ instr->alu.add.op == V3D_QPU_A_FADDNF) && !ordering)) {
+ uint32_t temp;
+
+ temp = a_unpack;
+ a_unpack = b_unpack;
+ b_unpack = temp;
+
+ temp = raddr_a;
+ raddr_a = raddr_b;
+ raddr_b = temp;
+
+ /* If we are swapping raddr_a/b we also need to swap
+ * small_imm_a/b.
+ */
+ if (instr->sig.small_imm_a || instr->sig.small_imm_b) {
+ assert(instr->sig.small_imm_a !=
+ instr->sig.small_imm_b);
+ struct v3d_qpu_sig new_sig = instr->sig;
+ new_sig.small_imm_a = !instr->sig.small_imm_a;
+ new_sig.small_imm_b = !instr->sig.small_imm_b;
+ uint32_t sig;
+ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
+ return false;
+ *packed_instr &= ~V3D_QPU_SIG_MASK;
+ *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG);
+ }
+ }
+
+ opcode |= a_unpack << 2;
+ opcode |= b_unpack << 0;
+
+ break;
+ }
+
+ case V3D_QPU_A_VFPACK: {
+ uint32_t a_unpack;
+ uint32_t b_unpack;
+
+ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS ||
+ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &a_unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
+ &b_unpack)) {
+ return false;
+ }
+
+ opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2);
+ opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0);
+
+ break;
+ }
+
+ case V3D_QPU_A_FFLOOR:
+ case V3D_QPU_A_FROUND:
+ case V3D_QPU_A_FTRUNC:
+ case V3D_QPU_A_FCEIL:
+ case V3D_QPU_A_FDX:
+ case V3D_QPU_A_FDY: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+ &packed)) {
+ return false;
+ }
+ raddr_b |= packed;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ if (packed == 0)
+ return false;
+ raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2;
+ break;
+ }
+
+ case V3D_QPU_A_FTOIN:
+ case V3D_QPU_A_FTOIZ:
+ case V3D_QPU_A_FTOUZ:
+ case V3D_QPU_A_FTOC:
+ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ uint32_t packed;
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ if (packed == 0)
+ return false;
+
+                raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2;
+
+ break;
+
+ case V3D_QPU_A_VFMIN:
+ case V3D_QPU_A_VFMAX:
+ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) {
+ return false;
+ }
+
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ opcode |= packed;
+ break;
+
+ case V3D_QPU_A_MOV: {
+ uint32_t packed;
+
+ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ if (!v3d_qpu_int32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+
+ raddr_b |= packed << 2;
+ break;
+ }
+
+ case V3D_QPU_A_FMOV: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+ &packed)) {
+ return false;
+ }
+ raddr_b = packed;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ raddr_b |= packed << 2;
+ break;
+ }
+
+ default:
+ if (instr->alu.add.op != V3D_QPU_A_NOP &&
+ (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) {
+ return false;
+ }
+ break;
+ }
+
+ *packed_instr |= QPU_SET_FIELD(raddr_a, V3D_QPU_RADDR_A);
+ *packed_instr |= QPU_SET_FIELD(raddr_b, V3D_QPU_RADDR_B);
+ *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_ADD);
+ *packed_instr |= QPU_SET_FIELD(waddr, V3D_QPU_WADDR_A);
+ if (instr->alu.add.magic_write && !no_magic_write)
+ *packed_instr |= V3D_QPU_MA;
+
+ return true;
+}
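/* Aside, not part of the patch: FADD/FADDNF and FMIN/FMAX share opcodes and
 * are distinguished purely by the encoded order of their commutative
 * operands. A minimal sketch of the ordering key compared above, using the
 * same field weights (small-immediate flag, unpack mode, raddr):
 */
static uint32_t
operand_order_key(bool small_imm, uint32_t unpack, uint32_t raddr)
{
        /* The higher key is steered to the "b" slot for FMIN/FADD and to
         * the "a" slot for FMAX/FADDNF, so each pair decodes unambiguously.
         */
        return small_imm * 256 + unpack * 64 + raddr;
}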
+
+static bool
+v3d33_qpu_mul_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ uint32_t mux_a = instr->alu.mul.a.mux;
+ uint32_t mux_b = instr->alu.mul.b.mux;
int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op);
const struct opcode_desc *desc =
- lookup_opcode_from_instr(devinfo, mul_ops, ARRAY_SIZE(mul_ops),
+ lookup_opcode_from_instr(devinfo, mul_ops_v33,
+ ARRAY_SIZE(mul_ops_v33),
instr->alu.mul.op);
if (!desc)
@@ -1218,10 +2083,10 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
* that here. If mux a/b determine packing, it will be set below.
*/
if (nsrc < 2)
- mux_b = ffs(desc->mux_b_mask) - 1;
+ mux_b = ffs(desc->mux.b_mask) - 1;
if (nsrc < 1)
- mux_a = ffs(desc->mux_a_mask) - 1;
+ mux_a = ffs(desc->mux.a_mask) - 1;
switch (instr->alu.mul.op) {
case V3D_QPU_M_FMUL: {
@@ -1236,13 +2101,13 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
*/
opcode += packed << 4;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
opcode |= packed << 2;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack,
&packed)) {
return false;
}
@@ -1260,7 +2125,7 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
opcode |= (packed >> 1) & 1;
mux_b = (packed & 1) << 2;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
@@ -1274,22 +2139,28 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
return false;
- if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
- if (instr->alu.mul.a_unpack == V3D_QPU_UNPACK_SWAP_16)
+ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16)
opcode = 8;
else
opcode |= (packed + 4) & 7;
- if (instr->alu.mul.b_unpack != V3D_QPU_UNPACK_NONE)
+ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)
return false;
break;
}
default:
+ if (instr->alu.mul.op != V3D_QPU_M_NOP &&
+ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) {
+ return false;
+ }
break;
}
@@ -1304,6 +2175,150 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
return true;
}
+static bool
+v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ uint32_t raddr_c = instr->alu.mul.a.raddr;
+ uint32_t raddr_d = instr->alu.mul.b.raddr;
+ int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op);
+
+ const struct opcode_desc *desc =
+ lookup_opcode_from_instr(devinfo, mul_ops_v71,
+ ARRAY_SIZE(mul_ops_v71),
+ instr->alu.mul.op);
+ if (!desc)
+ return false;
+
+ uint32_t opcode = desc->opcode_first;
+
+ /* Some opcodes have a single valid value for their raddr_d, so set
+         * that here. If raddr_d determines packing, it will be set below.
+ */
+ if (nsrc < 2)
+ raddr_d = ffsll(desc->raddr_mask) - 1;
+
+ switch (instr->alu.mul.op) {
+ case V3D_QPU_M_FMUL: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack,
+ &packed)) {
+ return false;
+ }
+ /* No need for a +1 because desc->opcode_first has a 1 in this
+ * field.
+ */
+ opcode += packed << 4;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+ opcode |= packed << 2;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack,
+ &packed)) {
+ return false;
+ }
+ opcode |= packed << 0;
+ break;
+ }
+
+ case V3D_QPU_M_FMOV: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack,
+ &packed)) {
+ return false;
+ }
+ raddr_d |= packed;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+ raddr_d |= packed << 2;
+ break;
+ }
+
+ case V3D_QPU_M_VFMUL: {
+ unreachable("pending v71 update");
+ uint32_t packed;
+
+ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16)
+ opcode = 8;
+ else
+ opcode |= (packed + 4) & 7;
+
+ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)
+ return false;
+
+ break;
+ }
+
+ case V3D_QPU_M_MOV: {
+ uint32_t packed;
+
+ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ if (!v3d_qpu_int32_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+
+ raddr_d |= packed << 2;
+ break;
+ }
+
+ default:
+ if (instr->alu.mul.op != V3D_QPU_M_NOP &&
+ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) {
+ return false;
+ }
+ break;
+ }
+
+ *packed_instr |= QPU_SET_FIELD(raddr_c, V3D_QPU_RADDR_C);
+ *packed_instr |= QPU_SET_FIELD(raddr_d, V3D_QPU_RADDR_D);
+ *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_MUL);
+ *packed_instr |= QPU_SET_FIELD(instr->alu.mul.waddr, V3D_QPU_WADDR_M);
+ if (instr->alu.mul.magic_write)
+ *packed_instr |= V3D_QPU_MM;
+
+ return true;
+}
+
+static bool
+v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_add_pack(devinfo, instr, packed_instr);
+ else
+ return v3d71_qpu_add_pack(devinfo, instr, packed_instr);
+}
+
+static bool
+v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_mul_pack(devinfo, instr, packed_instr);
+ else
+ return v3d71_qpu_mul_pack(devinfo, instr, packed_instr);
+}
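/* Usage sketch (assumed caller code, not from the patch; assumes a devinfo
 * in scope): callers keep building one generation-agnostic v3d_qpu_instr
 * and the wrappers pick the v3.3..v4.2 or v7.1 encoding from devinfo->ver:
 */
struct v3d_qpu_instr instr = {
        .type = V3D_QPU_INSTR_TYPE_ALU,
        .alu.add.op = V3D_QPU_A_NOP,
        .alu.mul.op = V3D_QPU_M_NOP,
};
uint64_t packed = 0;
if (!v3d_qpu_add_pack(devinfo, &instr, &packed) ||
    !v3d_qpu_mul_pack(devinfo, &instr, &packed))
        return false; /* op not representable on this generation */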
+
static bool
v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo,
uint64_t packed_instr,
@@ -1332,8 +2347,14 @@ v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo,
return false;
}
- instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A);
- instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B);
+        if (devinfo->ver < 71) {
+                /*
+                 * For v71 these are set on add/mul unpack, as the raddrs are
+                 * now part of v3d_qpu_input.
+                 */
+ instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A);
+ instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B);
+ }
if (!v3d_qpu_add_unpack(devinfo, packed_instr, instr))
return false;
@@ -1419,8 +2440,14 @@ v3d_qpu_instr_pack_alu(const struct v3d_device_info *devinfo,
*packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG);
if (instr->type == V3D_QPU_INSTR_TYPE_ALU) {
- *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A);
- *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B);
+ if (devinfo->ver < 71) {
+                        /*
+                         * For v71 these are set on add/mul pack, as the
+                         * raddrs are now part of v3d_qpu_input.
+                         */
+ *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A);
+ *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B);
+ }
if (!v3d_qpu_add_pack(devinfo, instr, packed_instr))
return false;
diff --git a/src/broadcom/qpu/tests/qpu_disasm.c b/src/broadcom/qpu/tests/qpu_disasm.c
index 2f8e19c73fe..be7b78d5ef0 100644
--- a/src/broadcom/qpu/tests/qpu_disasm.c
+++ b/src/broadcom/qpu/tests/qpu_disasm.c
@@ -160,10 +160,10 @@ main(int argc, char **argv)
/* Swap the operands to be sure that we test
* how the QPUs distinguish between these ops.
*/
- swap_mux(&instr.alu.add.a,
- &instr.alu.add.b);
- swap_pack(&instr.alu.add.a_unpack,
- &instr.alu.add.b_unpack);
+ swap_mux(&instr.alu.add.a.mux,
+ &instr.alu.add.b.mux);
+ swap_pack(&instr.alu.add.a.unpack,
+ &instr.alu.add.b.unpack);
break;
default:
break;
diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c
index eea5d3f050e..c4bbd61abc2 100644
--- a/src/broadcom/simulator/v3d_simulator.c
+++ b/src/broadcom/simulator/v3d_simulator.c
@@ -92,6 +92,9 @@ static struct v3d_simulator_state {
/** Last performance monitor ID. */
uint32_t last_perfid;
+ /** Total performance counters */
+ uint32_t perfcnt_total;
+
struct util_dynarray bin_oom;
int refcount;
} sim_state = {
@@ -436,15 +439,15 @@ v3d_simulator_perfmon_switch(int fd, uint32_t perfid)
perfmon = v3d_get_simulator_perfmon(fd, file->active_perfid);
if (perfmon)
- v3d41_simulator_perfmon_stop(sim_state.v3d,
- perfmon->ncounters,
- perfmon->values);
+ v3d_X_simulator(perfmon_stop)(sim_state.v3d,
+ perfmon->ncounters,
+ perfmon->values);
perfmon = v3d_get_simulator_perfmon(fd, perfid);
if (perfmon)
- v3d41_simulator_perfmon_start(sim_state.v3d,
- perfmon->ncounters,
- perfmon->counters);
+ v3d_X_simulator(perfmon_start)(sim_state.v3d,
+ perfmon->ncounters,
+ perfmon->counters);
file->active_perfid = perfid;
}
@@ -489,11 +492,7 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
bin_fd = fd;
v3d_simulator_perfmon_switch(fd, submit->perfmon_id);
-
- if (sim_state.ver >= 41)
- v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
- else
- v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
+ v3d_X_simulator(submit_cl_ioctl)(sim_state.v3d, submit, file->gmp->ofs);
util_dynarray_foreach(&sim_state.bin_oom, struct v3d_simulator_bo *,
sim_bo) {
@@ -632,15 +631,6 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args)
return drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, args);
}
-static int
-v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args)
-{
- if (sim_state.ver >= 41)
- return v3d41_simulator_get_param_ioctl(sim_state.v3d, args);
- else
- return v3d33_simulator_get_param_ioctl(sim_state.v3d, args);
-}
-
static int
v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args)
{
@@ -652,10 +642,7 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args)
v3d_simulator_copy_in_handle(file, args->bo_handles[2]);
v3d_simulator_copy_in_handle(file, args->bo_handles[3]);
- if (sim_state.ver >= 41)
- ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args);
- else
- ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args);
+ ret = v3d_X_simulator(submit_tfu_ioctl)(sim_state.v3d, args);
v3d_simulator_copy_out_handle(file, args->bo_handles[0]);
@@ -682,11 +669,8 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args)
v3d_simulator_perfmon_switch(fd, args->perfmon_id);
- if (sim_state.ver >= 41)
- ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args,
- file->gmp->ofs);
- else
- ret = -1;
+ ret = v3d_X_simulator(submit_csd_ioctl)(sim_state.v3d, args,
+ file->gmp->ofs);
for (int i = 0; i < args->bo_handle_count; i++)
v3d_simulator_copy_out_handle(file, bo_handles[i]);
@@ -716,7 +700,7 @@ v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args)
perfmon->ncounters = args->ncounters;
for (int i = 0; i < args->ncounters; i++) {
- if (args->counters[i] >= V3D_PERFCNT_NUM) {
+ if (args->counters[i] >= sim_state.perfcnt_total) {
ralloc_free(perfmon);
return -EINVAL;
} else {
@@ -797,7 +781,7 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args)
return 0;
case DRM_IOCTL_V3D_GET_PARAM:
- return v3d_simulator_get_param_ioctl(fd, args);
+ return v3d_X_simulator(get_param_ioctl)(sim_state.v3d, args);
case DRM_IOCTL_GEM_CLOSE:
return v3d_simulator_gem_close_ioctl(fd, args);
@@ -880,10 +864,19 @@ v3d_simulator_init_global()
util_dynarray_init(&sim_state.bin_oom, NULL);
- if (sim_state.ver >= 41)
- v3d41_simulator_init_regs(sim_state.v3d);
- else
- v3d33_simulator_init_regs(sim_state.v3d);
+ v3d_X_simulator(init_regs)(sim_state.v3d);
+
+        switch (sim_state.ver) {
+ case 41:
+ case 42:
+ sim_state.perfcnt_total = 87;
+ break;
+ case 71:
+ sim_state.perfcnt_total = 93;
+ break;
+ default:
+ sim_state.perfcnt_total = 0;
+ }
}
struct v3d_simulator_file *
diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h
index ddb079c1455..92305634468 100644
--- a/src/broadcom/simulator/v3d_simulator.h
+++ b/src/broadcom/simulator/v3d_simulator.h
@@ -52,6 +52,32 @@ uint32_t v3d_simulator_get_mem_free(void);
# define v3dX(x) v3d41_##x
# include "v3dx_simulator.h"
# undef v3dX
+
+# define v3dX(x) v3d71_##x
+# include "v3dx_simulator.h"
+# undef v3dX
+
#endif
+/* Helper to call simulator ver specific functions */
+#define v3d_X_simulator(thing) ({ \
+ __typeof(&v3d33_simulator_##thing) v3d_X_sim_thing;\
+ switch (sim_state.ver) { \
+ case 33: \
+ case 40: \
+ v3d_X_sim_thing = &v3d33_simulator_##thing; \
+ break; \
+ case 41: \
+ case 42: \
+ v3d_X_sim_thing = &v3d41_simulator_##thing; \
+ break; \
+ case 71: \
+ v3d_X_sim_thing = &v3d71_simulator_##thing; \
+ break; \
+ default: \
+ unreachable("Unsupported hardware generation"); \
+ } \
+ v3d_X_sim_thing; \
+})
+
#endif
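/* Expansion sketch, for reference: a call such as
 *
 *     ret = v3d_X_simulator(get_param_ioctl)(sim_state.v3d, args);
 *
 * behaves roughly like the if/else ladders it replaces:
 */
int ret;
switch (sim_state.ver) {
case 33:
case 40:
        ret = v3d33_simulator_get_param_ioctl(sim_state.v3d, args);
        break;
case 41:
case 42:
        ret = v3d41_simulator_get_param_ioctl(sim_state.v3d, args);
        break;
case 71:
        ret = v3d71_simulator_get_param_ioctl(sim_state.v3d, args);
        break;
default:
        unreachable("Unsupported hardware generation");
}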
diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c
index c9322f0397b..01cf6b22663 100644
--- a/src/broadcom/simulator/v3dx_simulator.c
+++ b/src/broadcom/simulator/v3dx_simulator.c
@@ -46,11 +46,15 @@
#define HW_REGISTER_RO(x) (x)
#define HW_REGISTER_RW(x) (x)
-#if V3D_VERSION >= 41
-#include "libs/core/v3d/registers/4.1.35.0/v3d.h"
+#if V3D_VERSION == 71
+#include "libs/core/v3d/registers/7.1.5.1/v3d.h"
+#else
+#if V3D_VERSION == 41 || V3D_VERSION == 42
+#include "libs/core/v3d/registers/4.2.14.0/v3d.h"
#else
#include "libs/core/v3d/registers/3.3.0.0/v3d.h"
#endif
+#endif
#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val)
#define V3D_READ(reg) v3d_hw_read_reg(v3d, reg)
@@ -178,38 +182,48 @@ v3d_flush_caches(struct v3d_hw *v3d)
v3d_flush_l2t(v3d);
}
+#if V3D_VERSION < 71
+#define TFU_REG(NAME) V3D_TFU_ ## NAME
+#else
+#define TFU_REG(NAME) V3D_IFC_ ## NAME
+#endif
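/* Expansion note (sketch): on pre-7.1 parts TFU_REG(ICFG) expands to
 * V3D_TFU_ICFG, while on 7.1 the TFU registers moved to the IFC block,
 * so the same token expands to V3D_IFC_ICFG.
 */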
+
int
v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d,
struct drm_v3d_submit_tfu *args)
{
- int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET;
-
- V3D_WRITE(V3D_TFU_IIA, args->iia);
- V3D_WRITE(V3D_TFU_IIS, args->iis);
- V3D_WRITE(V3D_TFU_ICA, args->ica);
- V3D_WRITE(V3D_TFU_IUA, args->iua);
- V3D_WRITE(V3D_TFU_IOA, args->ioa);
- V3D_WRITE(V3D_TFU_IOS, args->ios);
- V3D_WRITE(V3D_TFU_COEF0, args->coef[0]);
- V3D_WRITE(V3D_TFU_COEF1, args->coef[1]);
- V3D_WRITE(V3D_TFU_COEF2, args->coef[2]);
- V3D_WRITE(V3D_TFU_COEF3, args->coef[3]);
-
- V3D_WRITE(V3D_TFU_ICFG, args->icfg);
-
- while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) {
+ int last_vtct = V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET;
+
+ V3D_WRITE(TFU_REG(IIA), args->iia);
+ V3D_WRITE(TFU_REG(IIS), args->iis);
+ V3D_WRITE(TFU_REG(ICA), args->ica);
+ V3D_WRITE(TFU_REG(IUA), args->iua);
+ V3D_WRITE(TFU_REG(IOA), args->ioa);
+#if V3D_VERSION >= 71
+ V3D_WRITE(TFU_REG(IOC), args->v71.ioc);
+#endif
+ V3D_WRITE(TFU_REG(IOS), args->ios);
+ V3D_WRITE(TFU_REG(COEF0), args->coef[0]);
+ V3D_WRITE(TFU_REG(COEF1), args->coef[1]);
+ V3D_WRITE(TFU_REG(COEF2), args->coef[2]);
+ V3D_WRITE(TFU_REG(COEF3), args->coef[3]);
+
+ V3D_WRITE(TFU_REG(ICFG), args->icfg);
+
+ while ((V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET) == last_vtct) {
v3d_hw_tick(v3d);
}
return 0;
}
-#if V3D_VERSION >= 41
int
v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
struct drm_v3d_submit_csd *args,
uint32_t gmp_ofs)
{
+#if V3D_VERSION >= 41
int last_completed_jobs = (V3D_READ(V3D_CSD_0_STATUS) &
V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET);
g_gmp_ofs = gmp_ofs;
@@ -223,6 +237,9 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]);
V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]);
V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]);
+#if V3D_VERSION >= 71
+ V3D_WRITE(V3D_CSD_0_QUEUED_CFG7, 0);
+#endif
/* CFG0 kicks off the job */
V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]);
@@ -239,8 +256,10 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
v3d_flush_caches(v3d);
return 0;
-}
+#else
+ return -1;
#endif
+}
int
v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d,
@@ -310,16 +329,17 @@ v3d_isr_core(struct v3d_hw *v3d,
return;
}
+#if V3D_VERSION <= 42
if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) {
fprintf(stderr, "GMP violation at 0x%08x\n",
V3D_READ(V3D_GMP_VIO_ADDR));
- abort();
} else {
fprintf(stderr,
"Unexpected ISR with core status 0x%08x\n",
core_status);
}
abort();
+#endif
}
static void
@@ -396,6 +416,18 @@ v3d_isr_hub(struct v3d_hw *v3d)
}
handle_mmu_interruptions(v3d, hub_status);
+
+#if V3D_VERSION == 71
+ if (hub_status & V3D_HUB_CTL_INT_STS_INT_GMPV_SET) {
+ fprintf(stderr, "GMP violation at 0x%08x\n",
+ V3D_READ(V3D_GMP_VIO_ADDR));
+ } else {
+ fprintf(stderr,
+ "Unexpected ISR with status 0x%08x\n",
+ hub_status);
+ }
+ abort();
+#endif
}
static void
@@ -436,8 +468,11 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d)
* for tracing. Perhaps we should evaluate to do the same here and add
* some debug options.
*/
- uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET |
- V3D_CTL_0_INT_STS_INT_OUTOMEM_SET);
+ uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_OUTOMEM_SET;
+#if V3D_VERSION <= 42
+ core_interrupts |= V3D_CTL_0_INT_STS_INT_GMPV_SET;
+#endif
+
V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts);
V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts);
@@ -447,6 +482,9 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d)
V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */
V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */
+#if V3D_VERSION == 71
+ hub_interrupts |= V3D_HUB_CTL_INT_STS_INT_GMPV_SET;
+#endif
V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts);
V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts);
@@ -509,7 +547,8 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d,
#define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x))
#define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8)
#define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \
- V3D_PCTR_0_SRC_N_SHIFT(x) + 6))
+ V3D_PCTR_0_SRC_N_SHIFT(x) + \
+ V3D_PCTR_0_SRC_0_3_PCTRS0_MSB))
#endif
void
diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build
index ad032d832ad..182388a35b4 100644
--- a/src/broadcom/vulkan/meson.build
+++ b/src/broadcom/vulkan/meson.build
@@ -27,6 +27,7 @@ v3dv_entrypoints = custom_target(
'--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv',
'--beta', with_vulkan_beta.to_string(),
'--device-prefix', 'ver42',
+ '--device-prefix', 'ver71',
],
depend_files : vk_entrypoints_gen_depend_files,
)
@@ -64,13 +65,11 @@ files_per_version = files(
'v3dvx_pipeline.c',
'v3dvx_meta_common.c',
'v3dvx_pipeline.c',
+ 'v3dvx_query.c',
'v3dvx_queue.c',
)
-# The vulkan driver only supports version >= 42, which is the version present in
-# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d
-# driver.
-v3d_versions = ['42']
+v3d_versions = ['42', '71']
v3dv_flags = []
diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index a14db073b4f..c6462735fe4 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -348,6 +348,7 @@ job_compute_frame_tiling(struct v3dv_job *job,
uint32_t layers,
uint32_t render_target_count,
uint8_t max_internal_bpp,
+ uint8_t total_color_bpp,
bool msaa,
bool double_buffer)
{
@@ -360,13 +361,16 @@ job_compute_frame_tiling(struct v3dv_job *job,
tiling->render_target_count = render_target_count;
tiling->msaa = msaa;
tiling->internal_bpp = max_internal_bpp;
+ tiling->total_color_bpp = total_color_bpp;
tiling->double_buffer = double_buffer;
/* Double-buffer is incompatible with MSAA */
assert(!tiling->msaa || !tiling->double_buffer);
- v3d_choose_tile_size(render_target_count, max_internal_bpp,
- tiling->msaa, tiling->double_buffer,
+ v3d_choose_tile_size(&job->device->devinfo,
+ render_target_count,
+ max_internal_bpp, total_color_bpp, msaa,
+ tiling->double_buffer,
&tiling->tile_width, &tiling->tile_height);
tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
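/* Worked example (hedged; assumes the usual mapping of one word = 32 bits
 * of internal bpp): a subpass with one RGBA8 target (internal 32 bpp,
 * 1 word) and one RGBA16F target (internal 64 bpp, 2 words) yields
 *
 *     max_internal_bpp = 64 bpp (the widest target)
 *     total_color_bpp  = 4 * 1 + 4 * 2 = 12 bytes per pixel
 *
 * so on V3D 7.x the tile size is chosen from the aggregate color budget
 * rather than from the widest render target alone.
 */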
@@ -457,6 +461,7 @@ v3dv_job_start_frame(struct v3dv_job *job,
bool allocate_tile_state_now,
uint32_t render_target_count,
uint8_t max_internal_bpp,
+ uint8_t total_color_bpp,
bool msaa)
{
assert(job);
@@ -467,7 +472,7 @@ v3dv_job_start_frame(struct v3dv_job *job,
const struct v3dv_frame_tiling *tiling =
job_compute_frame_tiling(job, width, height, layers,
render_target_count, max_internal_bpp,
- msaa, false);
+ total_color_bpp, msaa, false);
v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
v3dv_return_if_oom(NULL, job);
@@ -528,6 +533,7 @@ cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
job->frame_tiling.layers,
job->frame_tiling.render_target_count,
job->frame_tiling.internal_bpp,
+ job->frame_tiling.total_color_bpp,
job->frame_tiling.msaa,
true);
@@ -1374,7 +1380,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
}
uint32_t att_count = 0;
- VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */
+ VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */
/* We only need to emit subpass clears as draw calls for color attachments
* if the render area is not aligned to tile boundaries.
@@ -1672,10 +1678,11 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
const struct v3dv_framebuffer *framebuffer = state->framebuffer;
- uint8_t internal_bpp;
+ uint8_t max_internal_bpp, total_color_bpp;
bool msaa;
v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa)
- (framebuffer, state->attachments, subpass, &internal_bpp, &msaa);
+ (framebuffer, state->attachments, subpass,
+ &max_internal_bpp, &total_color_bpp, &msaa);
/* From the Vulkan spec:
*
@@ -1699,7 +1706,8 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
layers,
true, false,
subpass->color_count,
- internal_bpp,
+ max_internal_bpp,
+ total_color_bpp,
msaa);
}
@@ -2062,6 +2070,14 @@ cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
}
}
+ if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BOUNDS)) {
+ if (memcmp(&dest->depth_bounds, &src->depth_bounds,
+ sizeof(src->depth_bounds))) {
+ memcpy(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds));
+ dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS;
+ }
+ }
+
if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) {
if (dest->line_width != src->line_width) {
dest->line_width = src->line_width;
@@ -2131,39 +2147,6 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
}
}
-/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */
-void
-v3dv_viewport_compute_xform(const VkViewport *viewport,
- float scale[3],
- float translate[3])
-{
- float x = viewport->x;
- float y = viewport->y;
- float half_width = 0.5f * viewport->width;
- float half_height = 0.5f * viewport->height;
- double n = viewport->minDepth;
- double f = viewport->maxDepth;
-
- scale[0] = half_width;
- translate[0] = half_width + x;
- scale[1] = half_height;
- translate[1] = half_height + y;
-
- scale[2] = (f - n);
- translate[2] = n;
-
- /* It seems that if the scale is small enough the hardware won't clip
- * correctly so we work around this my choosing the smallest scale that
- * seems to work.
- *
- * This case is exercised by CTS:
- * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero
- */
- const float min_abs_scale = 0.000009f;
- if (fabs(scale[2]) < min_abs_scale)
- scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
-}
-
/* Considers the pipeline's negative_one_to_one state and applies it to the
* current viewport transform if needed to produce the resulting Z translate
* and scale parameters.
@@ -2216,9 +2199,10 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
viewportCount * sizeof(*pViewports));
for (uint32_t i = firstViewport; i < total_count; i++) {
- v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i],
- state->dynamic.viewport.scale[i],
- state->dynamic.viewport.translate[i]);
+ v3dv_X(cmd_buffer->device, viewport_compute_xform)
+ (&state->dynamic.viewport.viewports[i],
+ state->dynamic.viewport.scale[i],
+ state->dynamic.viewport.translate[i]);
}
cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT;
@@ -2699,6 +2683,7 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
true, false,
old_job->frame_tiling.render_target_count,
old_job->frame_tiling.internal_bpp,
+ old_job->frame_tiling.total_color_bpp,
true /* msaa */);
v3dv_job_destroy(old_job);
@@ -2963,6 +2948,9 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS))
v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer);
+ if (*dirty & V3DV_CMD_DIRTY_DEPTH_BOUNDS)
+ v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer);
+
if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS))
v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer);
@@ -3410,9 +3398,11 @@ v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
float minDepthBounds,
float maxDepthBounds)
{
- /* We do not support depth bounds testing so we just ignore this. We are
- * already asserting that pipelines don't enable the feature anyway.
- */
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds;
+ cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds;
+ cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS;
}
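/* Semantics sketch (hypothetical helper, not part of the patch): the
 * depth-bounds test passes when the stored depth attachment value za lies
 * inside the range set by vkCmdSetDepthBounds():
 */
static bool
depth_bounds_pass(const struct v3dv_dynamic_state *dyn, float za)
{
        return za >= dyn->depth_bounds.min && za <= dyn->depth_bounds.max;
}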
VKAPI_ATTR void VKAPI_CALL
@@ -3844,6 +3834,7 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
void
v3dv_cmd_buffer_rewrite_indirect_csd_job(
+ struct v3dv_device *device,
struct v3dv_csd_indirect_cpu_job_info *info,
const uint32_t *wg_counts)
{
@@ -3863,8 +3854,15 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job(
submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
- submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) *
- (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
+ uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) *
+ (wg_counts[0] * wg_counts[1] * wg_counts[2]);
+ /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
+ if (device->devinfo.ver < 71 ||
+ (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
+ submit->cfg[4] = num_batches - 1;
+ } else {
+ submit->cfg[4] = num_batches;
+ }
assert(submit->cfg[4] != ~0);
if (info->needs_wg_uniform_rewrite) {
@@ -3897,6 +3895,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t **wg_uniform_offsets_out,
uint32_t *wg_size_out)
{
+ struct v3dv_device *device = cmd_buffer->device;
struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
struct v3dv_shader_variant *cs_variant =
@@ -3955,18 +3954,26 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
if (wg_size_out)
*wg_size_out = wg_size;
- submit->cfg[4] = num_batches - 1;
+ /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
+ if (device->devinfo.ver < 71 ||
+ (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
+ submit->cfg[4] = num_batches - 1;
+ } else {
+ submit->cfg[4] = num_batches;
+ }
assert(submit->cfg[4] != ~0);
assert(pipeline->shared_data->assembly_bo);
struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo;
submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset;
- submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
if (cs_variant->prog_data.base->single_seg)
submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
if (cs_variant->prog_data.base->threads == 4)
submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
+ /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */
+ if (device->devinfo.ver < 71)
+ submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
if (cs_variant->prog_data.cs->shared_size > 0) {
job->csd.shared_memory =
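/* Both CFG4 writes above apply the same revision-dependent bias; a
 * hypothetical helper capturing the rule (names are illustrative):
 */
static uint32_t
csd_cfg4_batches(const struct v3d_device_info *devinfo,
                 uint32_t wg_size, const uint32_t wg_counts[3])
{
        uint32_t n = DIV_ROUND_UP(wg_size, 16) *
                     wg_counts[0] * wg_counts[1] * wg_counts[2];
        /* V3D 7.1.6 dropped the implicit "minus one" on the batch count. */
        if (devinfo->ver < 71 || (devinfo->ver == 71 && devinfo->rev < 6))
                return n - 1;
        return n;
}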
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index 3bad290e8c5..d013edaa63d 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -91,7 +91,7 @@ static const struct vk_instance_extension_table instance_extensions = {
.KHR_display = true,
.KHR_get_display_properties2 = true,
.EXT_direct_mode_display = true,
- .EXT_acquire_drm_display = true,
+ .EXT_acquire_drm_display = false,
#endif
.KHR_external_fence_capabilities = true,
.KHR_external_memory_capabilities = true,
@@ -214,7 +214,7 @@ get_features(const struct v3dv_physical_device *physical_device,
*features = (struct vk_features) {
/* Vulkan 1.0 */
.robustBufferAccess = true, /* This feature is mandatory */
- .fullDrawIndexUint32 = false, /* Only available since V3D 4.4.9.1 */
+ .fullDrawIndexUint32 = physical_device->devinfo.ver >= 71,
.imageCubeArray = true,
.independentBlend = true,
.geometryShader = true,
@@ -224,10 +224,10 @@ get_features(const struct v3dv_physical_device *physical_device,
.logicOp = true,
.multiDrawIndirect = false,
.drawIndirectFirstInstance = true,
- .depthClamp = false, /* Only available since V3D 4.5.1.1 */
+ .depthClamp = physical_device->devinfo.ver >= 71,
.depthBiasClamp = true,
.fillModeNonSolid = true,
- .depthBounds = false, /* Only available since V3D 4.3.16.2 */
+ .depthBounds = physical_device->devinfo.ver >= 71,
.wideLines = true,
.largePoints = true,
.alphaToOne = true,
@@ -304,7 +304,7 @@ get_features(const struct v3dv_physical_device *physical_device,
* problematic, we would always have to scalarize. Overall, this would
* not lead to best performance so let's just not support it.
*/
- .scalarBlockLayout = false,
+ .scalarBlockLayout = physical_device->devinfo.ver >= 71,
/* This tells applications 2 things:
*
* 1. If they can select just one aspect for barriers. For us barriers
@@ -1123,8 +1123,10 @@ create_physical_device(struct v3dv_instance *instance,
device->next_program_id = 0;
ASSERTED int len =
- asprintf(&device->name, "V3D %d.%d",
- device->devinfo.ver / 10, device->devinfo.ver % 10);
+ asprintf(&device->name, "V3D %d.%d.%d",
+ device->devinfo.ver / 10,
+ device->devinfo.ver % 10,
+ device->devinfo.rev);
assert(len != -1);
v3dv_physical_device_init_disk_cache(device);
@@ -1212,6 +1214,12 @@ create_physical_device(struct v3dv_instance *instance,
list_addtail(&device->vk.link, &instance->vk.physical_devices.list);
+ if (device->devinfo.ver != 42) {
+ fprintf(stderr, "WARNING: v3dv support for hw version %i is neither "
+ "a complete nor a conformant Vulkan implementation. Testing "
+ "use only.\n", device->devinfo.ver);
+ }
+
return VK_SUCCESS;
fail:
@@ -1279,7 +1287,8 @@ enumerate_devices(struct vk_instance *vk_instance)
if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) {
char **compat = devices[i]->deviceinfo.platform->compatible;
while (*compat) {
- if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) {
+ if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 ||
+ strncmp(*compat, "brcm,2712-v3d", 13) == 0) {
v3d_idx = i;
break;
}
@@ -1288,8 +1297,9 @@ enumerate_devices(struct vk_instance *vk_instance)
} else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) {
char **compat = devices[i]->deviceinfo.platform->compatible;
while (*compat) {
- if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
- strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) {
+ if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 ||
+ strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
+ strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) {
vc4_idx = i;
break;
}
@@ -1326,6 +1336,8 @@ v3dv_physical_device_device_id(struct v3dv_physical_device *dev)
switch (dev->devinfo.ver) {
case 42:
return 0xBE485FD3; /* Broadcom deviceID for 2711 */
+ case 71:
+ return 0x55701C33; /* Broadcom deviceID for 2712 */
default:
unreachable("Unsupported V3D version");
}
@@ -1354,6 +1366,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
const VkSampleCountFlags supported_sample_counts =
VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT;
+ const uint8_t max_rts = V3D_MAX_RENDER_TARGETS(pdevice->devinfo.ver);
+
struct timespec clock_res;
clock_getres(CLOCK_MONOTONIC, &clock_res);
const float timestamp_period =
@@ -1424,7 +1438,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.maxFragmentInputComponents = max_varying_components,
.maxFragmentOutputAttachments = 4,
.maxFragmentDualSrcAttachments = 0,
- .maxFragmentCombinedOutputResources = MAX_RENDER_TARGETS +
+ .maxFragmentCombinedOutputResources = max_rts +
MAX_STORAGE_BUFFERS +
MAX_STORAGE_IMAGES,
@@ -1437,7 +1451,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.subPixelPrecisionBits = V3D_COORD_SHIFT,
.subTexelPrecisionBits = 8,
.mipmapPrecisionBits = 8,
- .maxDrawIndexedIndexValue = 0x00ffffff,
+ .maxDrawIndexedIndexValue = pdevice->devinfo.ver >= 71 ?
+ 0xffffffff : 0x00ffffff,
.maxDrawIndirectCount = 0x7fffffff,
.maxSamplerLodBias = 14.0f,
.maxSamplerAnisotropy = 16.0f,
@@ -1464,7 +1479,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.framebufferDepthSampleCounts = supported_sample_counts,
.framebufferStencilSampleCounts = supported_sample_counts,
.framebufferNoAttachmentsSampleCounts = supported_sample_counts,
- .maxColorAttachments = MAX_RENDER_TARGETS,
+ .maxColorAttachments = max_rts,
.sampledImageColorSampleCounts = supported_sample_counts,
.sampledImageIntegerSampleCounts = supported_sample_counts,
.sampledImageDepthSampleCounts = supported_sample_counts,
@@ -2031,7 +2046,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0,
device->instance->default_pipeline_cache_enabled);
device->default_attribute_float =
- v3dv_pipeline_create_default_attribute_values(device, NULL);
+ v3dv_X(device, create_default_attribute_values)(device, NULL);
device->device_address_mem_ctx = ralloc_context(NULL);
util_dynarray_init(&device->device_address_bo_list,
@@ -2975,7 +2990,7 @@ v3dv_CreateSampler(VkDevice _device,
}
}
- v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo, bc_info);
+ v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info);
*pSampler = v3dv_sampler_to_handle(sampler);
diff --git a/src/broadcom/vulkan/v3dv_image.c b/src/broadcom/vulkan/v3dv_image.c
index ebbd60e4c03..e01e2e1bd19 100644
--- a/src/broadcom/vulkan/v3dv_image.c
+++ b/src/broadcom/vulkan/v3dv_image.c
@@ -671,7 +671,6 @@ create_image_view(struct v3dv_device *device,
* makes sense to implement swizzle composition using VkSwizzle directly.
*/
VkFormat format;
- uint8_t image_view_swizzle[4];
if (pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT &&
range->aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
format = VK_FORMAT_R8G8B8A8_UINT;
@@ -682,11 +681,11 @@ create_image_view(struct v3dv_device *device,
vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, view_swizzle);
util_format_compose_swizzles(stencil_aspect_swizzle, view_swizzle,
- image_view_swizzle);
+ iview->view_swizzle);
} else {
format = pCreateInfo->format;
vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle,
- image_view_swizzle);
+ iview->view_swizzle);
}
iview->vk.view_format = format;
@@ -711,7 +710,7 @@ create_image_view(struct v3dv_device *device,
const uint8_t *format_swizzle =
v3dv_get_format_swizzle(device, format, plane);
- util_format_compose_swizzles(format_swizzle, image_view_swizzle,
+ util_format_compose_swizzles(format_swizzle, iview->view_swizzle,
iview->planes[plane].swizzle);
iview->planes[plane].swap_rb = v3dv_format_swizzle_needs_rb_swap(format_swizzle);
diff --git a/src/broadcom/vulkan/v3dv_limits.h b/src/broadcom/vulkan/v3dv_limits.h
index 9cda9f0d6d2..8ac99724105 100644
--- a/src/broadcom/vulkan/v3dv_limits.h
+++ b/src/broadcom/vulkan/v3dv_limits.h
@@ -50,8 +50,6 @@
#define MAX_DYNAMIC_BUFFERS (MAX_DYNAMIC_UNIFORM_BUFFERS + \
MAX_DYNAMIC_STORAGE_BUFFERS)
-#define MAX_RENDER_TARGETS 4
-
#define MAX_MULTIVIEW_VIEW_COUNT 16
/* These are tunable parameters in the HW design, but all the V3D
diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c
index a200298a898..0b64653000d 100644
--- a/src/broadcom/vulkan/v3dv_meta_clear.c
+++ b/src/broadcom/vulkan/v3dv_meta_clear.c
@@ -127,6 +127,7 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
v3dv_job_start_frame(job, width, height, max_layer,
false, true, 1, internal_bpp,
+ 4 * v3d_internal_bpp_words(internal_bpp),
image->vk.samples > VK_SAMPLE_COUNT_1_BIT);
struct v3dv_meta_framebuffer framebuffer;
@@ -747,7 +748,7 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx,
uint32_t bit_offset = 0;
key |= rt_idx;
- bit_offset += 2;
+ bit_offset += 3;
key |= ((uint64_t) format) << bit_offset;
bit_offset += 32;
@@ -1189,9 +1190,11 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer,
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
- /* We can only clear attachments in the current subpass */
- assert(attachmentCount <= 5); /* 4 color + D/S */
+   /* We can have at most one attachment per color RT plus one for D/S */
+ assert(attachmentCount <=
+ V3D_MAX_RENDER_TARGETS(cmd_buffer->device->devinfo.ver) + 1);
+ /* We can only clear attachments in the current subpass */
struct v3dv_render_pass *pass = cmd_buffer->state.pass;
assert(cmd_buffer->state.subpass_idx < pass->subpass_count);
diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c
index c0ec888b8c7..2d30c611e17 100644
--- a/src/broadcom/vulkan/v3dv_meta_copy.c
+++ b/src/broadcom/vulkan/v3dv_meta_copy.c
@@ -453,8 +453,9 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
- v3dv_job_start_frame(job, width, height, num_layers, false, true,
- 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
@@ -1323,8 +1324,8 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
- v3dv_job_start_frame(job, width, height, num_layers,
- false, true, 1, internal_bpp,
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
src->vk.samples > VK_SAMPLE_COUNT_1_BIT);
struct v3dv_meta_framebuffer framebuffer;
@@ -1978,8 +1979,9 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
- v3dv_job_start_frame(job, width, height, num_layers, false, true,
- 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
@@ -4884,8 +4886,9 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
(fb_format, region->srcSubresource.aspectMask,
&internal_type, &internal_bpp);
- v3dv_job_start_frame(job, width, height, num_layers, false, true,
- 1, internal_bpp, true);
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ true);
struct v3dv_meta_framebuffer framebuffer;
v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c
index 20f5014268d..0583faf6f9a 100644
--- a/src/broadcom/vulkan/v3dv_pass.c
+++ b/src/broadcom/vulkan/v3dv_pass.c
@@ -236,11 +236,13 @@ v3dv_CreateRenderPass2(VkDevice _device,
/* GFXH-1461: if depth is cleared but stencil is loaded (or vice versa),
* the clear might get lost. If a subpass has this then we can't emit
- * the clear using the TLB and we have to do it as a draw call.
+    * the clear using the TLB and we have to do it as a draw call. This
+    * issue has been fixed since V3D 4.3.18.
*
* FIXME: separate stencil.
*/
- if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
+ if (device->devinfo.ver == 42 &&
+ subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
struct v3dv_render_pass_attachment *att =
&pass->attachments[subpass->ds_attachment.attachment];
if (att->desc.format == VK_FORMAT_D24_UNORM_S8_UINT) {
@@ -320,11 +322,12 @@ subpass_get_granularity(struct v3dv_device *device,
/* Granularity is defined by the tile size */
assert(subpass_idx < pass->subpass_count);
struct v3dv_subpass *subpass = &pass->subpasses[subpass_idx];
- const uint32_t color_attachment_count = subpass->color_count;
+ const uint32_t color_count = subpass->color_count;
bool msaa = false;
- uint32_t max_bpp = 0;
- for (uint32_t i = 0; i < color_attachment_count; i++) {
+ uint32_t max_internal_bpp = 0;
+ uint32_t total_color_bpp = 0;
+ for (uint32_t i = 0; i < color_count; i++) {
uint32_t attachment_idx = subpass->color_attachments[i].attachment;
if (attachment_idx == VK_ATTACHMENT_UNUSED)
continue;
@@ -337,7 +340,8 @@ subpass_get_granularity(struct v3dv_device *device,
v3dv_X(device, get_internal_type_bpp_for_output_format)
(format->planes[0].rt_type, &internal_type, &internal_bpp);
- max_bpp = MAX2(max_bpp, internal_bpp);
+ max_internal_bpp = MAX2(max_internal_bpp, internal_bpp);
+ total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
if (desc->samples > VK_SAMPLE_COUNT_1_BIT)
msaa = true;
@@ -347,7 +351,8 @@ subpass_get_granularity(struct v3dv_device *device,
* heuristics so we choose a conservative granularity here, with it disabled.
*/
uint32_t width, height;
- v3d_choose_tile_size(color_attachment_count, max_bpp, msaa,
+ v3d_choose_tile_size(&device->devinfo, color_count,
+ max_internal_bpp, total_color_bpp, msaa,
false /* double-buffer */, &width, &height);
*granularity = (VkExtent2D) {
.width = width,
diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c
index 99fe8c16bfa..d6629c9a4a0 100644
--- a/src/broadcom/vulkan/v3dv_pipeline.c
+++ b/src/broadcom/vulkan/v3dv_pipeline.c
@@ -2608,13 +2608,8 @@ v3dv_dynamic_state_mask(VkDynamicState state)
return V3DV_DYNAMIC_LINE_WIDTH;
case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
return V3DV_DYNAMIC_COLOR_WRITE_ENABLE;
-
- /* Depth bounds testing is not available in in V3D 4.2 so here we are just
- * ignoring this dynamic state. We are already asserting at pipeline creation
- * time that depth bounds testing is not enabled.
- */
case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
- return 0;
+ return V3DV_DYNAMIC_DEPTH_BOUNDS;
default:
unreachable("Unhandled dynamic state");
@@ -2632,6 +2627,7 @@ pipeline_init_dynamic_state(
const VkPipelineColorWriteCreateInfoEXT *pColorWriteState)
{
/* Initialize to default values */
+ const struct v3d_device_info *devinfo = &pipeline->device->devinfo;
struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state;
memset(dynamic, 0, sizeof(*dynamic));
dynamic->stencil_compare_mask.front = ~0;
@@ -2639,7 +2635,9 @@ pipeline_init_dynamic_state(
dynamic->stencil_write_mask.front = ~0;
dynamic->stencil_write_mask.back = ~0;
dynamic->line_width = 1.0f;
- dynamic->color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1;
+ dynamic->color_write_enable =
+ (1ull << (4 * V3D_MAX_RENDER_TARGETS(devinfo->ver))) - 1;
+ dynamic->depth_bounds.max = 1.0f;
/* Create a mask of enabled dynamic states */
uint32_t dynamic_states = 0;
@@ -2661,9 +2659,10 @@ pipeline_init_dynamic_state(
pViewportState->viewportCount);
for (uint32_t i = 0; i < dynamic->viewport.count; i++) {
- v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i],
- dynamic->viewport.scale[i],
- dynamic->viewport.translate[i]);
+ v3dv_X(pipeline->device, viewport_compute_xform)
+ (&dynamic->viewport.viewports[i],
+ dynamic->viewport.scale[i],
+ dynamic->viewport.translate[i]);
}
}
@@ -2691,6 +2690,11 @@ pipeline_init_dynamic_state(
dynamic->stencil_reference.front = pDepthStencilState->front.reference;
dynamic->stencil_reference.back = pDepthStencilState->back.reference;
}
+
+ if (!(dynamic_states & V3DV_DYNAMIC_DEPTH_BOUNDS)) {
+ dynamic->depth_bounds.min = pDepthStencilState->minDepthBounds;
+ dynamic->depth_bounds.max = pDepthStencilState->maxDepthBounds;
+ }
}
if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) {
@@ -2802,62 +2806,6 @@ pipeline_set_ez_state(struct v3dv_pipeline *pipeline,
}
}
-static bool
-pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
-{
- for (uint8_t i = 0; i < pipeline->va_count; i++) {
- if (vk_format_is_int(pipeline->va[i].vk_format))
- return true;
- }
- return false;
-}
-
-/* @pipeline can be NULL. We assume in that case that all the attributes have
- * a float format (we only create an all-float BO once and we reuse it with
- * all float pipelines), otherwise we look at the actual type of each
- * attribute used with the specific pipeline passed in.
- */
-struct v3dv_bo *
-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
- struct v3dv_pipeline *pipeline)
-{
- uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
- struct v3dv_bo *bo;
-
- bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
-
- if (!bo) {
- fprintf(stderr, "failed to allocate memory for the default "
- "attribute values\n");
- return NULL;
- }
-
- bool ok = v3dv_bo_map(device, bo, size);
- if (!ok) {
- fprintf(stderr, "failed to map default attribute values buffer\n");
- return false;
- }
-
- uint32_t *attrs = bo->map;
- uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
- for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
- attrs[i * 4 + 0] = 0;
- attrs[i * 4 + 1] = 0;
- attrs[i * 4 + 2] = 0;
- VkFormat attr_format =
- pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
- if (i < va_count && vk_format_is_int(attr_format)) {
- attrs[i * 4 + 3] = 1;
- } else {
- attrs[i * 4 + 3] = fui(1.0);
- }
- }
-
- v3dv_bo_unmap(device, bo);
-
- return bo;
-}
-
static void
pipeline_set_sample_mask(struct v3dv_pipeline *pipeline,
const VkPipelineMultisampleStateCreateInfo *ms_info)
@@ -2960,7 +2908,9 @@ pipeline_init(struct v3dv_pipeline *pipeline,
-   /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that
-    * feature and it shouldn't be used by any pipeline.
-    */
+   /* V3D 4.2 doesn't support depth bounds testing, so on 4.2 we don't
+    * advertise the feature and no pipeline should enable it.
+    */
- assert(!ds_info || !ds_info->depthBoundsTestEnable);
+ assert(device->devinfo.ver >= 71 ||
+ !ds_info || !ds_info->depthBoundsTestEnable);
+ pipeline->depth_bounds_test_enabled = ds_info && ds_info->depthBoundsTestEnable;
enable_depth_bias(pipeline, rs_info);
@@ -2992,9 +2942,10 @@ pipeline_init(struct v3dv_pipeline *pipeline,
v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info);
- if (pipeline_has_integer_vertex_attrib(pipeline)) {
+ if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) {
pipeline->default_attribute_values =
- v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline);
+ v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline);
+
if (!pipeline->default_attribute_values)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
} else {
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index c6707211529..89e2f1c7e5c 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -123,6 +123,9 @@ struct v3d_simulator_file;
/* Minimum required by the Vulkan 1.1 spec */
#define MAX_MEMORY_ALLOCATION_SIZE (1ull << 30)
+/* Maximum performance counters number */
+#define V3D_MAX_PERFCNT 93
+
struct v3dv_physical_device {
struct vk_physical_device vk;
@@ -581,6 +584,10 @@ struct v3dv_device {
    * being float, allowing us to reuse the same BO for all
* pipelines matching this requirement. Pipelines that need integer
* attributes will create their own BO.
+ *
+ * Note that since v71 the default attribute values are not needed, so this
+ *
*/
struct v3dv_bo *default_attribute_float;
@@ -772,6 +779,8 @@ struct v3dv_image_view {
const struct v3dv_format *format;
+ uint8_t view_swizzle[4];
+
uint8_t plane_count;
struct {
uint8_t image_plane;
@@ -782,8 +791,8 @@ struct v3dv_image_view {
uint32_t internal_type;
uint32_t offset;
- /* Precomputed (composed from createinfo->components and formar swizzle)
- * swizzles to pass in to the shader key.
+ /* Precomputed swizzle (composed from the view swizzle and the format
+ * swizzle).
*
* This could be also included on the descriptor bo, but the shader state
* packet doesn't need it on a bo, so we can just avoid a memory copy
@@ -946,6 +955,7 @@ struct v3dv_frame_tiling {
uint32_t layers;
uint32_t render_target_count;
uint32_t internal_bpp;
+ uint32_t total_color_bpp;
bool msaa;
bool double_buffer;
uint32_t tile_width;
@@ -1040,7 +1050,8 @@ enum v3dv_dynamic_state_bits {
V3DV_DYNAMIC_DEPTH_BIAS = 1 << 6,
V3DV_DYNAMIC_LINE_WIDTH = 1 << 7,
V3DV_DYNAMIC_COLOR_WRITE_ENABLE = 1 << 8,
- V3DV_DYNAMIC_ALL = (1 << 9) - 1,
+ V3DV_DYNAMIC_DEPTH_BOUNDS = 1 << 9,
+ V3DV_DYNAMIC_ALL = (1 << 10) - 1,
};
/* Flags for dirty pipeline state.
@@ -1065,6 +1076,7 @@ enum v3dv_cmd_dirty_bits {
V3DV_CMD_DIRTY_LINE_WIDTH = 1 << 16,
V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 17,
V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE = 1 << 18,
+ V3DV_CMD_DIRTY_DEPTH_BOUNDS = 1 << 19,
};
struct v3dv_dynamic_state {
@@ -1101,6 +1113,11 @@ struct v3dv_dynamic_state {
float slope_factor;
} depth_bias;
+ struct {
+ float min;
+ float max;
+ } depth_bounds;
+
float line_width;
uint32_t color_write_enable;
@@ -1196,7 +1213,7 @@ struct v3dv_timestamp_query_cpu_job_info {
};
/* Number of perfmons required to handle all supported performance counters */
-#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_PERFCNT_NUM, \
+#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_MAX_PERFCNT, \
DRM_V3D_MAX_PERF_COUNTERS)
struct v3dv_perf_query {
@@ -1369,6 +1386,7 @@ void v3dv_job_start_frame(struct v3dv_job *job,
bool allocate_tile_state_now,
uint32_t render_target_count,
uint8_t max_internal_bpp,
+ uint8_t total_color_bpp,
bool msaa);
bool v3dv_job_type_is_gpu(struct v3dv_job *job);
@@ -1667,7 +1685,7 @@ struct v3dv_query_pool {
/* Only used with performance queries */
struct {
uint32_t ncounters;
- uint8_t counters[V3D_PERFCNT_NUM];
+ uint8_t counters[V3D_MAX_PERFCNT];
/* V3D has a limit on the number of counters we can track in a
* single performance monitor, so if too many counters are requested
@@ -1803,7 +1821,8 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
struct drm_v3d_submit_tfu *tfu);
-void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info,
+void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_device *device,
+ struct v3dv_csd_indirect_cpu_job_info *info,
const uint32_t *wg_counts);
void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
@@ -2289,11 +2308,15 @@ struct v3dv_pipeline {
unsigned char sha1[20];
/* In general we can reuse v3dv_device->default_attribute_float, so note
- * that the following can be NULL.
+ * that the following can be NULL. In 7.x this is not used, so it will be
+ * NULL.
*
* FIXME: the content of this BO will be small, so it could be improved to
* be uploaded to a common BO. But as in most cases it will be NULL, it is
* not a priority.
*/
struct v3dv_bo *default_attribute_values;
@@ -2323,6 +2346,9 @@ struct v3dv_pipeline {
bool is_z16;
} depth_bias;
+ /* Depth bounds */
+ bool depth_bounds_test_enabled;
+
struct {
void *mem_ctx;
struct util_dynarray data; /* Array of v3dv_pipeline_executable_data */
@@ -2338,6 +2364,13 @@ struct v3dv_pipeline {
uint8_t stencil_cfg[2][V3DV_STENCIL_CFG_LENGTH];
};
+static inline bool
+v3dv_texture_shader_state_has_rb_swap_reverse_bits(const struct v3dv_device *device)
+{
+ return device->devinfo.ver > 71 ||
+ (device->devinfo.ver == 71 && device->devinfo.rev >= 5);
+}
+
static inline VkPipelineBindPoint
v3dv_pipeline_get_binding_point(struct v3dv_pipeline *pipeline)
{
@@ -2500,10 +2533,6 @@ void
v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_cache *cache);
-struct v3dv_bo *
-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
- struct v3dv_pipeline *pipeline);
-
VkResult
v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device,
nir_shader *nir,
@@ -2608,12 +2637,32 @@ u64_compare(const void *key1, const void *key2)
case 42: \
v3d_X_thing = &v3d42_##thing; \
break; \
+ case 71: \
+ v3d_X_thing = &v3d71_##thing; \
+ break; \
default: \
unreachable("Unsupported hardware generation"); \
} \
v3d_X_thing; \
})
+/* Helper to get hw-specific macro values */
+#define V3DV_X(device, thing) ({ \
+ __typeof(V3D42_##thing) V3D_X_THING; \
+ switch (device->devinfo.ver) { \
+ case 42: \
+ V3D_X_THING = V3D42_##thing; \
+ break; \
+ case 71: \
+ V3D_X_THING = V3D71_##thing; \
+ break; \
+ default: \
+ unreachable("Unsupported hardware generation"); \
+ } \
+ V3D_X_THING; \
+})
+
+
/* v3d_macros from common requires v3dX and V3DX definitions. Below we need to
* define v3dX for each version supported, because when we compile code that
@@ -2626,6 +2675,10 @@ u64_compare(const void *key1, const void *key2)
# define v3dX(x) v3d42_##x
# include "v3dvx_private.h"
# undef v3dX
+
+# define v3dX(x) v3d71_##x
+# include "v3dvx_private.h"
+# undef v3dX
#endif
#ifdef ANDROID
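
The V3DV_X() helper above complements the existing v3dv_X() function dispatch: it resolves a per-generation macro value by token-pasting the version prefix and switching on the runtime devinfo.ver. Below is a minimal, self-contained sketch of that pattern; the FOO constants and DEMO_X name are hypothetical, and it relies on the same GNU statement-expression extension the driver itself uses.

#include <assert.h>
#include <stdio.h>

#define V3D42_FOO 256.0f   /* hypothetical per-generation values */
#define V3D71_FOO 64.0f

struct demo_devinfo { int ver; };

/* Statement expression: picks the constant for the running hardware. */
#define DEMO_X(devinfo, thing) ({                  \
   __typeof(V3D42_##thing) _val;                   \
   switch ((devinfo)->ver) {                       \
   case 42: _val = V3D42_##thing; break;           \
   case 71: _val = V3D71_##thing; break;           \
   default: assert(!"unsupported generation");     \
            _val = 0; break;                       \
   }                                               \
   _val;                                           \
})

int main(void)
{
   struct demo_devinfo di = { .ver = 71 };
   printf("%.1f\n", DEMO_X(&di, FOO)); /* prints 64.0 */
   return 0;
}
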
diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c
index 3284c467d74..deb7821f02b 100644
--- a/src/broadcom/vulkan/v3dv_query.c
+++ b/src/broadcom/vulkan/v3dv_query.c
@@ -23,7 +23,6 @@
#include "v3dv_private.h"
-#include "common/v3d_performance_counters.h"
#include "util/timespec.h"
#include "compiler/nir/nir_builder.h"
@@ -48,7 +47,7 @@ kperfmon_create(struct v3dv_device *device,
DRM_IOCTL_V3D_PERFMON_CREATE,
&req);
if (ret)
- fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret));
+ fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret));
pool->queries[query].perf.kperfmon_ids[i] = req.id;
}
@@ -303,7 +302,6 @@ v3dv_CreateQueryPool(VkDevice _device,
QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
assert(pq_info);
- assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM);
pool->perfmon.ncounters = pq_info->counterIndexCount;
for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
@@ -592,7 +590,7 @@ write_performance_query_result(struct v3dv_device *device,
assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
struct v3dv_query *q = &pool->queries[query];
- uint64_t counter_values[V3D_PERFCNT_NUM];
+ uint64_t counter_values[V3D_MAX_PERFCNT];
for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
struct drm_v3d_perfmon_get_values req = {
@@ -1284,40 +1282,11 @@ v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
VkPerformanceCounterKHR *pCounters,
VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
{
- uint32_t desc_count = *pCounterCount;
+ V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice);
- VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
- out, pCounters, pCounterCount);
- VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
- out_desc, pCounterDescriptions, &desc_count);
-
- for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) {
- vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
- counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
- counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
- counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
-
- unsigned char sha1_result[20];
- _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME],
- strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]),
- sha1_result);
-
- memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
- }
-
- vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
- &out_desc, desc) {
- desc->flags = 0;
- snprintf(desc->name, sizeof(desc->name), "%s",
- v3d_performance_counters[i][V3D_PERFCNT_NAME]);
- snprintf(desc->category, sizeof(desc->category), "%s",
- v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]);
- snprintf(desc->description, sizeof(desc->description), "%s",
- v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]);
- }
- }
-
- return vk_outarray_status(&out);
+ return v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount,
+ pCounters,
+ pCounterDescriptions);
}
VKAPI_ATTR void VKAPI_CALL
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index b4aae195180..429d14a9196 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -408,7 +408,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
if (memcmp(group_counts, info->csd_job->csd.wg_count,
sizeof(info->csd_job->csd.wg_count)) != 0) {
- v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);
+ v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts);
}
return VK_SUCCESS;
diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c
index 72fa9a1b39c..0e681cc4ee2 100644
--- a/src/broadcom/vulkan/v3dv_uniforms.c
+++ b/src/broadcom/vulkan/v3dv_uniforms.c
@@ -497,7 +497,8 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect);
struct v3dv_cl_out *uniforms = cl_start(&job->indirect);
-
+ float clipper_xy_granularity =
+ V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY);
for (int i = 0; i < uinfo->count; i++) {
uint32_t data = uinfo->data[i];
@@ -520,11 +521,11 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
break;
case QUNIFORM_VIEWPORT_X_SCALE:
- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f);
+ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity);
break;
case QUNIFORM_VIEWPORT_Y_SCALE:
- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f);
+ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity);
break;
case QUNIFORM_VIEWPORT_Z_OFFSET: {
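
For reference, the granularity this V3DV_X() lookup resolves to is defined later in the patch (v3dvx_private.h): 256.0f on v4.2 and 64.0f on v7.1, i.e. the clipper packets moved from 1/256th-of-pixel to 1/64th-of-pixel units. A small sketch of the conversion, with a made-up demo function:

#include <stdio.h>

#define V3D42_CLIPPER_XY_GRANULARITY 256.0f  /* 1/256th of a pixel */
#define V3D71_CLIPPER_XY_GRANULARITY 64.0f   /* 1/64th of a pixel */

/* Convert a viewport half-width in pixels to the fixed-point value the
 * clipper XY scaling expects on a given generation.
 */
static float
demo_clipper_xy(float half_width_pixels, int ver)
{
   float granularity = ver >= 71 ? V3D71_CLIPPER_XY_GRANULARITY
                                 : V3D42_CLIPPER_XY_GRANULARITY;
   return half_width_pixels * granularity;
}

int main(void)
{
   /* A 1920-wide viewport has a half-width of 960 pixels. */
   printf("v42: %.0f\n", demo_clipper_xy(960.0f, 42)); /* 245760 */
   printf("v71: %.0f\n", demo_clipper_xy(960.0f, 71)); /* 61440 */
   return 0;
}
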
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index f182b790d36..1bd634f5027 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -56,10 +56,15 @@ v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job)
};
config.width_in_pixels = tiling->width;
config.height_in_pixels = tiling->height;
+#if V3D_VERSION == 42
config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr;
cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config);
@@ -82,10 +87,22 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
config.width_in_pixels = tiling->width;
config.height_in_pixels = tiling->height;
+#if V3D_VERSION == 42
config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+      /* FIXME: ideally we would like this assert in the packet header (as it
+       * is generic, so it also applies to GL). We would need to expand
+ * gen_pack_header for that.
+ */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
}
/* There's definitely nothing in the VCD cache we want. */
@@ -345,6 +362,11 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer,
iview->vk.base_array_layer + layer,
image_plane);
+ /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it
+ * is broken in earlier V3D versions.
+ */
+ assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear);
+
cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = buffer;
store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
@@ -467,6 +489,30 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
const VkImageAspectFlags aspects =
vk_format_aspects(ds_attachment->desc.format);
+#if V3D_VERSION <= 42
+ /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
+ * for depth/stencil.
+ *
+ * There used to be some confusion regarding the Clear Tile Buffers
+ * Z/S bit also being broken, but we confirmed with Broadcom that this
+ * is not the case, it was just that some other hardware bugs (that we
+ * need to work around, such as GFXH-1461) could cause this bit to behave
+ * incorrectly.
+ *
+ * There used to be another issue where the RTs bit in the Clear Tile
+ * Buffers packet also cleared Z/S, but Broadcom confirmed this is
+ * fixed since V3D 4.1.
+ *
+ * So if we have to emit a clear of depth or stencil we don't use
+ * the per-buffer store clear bit, even if we need to store the buffers,
+ * instead we always have to use the Clear Tile Buffers Z/S bit.
+ * If we have configured the job to do early Z/S clearing, then we
+ * don't want to emit any Clear Tile Buffers command at all here.
+ *
+ * Note that GFXH-1689 is not reproduced in the simulator, where
+ * using the clear buffer bit in depth/stencil stores works fine.
+ */
+
/* Only clear once on the first subpass that uses the attachment */
uint32_t ds_first_subpass = !state->pass->multiview_enabled ?
ds_attachment->first_subpass :
@@ -486,6 +532,17 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
ds_attachment->desc.stencilLoadOp,
subpass->do_stencil_clear_with_draw);
+ use_global_zs_clear = !state->job->early_zs_clear &&
+ (needs_depth_clear || needs_stencil_clear);
+#endif
+#if V3D_VERSION >= 71
+   /* The store command's clear buffer bit cannot be used for Z/S. Since
+    * V3D 4.5.6, Z/S buffers are automatically cleared between tiles
+    * anyway, so we don't want to emit redundant clears here.
+ */
+ use_global_zs_clear = false;
+#endif
+
/* Skip the last store if it is not required */
uint32_t ds_last_subpass = !pass->multiview_enabled ?
ds_attachment->last_subpass :
@@ -528,30 +585,6 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
needs_stencil_store = subpass->resolve_stencil;
}
- /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
- * for depth/stencil.
- *
- * There used to be some confusion regarding the Clear Tile Buffers
- * Z/S bit also being broken, but we confirmed with Broadcom that this
- * is not the case, it was just that some other hardware bugs (that we
- * need to work around, such as GFXH-1461) could cause this bit to behave
- * incorrectly.
- *
- * There used to be another issue where the RTs bit in the Clear Tile
- * Buffers packet also cleared Z/S, but Broadcom confirmed this is
- * fixed since V3D 4.1.
- *
- * So if we have to emit a clear of depth or stencil we don't use
- * the per-buffer store clear bit, even if we need to store the buffers,
- * instead we always have to use the Clear Tile Buffers Z/S bit.
- * If we have configured the job to do early Z/S clearing, then we
- * don't want to emit any Clear Tile Buffers command at all here.
- *
- * Note that GFXH-1689 is not reproduced in the simulator, where
- * using the clear buffer bit in depth/stencil stores works fine.
- */
- use_global_zs_clear = !state->job->early_zs_clear &&
- (needs_depth_clear || needs_stencil_clear);
if (needs_depth_store || needs_stencil_store) {
const uint32_t zs_buffer =
v3dv_zs_buffer(needs_depth_store, needs_stencil_store);
@@ -649,10 +682,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
* bit and instead we have to emit a single clear of all tile buffers.
*/
if (use_global_zs_clear || use_global_rt_clear) {
+#if V3D_VERSION == 42
cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = use_global_zs_clear;
clear.clear_all_render_targets = use_global_rt_clear;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
+#endif
}
}
@@ -778,6 +816,103 @@ set_rcl_early_z_config(struct v3dv_job *job,
}
}
+/* Note that for v71, render target cfg packets have just one field that
+ * combines the internal type and the clamp mode. For simplicity we keep
+ * just one helper.
+ *
+ * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
+ *
+ * FIXME: for v71 we are not returning all the possible combinations of
+ * render target internal type and clamp. For example, for int types we are
+ * always using clamp int, and for 16f we are using clamp none or pos (which
+ * seem equivalent to no-clamp on 4.2), but not pq or hlg. In summary, right
+ * now we are just porting what we were doing on 4.2.
+ */
+uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ VkFormat vk_format)
+{
+#if V3D_VERSION == 42
+ if (vk_format_is_int(vk_format))
+ return V3D_RENDER_TARGET_CLAMP_INT;
+ else if (vk_format_is_srgb(vk_format))
+ return V3D_RENDER_TARGET_CLAMP_NORM;
+ else
+ return V3D_RENDER_TARGET_CLAMP_NONE;
+#endif
+#if V3D_VERSION >= 71
+ switch (rt_type) {
+ case V3D_INTERNAL_TYPE_8I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
+ case V3D_INTERNAL_TYPE_8UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_8:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8;
+ case V3D_INTERNAL_TYPE_16I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
+ case V3D_INTERNAL_TYPE_16UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_16F:
+ return vk_format_is_srgb(vk_format) ?
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F;
+ case V3D_INTERNAL_TYPE_32I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
+ case V3D_INTERNAL_TYPE_32UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_32F:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
+ default:
+ unreachable("Unknown internal render target type");
+ }
+
+ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
+#endif
+}
+
+static void
+cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer,
+ int rt,
+ uint32_t *rt_bpp,
+#if V3D_VERSION == 42
+ uint32_t *rt_type,
+ uint32_t *rt_clamp)
+#else
+ uint32_t *rt_type_clamp)
+#endif
+{
+ const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+
+ assert(state->subpass_idx < state->pass->subpass_count);
+ const struct v3dv_subpass *subpass =
+ &state->pass->subpasses[state->subpass_idx];
+
+ if (rt >= subpass->color_count)
+ return;
+
+ struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
+ const uint32_t attachment_idx = attachment->attachment;
+ if (attachment_idx == VK_ATTACHMENT_UNUSED)
+ return;
+
+ assert(attachment_idx < state->framebuffer->attachment_count &&
+ attachment_idx < state->attachment_alloc_count);
+ struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
+ assert(vk_format_is_color(iview->vk.format));
+
+ assert(iview->plane_count == 1);
+ *rt_bpp = iview->planes[0].internal_bpp;
+#if V3D_VERSION == 42
+ *rt_type = iview->planes[0].internal_type;
+ *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
+ iview->vk.format);
+#endif
+#if V3D_VERSION >= 71
+ *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
+ iview->vk.format);
+#endif
+}
+
void
v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -824,7 +959,19 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
config.number_of_render_targets = MAX2(subpass->color_count, 1);
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+   /* FIXME: ideally we would like this assert in the packet header (as it
+    * is generic, so it also applies to GL). We would need to expand
+ * gen_pack_header for that.
+ */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
const struct v3dv_image_view *iview =
@@ -851,6 +998,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
* Early-Z/S clearing is independent of Early Z/S testing, so it is
* possible to enable one but not the other so long as their
* respective requirements are met.
+ *
+ * From V3D 4.5.6, Z/S buffers are always cleared automatically
+ * between tiles, but we still want to enable early ZS clears
+ * when Z/S are not loaded or stored.
*/
struct v3dv_render_pass_attachment *ds_attachment =
&pass->attachments[ds_attachment_idx];
@@ -858,21 +1009,33 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
const VkImageAspectFlags ds_aspects =
vk_format_aspects(ds_attachment->desc.format);
- bool needs_depth_clear =
- check_needs_clear(state,
- ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
- ds_attachment->first_subpass,
- ds_attachment->desc.loadOp,
- subpass->do_depth_clear_with_draw);
-
bool needs_depth_store =
v3dv_cmd_buffer_check_needs_store(state,
ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
ds_attachment->last_subpass,
ds_attachment->desc.storeOp) ||
subpass->resolve_depth;
+#if V3D_VERSION <= 42
+ bool needs_depth_clear =
+ check_needs_clear(state,
+ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_attachment->first_subpass,
+ ds_attachment->desc.loadOp,
+ subpass->do_depth_clear_with_draw);
do_early_zs_clear = needs_depth_clear && !needs_depth_store;
+#endif
+#if V3D_VERSION >= 71
+ bool needs_depth_load =
+ v3dv_cmd_buffer_check_needs_load(state,
+ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_attachment->first_subpass,
+ ds_attachment->desc.loadOp,
+ ds_attachment->last_subpass,
+ ds_attachment->desc.storeOp);
+ do_early_zs_clear = !needs_depth_load && !needs_depth_store;
+#endif
+
if (do_early_zs_clear &&
vk_format_has_stencil(ds_attachment->desc.format)) {
bool needs_stencil_load =
@@ -905,10 +1068,20 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
*/
job->early_zs_clear = do_early_zs_clear;
+#if V3D_VERSION >= 71
+ uint32_t base_addr = 0;
+#endif
for (uint32_t i = 0; i < subpass->color_count; i++) {
uint32_t attachment_idx = subpass->color_attachments[i].attachment;
- if (attachment_idx == VK_ATTACHMENT_UNUSED)
+ if (attachment_idx == VK_ATTACHMENT_UNUSED) {
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.render_target_number = i;
+ rt.stride = 1; /* Unused */
+ }
+#endif
continue;
+ }
struct v3dv_image_view *iview =
state->attachments[attachment_idx].image_view;
@@ -920,10 +1093,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
const struct v3d_resource_slice *slice =
&image->planes[plane].slices[iview->vk.base_mip_level];
- const uint32_t *clear_color =
+ UNUSED const uint32_t *clear_color =
&state->attachments[attachment_idx].clear_value.color[0];
- uint32_t clear_pad = 0;
+ UNUSED uint32_t clear_pad = 0;
if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
slice->tiling == V3D_TILING_UIF_XOR) {
int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2;
@@ -937,6 +1110,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
}
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
clear.clear_color_low_32_bits = clear_color[0];
clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
@@ -960,22 +1134,74 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
clear.render_target_number = i;
};
}
+#endif
+
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.clear_color_low_bits = clear_color[0];
+ cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp,
+ &rt.internal_type_and_clamping);
+ rt.stride =
+ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
+ v3d_internal_bpp_words(rt.internal_bpp));
+ rt.base_address = base_addr;
+ rt.render_target_number = i;
+
+ /* base_addr in multiples of 512 bits. We divide by 8 because stride
+ * is in 128-bit units, but it is packing 2 rows worth of data, so we
+ * need to divide it by 2 so it is only 1 row, and then again by 4 so
+ * it is in 512-bit units.
+ */
+ base_addr += (tiling->tile_height * rt.stride) / 8;
+ }
+
+ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+ ((uint64_t) clear_color[1]) |
+ (((uint64_t) (clear_color[2] & 0xff)) << 32);
+ rt.render_target_number = i;
+ }
+ }
+
+ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+ rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+ (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) |
+ (((uint64_t) (clear_color[3])) << 24);
+ rt.render_target_number = i;
+ }
+ }
+#endif
}
+#if V3D_VERSION >= 71
+   /* If we don't have any color RTs, we still need to emit one and flag
+    * it as unused by setting stride = 1.
+ */
+ if (subpass->color_count == 0) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.stride = 1;
+ }
+ }
+#endif
+
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 0, &rt.render_target_0_internal_bpp,
&rt.render_target_0_internal_type, &rt.render_target_0_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 1, &rt.render_target_1_internal_bpp,
&rt.render_target_1_internal_type, &rt.render_target_1_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 2, &rt.render_target_2_internal_bpp,
&rt.render_target_2_internal_type, &rt.render_target_2_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 3, &rt.render_target_3_internal_bpp,
&rt.render_target_3_internal_type, &rt.render_target_3_clamp);
}
+#endif
/* Ends rendering mode config. */
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
@@ -1036,10 +1262,15 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
}
if (cmd_buffer->state.tile_aligned_render_area &&
(i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
+#if V3D_VERSION == 42
cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = !job->early_zs_clear;
clear.clear_all_render_targets = true;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt);
+#endif
}
cl_emit(rcl, END_OF_TILE_MARKER, end);
}
@@ -1054,6 +1285,43 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
cl_emit(rcl, END_OF_RENDERING, end);
}
+void
+v3dX(viewport_compute_xform)(const VkViewport *viewport,
+ float scale[3],
+ float translate[3])
+{
+ float x = viewport->x;
+ float y = viewport->y;
+ float half_width = 0.5f * viewport->width;
+ float half_height = 0.5f * viewport->height;
+ double n = viewport->minDepth;
+ double f = viewport->maxDepth;
+
+ scale[0] = half_width;
+ translate[0] = half_width + x;
+ scale[1] = half_height;
+ translate[1] = half_height + y;
+
+ scale[2] = (f - n);
+ translate[2] = n;
+
+   /* It seems that if the scale is small enough the hardware won't clip
+    * correctly, so we work around this by choosing the smallest scale that
+    * seems to work.
+ *
+ * This case is exercised by CTS:
+ * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero
+ *
+ * V3D 7.x fixes this by using the new
+ * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND.
+ */
+#if V3D_VERSION <= 42
+ const float min_abs_scale = 0.0005f;
+ if (fabs(scale[2]) < min_abs_scale)
+ scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
+#endif
+}
+
void
v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -1078,19 +1346,45 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
v3dv_return_if_oom(cmd_buffer, NULL);
+#if V3D_VERSION == 42
cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+ clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f;
+ clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f;
+ }
+#endif
float translate_z, scale_z;
v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0,
&translate_z, &scale_z);
+#if V3D_VERSION == 42
cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
clip.viewport_z_offset_zc_to_zs = translate_z;
clip.viewport_z_scale_zc_to_zs = scale_z;
}
+#endif
+
+#if V3D_VERSION >= 71
+ /* If the Z scale is too small guardband clipping may not clip correctly */
+ if (fabsf(scale_z) < 0.01f) {
+ cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) {
+ clip.viewport_z_offset_zc_to_zs = translate_z;
+ clip.viewport_z_scale_zc_to_zs = scale_z;
+ }
+ } else {
+ cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
+ clip.viewport_z_offset_zc_to_zs = translate_z;
+ clip.viewport_z_scale_zc_to_zs = scale_z;
+ }
+ }
+#endif
+
cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
/* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled,
* we are using OpenGL's [-1, 1] instead.
@@ -1205,14 +1499,48 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer)
cl_emit(&job->bcl, DEPTH_OFFSET, bias) {
bias.depth_offset_factor = dynamic->depth_bias.slope_factor;
bias.depth_offset_units = dynamic->depth_bias.constant_factor;
+#if V3D_VERSION <= 42
if (pipeline->depth_bias.is_z16)
bias.depth_offset_units *= 256.0f;
+#endif
bias.limit = dynamic->depth_bias.depth_bias_clamp;
}
cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS;
}
+void
+v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   /* No depthBounds support for v42, so this method is empty in that case.
+    *
+    * Note that this method is still called, as v3dv_job_init flags all
+    * state as dirty. See the FIXME note at v3dv_job_init.
+ */
+
+#if V3D_VERSION >= 71
+ struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
+ assert(pipeline);
+
+ if (!pipeline->depth_bounds_test_enabled)
+ return;
+
+ struct v3dv_job *job = cmd_buffer->state.job;
+ assert(job);
+
+ v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS));
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
+ cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) {
+ bounds.lower_test_limit = dynamic->depth_bounds.min;
+ bounds.upper_test_limit = dynamic->depth_bounds.max;
+ }
+
+ cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BOUNDS;
+#endif
+}
+
void
v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -1256,10 +1584,13 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
assert(pipeline);
+ const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo;
+ const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver);
+
const uint32_t blend_packets_size =
cl_packet_length(BLEND_ENABLES) +
cl_packet_length(BLEND_CONSTANT_COLOR) +
- cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS;
+ cl_packet_length(BLEND_CFG) * max_color_rts;
v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size);
v3dv_return_if_oom(cmd_buffer, NULL);
@@ -1271,7 +1602,7 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
}
}
- for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
+ for (uint32_t i = 0; i < max_color_rts; i++) {
if (pipeline->blend.enables & (1 << i))
cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]);
}
@@ -1298,9 +1629,15 @@ v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
+ uint32_t color_write_mask = ~dynamic->color_write_enable |
+ pipeline->blend.color_write_masks;
+#if V3D_VERSION <= 42
+ /* Only 4 RTs */
+ color_write_mask &= 0xffff;
+#endif
+
cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
- mask.mask = (~dynamic->color_write_enable |
- pipeline->blend.color_write_masks) & 0xffff;
+ mask.mask = color_write_mask;
}
cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
@@ -1591,15 +1928,16 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
assert(pipeline);
- bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
-
v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
v3dv_return_if_oom(cmd_buffer, NULL);
cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
+#if V3D_VERSION == 42
+ bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
config.early_z_enable = enable_ez;
config.early_z_updates_enable = config.early_z_enable &&
pipeline->z_updates_enable;
+#endif
}
}
@@ -1845,7 +2183,9 @@ emit_gs_shader_state_record(struct v3dv_job *job,
gs_bin->prog_data.gs->base.threads == 4;
shader.geometry_bin_mode_shader_start_in_final_thread_section =
gs_bin->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
shader.geometry_bin_mode_shader_propagate_nans = true;
+#endif
shader.geometry_bin_mode_shader_uniforms_address =
gs_bin_uniforms;
@@ -1855,7 +2195,9 @@ emit_gs_shader_state_record(struct v3dv_job *job,
gs->prog_data.gs->base.threads == 4;
shader.geometry_render_mode_shader_start_in_final_thread_section =
gs->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
shader.geometry_render_mode_shader_propagate_nans = true;
+#endif
shader.geometry_render_mode_shader_uniforms_address =
gs_render_uniforms;
}
@@ -2031,10 +2373,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
pipeline->vpm_cfg.Gv);
}
+#if V3D_VERSION == 42
struct v3dv_bo *default_attribute_values =
pipeline->default_attribute_values != NULL ?
pipeline->default_attribute_values :
pipeline->device->default_attribute_float;
+#endif
cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
pipeline->shader_state_record, shader) {
@@ -2060,8 +2404,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
+#if V3D_VERSION == 42
shader.address_of_default_attribute_values =
v3dv_cl_address(default_attribute_values, 0);
+#endif
shader.any_shader_reads_hardware_written_primitive_id =
(pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
@@ -2370,40 +2716,3 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
buffer->mem_offset + offset);
}
}
-
-void
-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
- int rt,
- uint32_t *rt_bpp,
- uint32_t *rt_type,
- uint32_t *rt_clamp)
-{
- const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-
- assert(state->subpass_idx < state->pass->subpass_count);
- const struct v3dv_subpass *subpass =
- &state->pass->subpasses[state->subpass_idx];
-
- if (rt >= subpass->color_count)
- return;
-
- struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
- const uint32_t attachment_idx = attachment->attachment;
- if (attachment_idx == VK_ATTACHMENT_UNUSED)
- return;
-
- assert(attachment_idx < state->framebuffer->attachment_count &&
- attachment_idx < state->attachment_alloc_count);
- struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
- assert(vk_format_is_color(iview->vk.format));
-
- assert(iview->plane_count == 1);
- *rt_bpp = iview->planes[0].internal_bpp;
- *rt_type = iview->planes[0].internal_type;
- if (vk_format_is_int(iview->vk.view_format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
- else if (vk_format_is_srgb(iview->vk.view_format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
- else
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
-}
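
The divide-by-8 in the v7.1 render-target setup above is worth spelling out: the stride field is in 128-bit units but covers two rows of tile data, and base addresses are in 512-bit units, so each RT advances the base by tile_height * stride / (2 * 4). A sketch with illustrative numbers only:

#include <stdint.h>
#include <stdio.h>

/* Per the comment in the RCL emission above: divide by 2 to go from a
 * two-row stride to one row, then by 4 to go from 128-bit to 512-bit units.
 */
static uint32_t
demo_next_rt_base(uint32_t base, uint32_t tile_height, uint32_t stride_128b)
{
   return base + (tile_height * stride_128b) / 8;
}

int main(void)
{
   /* Illustrative numbers: a 64-high tile with a 16-unit stride. */
   uint32_t base = 0;
   for (int rt = 0; rt < 3; rt++) {
      printf("rt%d base = %u\n", rt, base); /* 0, 128, 256 */
      base = demo_next_rt_base(base, 64, 16);
   }
   return 0;
}
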
diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c
index e235983864c..1b50d51e19f 100644
--- a/src/broadcom/vulkan/v3dvx_device.c
+++ b/src/broadcom/vulkan/v3dvx_device.c
@@ -49,8 +49,8 @@ vk_to_v3d_compare_func[] = {
[VK_COMPARE_OP_ALWAYS] = V3D_COMPARE_FUNC_ALWAYS,
};
-
static union pipe_color_union encode_border_color(
+ const struct v3dv_device *device,
const VkSamplerCustomBorderColorCreateInfoEXT *bc_info)
{
const struct util_format_description *desc =
@@ -77,12 +77,28 @@ static union pipe_color_union encode_border_color(
* colors so we need to fix up the swizzle manually for this case.
*/
uint8_t swizzle[4];
- if (v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) &&
+ const bool v3d_has_reverse_swap_rb_bits =
+ v3dv_texture_shader_state_has_rb_swap_reverse_bits(device);
+ if (!v3d_has_reverse_swap_rb_bits &&
+ v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) &&
v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle)) {
swizzle[0] = PIPE_SWIZZLE_W;
swizzle[1] = PIPE_SWIZZLE_X;
swizzle[2] = PIPE_SWIZZLE_Y;
swizzle[3] = PIPE_SWIZZLE_Z;
+ }
+   /* In v3d 7.x we no longer have a reverse flag for the border color. We
+    * have to use the new reverse and swap_r/b flags in the texture shader
+    * state instead; these apply the format swizzle automatically when
+    * sampling the border color too, so we should not apply it manually here.
+ */
+ else if (v3d_has_reverse_swap_rb_bits &&
+ (v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle) ||
+ v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle))) {
+ swizzle[0] = PIPE_SWIZZLE_X;
+ swizzle[1] = PIPE_SWIZZLE_Y;
+ swizzle[2] = PIPE_SWIZZLE_Z;
+ swizzle[3] = PIPE_SWIZZLE_W;
} else {
memcpy(swizzle, format->planes[0].swizzle, sizeof (swizzle));
}
@@ -118,7 +134,11 @@ static union pipe_color_union encode_border_color(
(1 << (desc->channel[i].size - 1)) - 1);
}
- /* convert from float to expected format */
+#if V3D_VERSION <= 42
+ /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions
+ * for us. In V3D 4.x we need to manually convert floating point color
+ * values to the expected format.
+ */
if (vk_format_is_srgb(bc_info->format) ||
vk_format_is_compressed(bc_info->format)) {
for (int i = 0; i < 4; i++)
@@ -170,12 +190,14 @@ static union pipe_color_union encode_border_color(
}
}
}
+#endif
return border;
}
void
-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
+v3dX(pack_sampler_state)(const struct v3dv_device *device,
+ struct v3dv_sampler *sampler,
const VkSamplerCreateInfo *pCreateInfo,
const VkSamplerCustomBorderColorCreateInfoEXT *bc_info)
{
@@ -217,7 +239,7 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
s.border_color_mode = border_color_mode;
if (s.border_color_mode == V3D_BORDER_COLOR_FOLLOWS) {
- union pipe_color_union border = encode_border_color(bc_info);
+ union pipe_color_union border = encode_border_color(device, bc_info);
s.border_color_word_0 = border.ui[0];
s.border_color_word_1 = border.ui[1];
@@ -253,11 +275,13 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
const struct v3dv_framebuffer *framebuffer,
const struct v3dv_cmd_buffer_attachment_state *attachments,
const struct v3dv_subpass *subpass,
- uint8_t *max_bpp,
+ uint8_t *max_internal_bpp,
+ uint8_t *total_color_bpp,
bool *msaa)
{
STATIC_ASSERT(V3D_INTERNAL_BPP_32 == 0);
- *max_bpp = V3D_INTERNAL_BPP_32;
+ *max_internal_bpp = V3D_INTERNAL_BPP_32;
+ *total_color_bpp = 0;
*msaa = false;
if (subpass) {
@@ -270,8 +294,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
assert(att);
assert(att->plane_count == 1);
- if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT)
- *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp);
+ if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
+ const uint32_t internal_bpp = att->planes[0].internal_bpp;
+ *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp);
+ *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
+ }
if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
*msaa = true;
@@ -285,7 +312,6 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
*msaa = true;
}
-
return;
}
@@ -295,8 +321,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
assert(att);
assert(att->plane_count == 1);
- if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT)
- *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp);
+ if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
+ const uint32_t internal_bpp = att->planes[0].internal_bpp;
+ *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp);
+ *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
+ }
if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
*msaa = true;
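
A compact sketch of the revision gate driving the border-color path above (mirroring v3dv_texture_shader_state_has_rb_swap_reverse_bits from earlier in the patch): the reverse/swap_r/b texture shader state bits, which also apply to border colors, exist on V3D 7.1 rev 5 and later.

#include <stdbool.h>

struct demo_devinfo { int ver; int rev; };

/* True when the texture shader state has reverse/swap_r/b bits that also
 * affect border colors, so no manual swizzle fix-up is needed.
 */
static bool
demo_has_rb_swap_reverse_bits(const struct demo_devinfo *di)
{
   return di->ver > 71 || (di->ver == 71 && di->rev >= 5);
}
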
diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c
index 80a3e5bfde8..de984e81220 100644
--- a/src/broadcom/vulkan/v3dvx_image.c
+++ b/src/broadcom/vulkan/v3dvx_image.c
@@ -76,8 +76,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]);
tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]);
- tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse;
-
tex.texture_type = image_view->format->planes[plane].tex_type;
if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
@@ -110,8 +108,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64;
- tex.srgb = vk_format_is_srgb(image_view->vk.view_format);
-
/* At this point we don't have the job. That's the reason the first
* parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
* add the bo to the job. This also means that we need to add manually
@@ -122,6 +118,51 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
v3dv_layer_offset(image, 0, image_view->vk.base_array_layer,
iplane);
tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+ bool is_srgb = vk_format_is_srgb(image_view->vk.format);
+
+      /* V3D 4.x doesn't have the reverse and swap_r/b bits, so we compose
+       * the reverse and/or swap_r/b swizzle from the format table with the
+       * image view swizzle. This, however, doesn't work for border colors;
+       * for that there is the reverse_standard_border_color bit.
+       *
+       * In v3d 7.x, however, there is no reverse_standard_border_color bit,
+       * since the reverse and swap_r/b bits also affect border colors. It is
+       * because of this that we absolutely need to use these bits with
+       * reversed and swapped formats, since that's the only way to ensure
+       * correct border colors. In that case we don't want to program the
+       * swizzle to the composition of the format swizzle and the view
+       * swizzle like we do in v3d 4.x, since the format swizzle is applied
+       * via the reverse and swap_r/b bits.
+ */
+#if V3D_VERSION == 42
+ tex.srgb = is_srgb;
+ tex.reverse_standard_border_color =
+ image_view->planes[plane].channel_reverse;
+#endif
+#if V3D_VERSION >= 71
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+
+ tex.reverse = image_view->planes[plane].channel_reverse;
+ tex.r_b_swap = image_view->planes[plane].swap_rb;
+
+ if (tex.reverse || tex.r_b_swap) {
+ tex.swizzle_r =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[0]);
+ tex.swizzle_g =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[1]);
+ tex.swizzle_b =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[2]);
+ tex.swizzle_a =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[3]);
+ }
+
+ tex.chroma_offset_x = 1;
+ tex.chroma_offset_y = 1;
+ /* See comment in XML field definition for rationale of the shifts */
+ tex.texture_base_pointer_cb = base_offset >> 6;
+ tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
}
}
}
@@ -166,7 +207,14 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
assert(buffer_view->format->plane_count == 1);
tex.texture_type = buffer_view->format->planes[0].tex_type;
- tex.srgb = vk_format_is_srgb(buffer_view->vk_format);
+
+ bool is_srgb = vk_format_is_srgb(buffer_view->vk_format);
+#if V3D_VERSION == 42
+ tex.srgb = is_srgb;
+#endif
+#if V3D_VERSION >= 71
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+#endif
/* At this point we don't have the job. That's the reason the first
* parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
@@ -179,5 +227,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
buffer_view->offset;
tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+#if V3D_VERSION >= 71
+ tex.chroma_offset_x = 1;
+ tex.chroma_offset_y = 1;
+ /* See comment in XML field definition for rationale of the shifts */
+ tex.texture_base_pointer_cb = base_offset >> 6;
+ tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
}
}
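
The >> 6 shifts above encode the Cb/Cr texture base pointers in 64-byte units, as the XML field comment referenced in the code explains. A one-function sketch, assuming (as the allocation paths are expected to guarantee) that the byte offset is 64-byte aligned:

#include <assert.h>
#include <stdint.h>

/* Pack a byte offset into the 64-byte-unit form the cb/cr base pointer
 * fields expect; the offset must be 64-byte aligned.
 */
static uint32_t
demo_pack_cbcr_base(uint64_t base_offset)
{
   assert((base_offset & 63) == 0);
   return (uint32_t)(base_offset >> 6);
}
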
diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
index 04147b82cbd..858096f9e4b 100644
--- a/src/broadcom/vulkan/v3dvx_meta_common.c
+++ b/src/broadcom/vulkan/v3dvx_meta_common.c
@@ -26,6 +26,7 @@
#include "broadcom/common/v3d_macros.h"
#include "broadcom/common/v3d_tfu.h"
+#include "broadcom/common/v3d_util.h"
#include "broadcom/cle/v3dx_pack.h"
#include "broadcom/compiler/v3d_compiler.h"
@@ -58,12 +59,25 @@ emit_rcl_prologue(struct v3dv_job *job,
config.number_of_render_targets = 1;
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+   /* FIXME: ideally we would like this assert in the packet header (as it
+    * is generic, so it also applies to GL). We would need to expand
+ * gen_pack_header for that.
+ */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
config.internal_depth_type = fb->internal_depth_type;
}
+ const uint32_t *color = NULL;
if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) {
- uint32_t clear_pad = 0;
+ UNUSED uint32_t clear_pad = 0;
if (clear_info->image) {
const struct v3dv_image *image = clear_info->image;
@@ -88,7 +102,9 @@ emit_rcl_prologue(struct v3dv_job *job,
}
}
- const uint32_t *color = &clear_info->clear_value->color[0];
+ color = &clear_info->clear_value->color[0];
+
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
clear.clear_color_low_32_bits = color[0];
clear.clear_color_next_24_bits = color[1] & 0x00ffffff;
@@ -112,13 +128,49 @@ emit_rcl_prologue(struct v3dv_job *job,
clear.render_target_number = 0;
};
}
+#endif
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
rt.render_target_0_internal_bpp = tiling->internal_bpp;
rt.render_target_0_internal_type = fb->internal_type;
rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
}
+#endif
+
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ if (color)
+ rt.clear_color_low_bits = color[0];
+ rt.internal_bpp = tiling->internal_bpp;
+ rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type,
+ fb->vk_format);
+ rt.stride =
+ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
+ v3d_internal_bpp_words(rt.internal_bpp));
+ rt.base_address = 0;
+ rt.render_target_number = 0;
+ }
+
+ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+ ((uint64_t) color[1]) |
+ (((uint64_t) (color[2] & 0xff)) << 32);
+ rt.render_target_number = 0;
+ }
+ }
+
+ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_128) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+ rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+ (((uint64_t) (color[2] & 0xffffff00)) >> 8) |
+ (((uint64_t) (color[3])) << 24);
+ rt.render_target_number = 0;
+ }
+ }
+#endif
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f;
@@ -179,10 +231,15 @@ emit_frame_setup(struct v3dv_job *job,
*/
if (clear_value &&
(i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
+#if V3D_VERSION == 42
cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = true;
clear.clear_all_render_targets = true;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear);
+#endif
}
cl_emit(rcl, END_OF_TILE_MARKER, end);
}
@@ -893,6 +950,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
tfu.iia |= src_offset;
+#if V3D_VERSION <= 42
if (src_tiling == V3D_TILING_RASTER) {
tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT;
} else {
@@ -901,12 +959,46 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
V3D33_TFU_ICFG_FORMAT_SHIFT;
}
tfu.icfg |= format_plane->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT;
+#endif
+#if V3D_VERSION >= 71
+ if (src_tiling == V3D_TILING_RASTER) {
+ tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT;
+ } else {
+ tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE +
+ (src_tiling - V3D_TILING_LINEARTILE)) <<
+ V3D71_TFU_ICFG_IFORMAT_SHIFT;
+ }
+ tfu.icfg |= format_plane->tex_type << V3D71_TFU_ICFG_OTYPE_SHIFT;
+#endif
tfu.ioa = dst_offset;
+#if V3D_VERSION <= 42
tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE +
(dst_tiling - V3D_TILING_LINEARTILE)) <<
V3D33_TFU_IOA_FORMAT_SHIFT;
+#endif
+
+#if V3D_VERSION >= 71
+ tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE +
+ (dst_tiling - V3D_TILING_LINEARTILE)) <<
+ V3D71_TFU_IOC_FORMAT_SHIFT;
+
+ switch (dst_tiling) {
+ case V3D_TILING_UIF_NO_XOR:
+ case V3D_TILING_UIF_XOR:
+ tfu.v71.ioc |=
+ (dst_padded_height_or_stride / (2 * v3d_utile_height(dst_cpp))) <<
+ V3D71_TFU_IOC_STRIDE_SHIFT;
+ break;
+ case V3D_TILING_RASTER:
+ tfu.v71.ioc |= (dst_padded_height_or_stride / dst_cpp) <<
+ V3D71_TFU_IOC_STRIDE_SHIFT;
+ break;
+ default:
+ break;
+ }
+#endif
switch (src_tiling) {
case V3D_TILING_UIF_NO_XOR:
@@ -923,6 +1015,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
/* The TFU can handle raster sources but always produces UIF results */
assert(dst_tiling != V3D_TILING_RASTER);
+#if V3D_VERSION <= 42
/* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
* OPAD field for the destination (how many extra UIF blocks beyond
* those necessary to cover the height).
@@ -934,6 +1027,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
uif_block_h;
tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT;
}
+#endif
v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
}
@@ -1314,8 +1408,9 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t width, height;
framebuffer_size_for_pixel_count(num_items, &width, &height);
- v3dv_job_start_frame(job, width, height, 1, true, true,
- 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, 1, true, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type,
@@ -1361,8 +1456,9 @@ v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t width, height;
framebuffer_size_for_pixel_count(num_items, &width, &height);
- v3dv_job_start_frame(job, width, height, 1, true, true,
- 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, 1, true, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT,
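
The v7.1 IOC stride selection in meta_emit_tfu_job above packs a per-tiling stride into the output config. A sketch of the two cases; demo_utile_height() is a stand-in for v3d_utile_height() with illustrative values, and the raster case assumes padded_height_or_stride carries a byte stride:

#include <stdint.h>

enum demo_tiling { DEMO_RASTER, DEMO_UIF };

/* Stub for v3d_utile_height(); the real value depends on cpp. */
static uint32_t demo_utile_height(uint32_t cpp) { return cpp == 4 ? 4 : 8; }

/* Mirror of the v71 IOC stride selection above: UIF destinations express
 * the stride in UIF blocks (two utiles high), raster destinations in pixels.
 */
static uint32_t
demo_ioc_stride(enum demo_tiling t, uint32_t padded_height_or_stride,
                uint32_t cpp)
{
   switch (t) {
   case DEMO_UIF:
      return padded_height_or_stride / (2 * demo_utile_height(cpp));
   case DEMO_RASTER:
      return padded_height_or_stride / cpp; /* byte stride -> pixels */
   }
   return 0;
}
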
diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
index 5d32d414ed8..ad22add155d 100644
--- a/src/broadcom/vulkan/v3dvx_pipeline.c
+++ b/src/broadcom/vulkan/v3dvx_pipeline.c
@@ -227,6 +227,45 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
ds_info ? ds_info->stencilTestEnable && has_ds_attachment: false;
pipeline->z_updates_enable = config.z_updates_enable;
+
+#if V3D_VERSION >= 71
+ /* From the Vulkan spec:
+ *
+ * "depthClampEnable controls whether to clamp the fragments depth
+ * values as described in Depth Test. If the pipeline is not created
+ * with VkPipelineRasterizationDepthClipStateCreateInfoEXT present
+ * then enabling depth clamp will also disable clipping primitives to
+ * the z planes of the frustrum as described in Primitive Clipping.
+ * Otherwise depth clipping is controlled by the state set in
+ * VkPipelineRasterizationDepthClipStateCreateInfoEXT."
+ *
+ * Note: neither depth clamping nor VK_EXT_depth_clip_enable are actually
+ * supported in the driver yet, so in practice we are always enabling Z
+ * clipping for now.
+ */
+ bool z_clamp_enable = rs_info && rs_info->depthClampEnable;
+ bool z_clip_enable = false;
+ const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info =
+ ds_info ? vk_find_struct_const(ds_info->pNext,
+ PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT) :
+ NULL;
+ if (clip_info)
+ z_clip_enable = clip_info->depthClipEnable;
+ else if (!z_clamp_enable)
+ z_clip_enable = true;
+
+ if (z_clip_enable) {
+ config.z_clipping_mode = pipeline->negative_one_to_one ?
+ V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_ZERO_TO_ONE;
+ } else {
+ config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE;
+ }
+
+ config.z_clamp_mode = z_clamp_enable;
+
+ config.depth_bounds_test_enable =
+ ds_info && ds_info->depthBoundsTestEnable && has_ds_attachment;
+#endif
};
}
@@ -360,7 +399,7 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline,
static void
pack_shader_state_record(struct v3dv_pipeline *pipeline)
{
- assert(sizeof(pipeline->shader_state_record) ==
+ assert(sizeof(pipeline->shader_state_record) >=
cl_packet_length(GL_SHADER_STATE_RECORD));
struct v3d_fs_prog_data *prog_data_fs =
@@ -435,15 +474,16 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
shader.number_of_varyings_in_fragment_shader =
prog_data_fs->num_inputs;
- shader.coordinate_shader_propagate_nans = true;
- shader.vertex_shader_propagate_nans = true;
- shader.fragment_shader_propagate_nans = true;
-
/* Note: see previous note about addresses */
/* shader.coordinate_shader_code_address */
/* shader.vertex_shader_code_address */
/* shader.fragment_shader_code_address */
+#if V3D_VERSION == 42
+ shader.coordinate_shader_propagate_nans = true;
+ shader.vertex_shader_propagate_nans = true;
+ shader.fragment_shader_propagate_nans = true;
+
/* FIXME: Use combined input/output size flag in the common case (also
* on v3d, see v3dx_draw).
*/
@@ -451,13 +491,25 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
prog_data_vs_bin->separate_segments;
shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
prog_data_vs->separate_segments;
-
shader.coordinate_shader_input_vpm_segment_size =
prog_data_vs_bin->separate_segments ?
prog_data_vs_bin->vpm_input_size : 1;
shader.vertex_shader_input_vpm_segment_size =
prog_data_vs->separate_segments ?
prog_data_vs->vpm_input_size : 1;
+#endif
+
+   /* On V3D 7.1 there isn't a specific flag to indicate whether we are
+    * using shared or separate segments. We just set vpm_input_size to 0 and
+    * set the output to the max needed; that should already be properly set
+    * on prog_data_vs_bin.
+ */
+#if V3D_VERSION == 71
+ shader.coordinate_shader_input_vpm_segment_size =
+ prog_data_vs_bin->vpm_input_size;
+ shader.vertex_shader_input_vpm_segment_size =
+ prog_data_vs->vpm_input_size;
+#endif
shader.coordinate_shader_output_vpm_segment_size =
prog_data_vs_bin->vpm_output_size;
@@ -659,3 +711,76 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
}
}
}
+
+#if V3D_VERSION == 42
+static bool
+pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
+{
+ for (uint8_t i = 0; i < pipeline->va_count; i++) {
+ if (vk_format_is_int(pipeline->va[i].vk_format))
+ return true;
+ }
+ return false;
+}
+#endif
+
+bool
+v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline)
+{
+#if V3D_VERSION == 42
+ return pipeline_has_integer_vertex_attrib(pipeline);
+#endif
+
+ return false;
+}
+
+/* @pipeline can be NULL, in which case we assume the most common case. For
+ * example, for v42 we then assume that all the attributes have a float
+ * format (we create an all-float BO once and reuse it with every float-only
+ * pipeline); otherwise we look at the actual type of each attribute used
+ * with the specific pipeline passed in.
+ */
+struct v3dv_bo *
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
+ struct v3dv_pipeline *pipeline)
+{
+#if V3D_VERSION >= 71
+ return NULL;
+#endif
+
+ uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
+ struct v3dv_bo *bo;
+
+ bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
+
+ if (!bo) {
+ fprintf(stderr, "failed to allocate memory for the default "
+ "attribute values\n");
+ return NULL;
+ }
+
+ bool ok = v3dv_bo_map(device, bo, size);
+ if (!ok) {
+ fprintf(stderr, "failed to map default attribute values buffer\n");
+ return NULL;
+ }
+
+ uint32_t *attrs = bo->map;
+ uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
+ for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
+ attrs[i * 4 + 0] = 0;
+ attrs[i * 4 + 1] = 0;
+ attrs[i * 4 + 2] = 0;
+ VkFormat attr_format =
+ pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
+ if (i < va_count && vk_format_is_int(attr_format)) {
+ attrs[i * 4 + 3] = 1;
+ } else {
+ attrs[i * 4 + 3] = fui(1.0);
+ }
+ }
+
+ v3dv_bo_unmap(device, bo);
+
+ return bo;
+}
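
The default-attribute BO filled above stores one (0, 0, 0, 1) vec4 per attribute, where the meaning of the trailing 1 depends on the attribute type. A sketch of that last-word selection; demo_fui() mimics Mesa's fui() bit-cast:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Bit-cast a float to its IEEE-754 representation, like Mesa's fui(). */
static uint32_t
demo_fui(float f)
{
   uint32_t u;
   memcpy(&u, &f, sizeof(u));
   return u;
}

/* Integer attributes default to the integer 1; float attributes to the bit
 * pattern of 1.0f (0x3f800000), as in the fill loop above.
 */
static uint32_t
demo_default_w(bool is_int_format)
{
   return is_int_format ? 1u : demo_fui(1.0f);
}
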
diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
index ad8ddfa5731..0f5887eab93 100644
--- a/src/broadcom/vulkan/v3dvx_private.h
+++ b/src/broadcom/vulkan/v3dvx_private.h
@@ -54,6 +54,9 @@ v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer);
void
v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer);
+void
+v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer);
+
void
v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer);
@@ -125,17 +128,11 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color,
uint32_t internal_size,
uint32_t *hw_color);
-void
-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
- int rt,
- uint32_t *rt_bpp,
- uint32_t *rt_type,
- uint32_t *rt_clamp);
-
/* Used at v3dv_device */
void
-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
+v3dX(pack_sampler_state)(const struct v3dv_device *device,
+ struct v3dv_sampler *sampler,
const VkSamplerCreateInfo *pCreateInfo,
const VkSamplerCustomBorderColorCreateInfoEXT *bc_info);
@@ -143,7 +140,9 @@ void
v3dX(framebuffer_compute_internal_bpp_msaa)(const struct v3dv_framebuffer *framebuffer,
const struct v3dv_cmd_buffer_attachment_state *attachments,
const struct v3dv_subpass *subpass,
- uint8_t *max_bpp, bool *msaa);
+ uint8_t *max_internal_bpp,
+ uint8_t *total_color_bpp,
+ bool *msaa);
#ifdef DEBUG
void
@@ -313,10 +312,24 @@ void
v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
const VkPipelineVertexInputStateCreateInfo *vi_info,
const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info);
+
+bool
+v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline);
+
+struct v3dv_bo *
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
+ struct v3dv_pipeline *pipeline);
+
/* Used at v3dv_queue */
void
v3dX(job_emit_noop)(struct v3dv_job *job);
+/* Used at v3dv_query */
+VkResult
+v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount,
+ VkPerformanceCounterKHR *pCounters,
+ VkPerformanceCounterDescriptionKHR *pCounterDescriptions);
+
/* Used at v3dv_descriptor_set, and other descriptor set utils */
uint32_t v3dX(descriptor_bo_size)(VkDescriptorType type);
@@ -325,3 +338,17 @@ uint32_t v3dX(max_descriptor_bo_size)(void);
uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane);
uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane);
+
+/* General utils */
+
+uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ VkFormat vk_format);
+
+#define V3D42_CLIPPER_XY_GRANULARITY 256.0f
+#define V3D71_CLIPPER_XY_GRANULARITY 64.0f
+
+void
+v3dX(viewport_compute_xform)(const VkViewport *viewport,
+ float scale[3],
+ float translate[3]);
diff --git a/src/broadcom/vulkan/v3dvx_query.c b/src/broadcom/vulkan/v3dvx_query.c
new file mode 100644
index 00000000000..e59a1e84ff6
--- /dev/null
+++ b/src/broadcom/vulkan/v3dvx_query.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright © 2023 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3dv_private.h"
+
+#include "common/v3d_performance_counters.h"
+
+VkResult
+v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount,
+ VkPerformanceCounterKHR *pCounters,
+ VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
+{
+ uint32_t desc_count = *pCounterCount;
+
+ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
+ out, pCounters, pCounterCount);
+ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
+ out_desc, pCounterDescriptions, &desc_count);
+
+ for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) {
+ vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
+ counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
+ counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
+ counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
+
+ unsigned char sha1_result[20];
+ _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME],
+ strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]),
+ sha1_result);
+
+ memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
+ }
+
+ vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
+ &out_desc, desc) {
+ desc->flags = 0;
+ snprintf(desc->name, sizeof(desc->name), "%s",
+ v3d_performance_counters[i][V3D_PERFCNT_NAME]);
+ snprintf(desc->category, sizeof(desc->category), "%s",
+ v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]);
+ snprintf(desc->description, sizeof(desc->description), "%s",
+ v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]);
+ }
+ }
+
+ return vk_outarray_status(&out);
+}
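
The VK_OUTARRAY_MAKE_TYPED/vk_outarray_append_typed machinery above implements the standard Vulkan two-call enumeration idiom. A minimal sketch of that idiom with illustrative names (not the Mesa API): a NULL output array means "report the count", a non-NULL array is filled with up to *count entries, and an incomplete fill is signalled to the caller:

#include <stdint.h>

/* Returns 0 on success, -1 when the caller's buffer was too small
 * (the analogue of VK_INCOMPLETE).
 */
static int enumerate(uint32_t *count, uint32_t *items,
                     const uint32_t *src, uint32_t src_count)
{
   if (!items) {
      *count = src_count;   /* first call: just report how many */
      return 0;
   }
   uint32_t n = *count < src_count ? *count : src_count;
   for (uint32_t i = 0; i < n; i++)
      items[i] = src[i];    /* second call: fill what fits */
   *count = n;
   return n < src_count ? -1 : 0;
}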
diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c
index efe63de425c..6eed2de9d54 100644
--- a/src/broadcom/vulkan/v3dvx_queue.c
+++ b/src/broadcom/vulkan/v3dvx_queue.c
@@ -29,7 +29,8 @@
void
v3dX(job_emit_noop)(struct v3dv_job *job)
{
- v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, V3D_INTERNAL_BPP_32, false);
+ v3dv_job_start_frame(job, 1, 1, 1, true, true, 1,
+ V3D_INTERNAL_BPP_32, 4, false);
v3dX(job_emit_binning_flush)(job);
struct v3dv_cl *rcl = &job->rcl;
@@ -42,14 +43,29 @@ v3dX(job_emit_noop)(struct v3dv_job *job)
config.image_height_pixels = 1;
config.number_of_render_targets = 1;
config.multisample_mode_4x = false;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = 3; /* Tile size 64 */
+ config.log2_tile_height = 3; /* Tile size 64 */
+#endif
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32;
rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8;
rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.internal_bpp = V3D_INTERNAL_BPP_32;
+ rt.internal_type_and_clamping = V3D_RENDER_TARGET_TYPE_CLAMP_8;
+ rt.stride = 1; /* Unused RT */
+ }
+#endif
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
clear.z_clear_value = 1.0f;
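
The log2_tile_width/height fields above (and log2_tile_size() used later in v3dx_rcl.c) appear to encode the tile dimension as log2(size in pixels) - 3, matching the "Tile size 64" comments. A sketch of that assumed mapping, not the canonical Mesa helper:

#include <assert.h>
#include <stdint.h>

static uint32_t log2_tile_size_sketch(uint32_t size)
{
   switch (size) {
   case 8:  return 0;
   case 16: return 1;
   case 32: return 2;
   case 64: return 3;   /* the value hardcoded above for the noop job */
   default: assert(!"unsupported tile width/height"); return 0;
   }
}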
diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py
index e6383b67737..46395d79a89 100644
--- a/src/compiler/nir/nir_constant_expressions.py
+++ b/src/compiler/nir/nir_constant_expressions.py
@@ -62,6 +62,8 @@ template = """\
#include "util/softfloat.h"
#include "util/bigmath.h"
#include "util/format/format_utils.h"
+#include "util/format_r11g11b10f.h"
+#include "util/u_math.h"
#include "nir_constant_expressions.h"
/**
@@ -277,6 +279,110 @@ unpack_half_1x16(uint16_t u)
return _mesa_half_to_float(u);
}
+/* Broadcom v3d specific instructions */
+/**
+ * Packs parts of two 2x16 floating point values into an r11g11b10f
+ */
+static uint32_t v11fpack_v3d(const uint32_t src0,
+ const uint32_t src1)
+{
+ float rgb[3];
+
+ rgb[0] = unpack_half_1x16((src0 & 0xffff));
+ rgb[1] = unpack_half_1x16((src0 >> 16));
+ rgb[2] = unpack_half_1x16((src1 & 0xffff));
+
+ return float3_to_r11g11b10f(rgb);
+}
+
+/**
+ * The three helpers below are basically wrappers over pack_s/unorm_1x8/1x16,
+ * except that they receive the raw half/float bits instead of a float
+ */
+static uint8_t _mesa_half_to_snorm8(uint16_t val)
+{
+ float x = _mesa_half_to_float(val);
+
+ return pack_snorm_1x8(x);
+}
+
+static uint16_t _mesa_float_to_snorm16(uint32_t val)
+{
+ union fi aux;
+ aux.ui = val;
+ return pack_snorm_1x16(aux.f);
+}
+
+static uint16_t _mesa_float_to_unorm16(uint32_t val)
+{
+ union fi aux;
+ aux.ui = val;
+ return pack_unorm_1x16(aux.f);
+}
+
+/* FIXME: the implementation of vftounorm10hi/lo below is somewhat too
+ * verbose. There is likely a simpler way to implement
+ * it.
+ */
+static uint32_t float_pack16_v3d(uint32_t f32)
+{
+ float f = uif(f32);
+ return _mesa_float_to_half(f);
+}
+
+static uint32_t float_unpack16_v3d(uint32_t f16)
+{
+ float f = _mesa_half_to_float(f16);
+ return fui(f);
+}
+
+static uint32_t vfpack_v3d(uint32_t a, uint32_t b)
+{
+ return float_pack16_v3d(b) << 16 | float_pack16_v3d(a);
+}
+
+static uint32_t vfsat_v3d(uint32_t a)
+{
+ return vfpack_v3d(
+ fui(SATURATE(_mesa_half_to_float(a & 0xffff))),
+ fui(SATURATE(_mesa_half_to_float(a >> 16))));
+}
+
+static uint32_t fmul_v3d(uint32_t a, uint32_t b)
+{
+ float f = uif(a);
+ float g = uif(b);
+
+ float x = f * g;
+
+ return fui(x);
+}
+
+#define L(x) float_unpack16_v3d((x) & 0xffff)
+#define H(x) float_unpack16_v3d((x) >> 16)
+#define V(f,a,b) vfpack_v3d(f(L(a), L(b)), f(H(a), H(b)))
+
+static uint32_t vfmul_v3d(uint32_t a, uint32_t b)
+{
+ return V(fmul_v3d, a, b);
+}
+
+/* Convert 2x16-bit floating point to 2x10-bit unorm */
+static uint32_t vftounorm10lo(uint32_t src0)
+{
+ return vfmul_v3d(vfsat_v3d(src0), 0x03ff03ff);
+}
+
+/*
+ * Convert 2x16-bit floating point to one 2-bit and one
+ * 10-bit unorm
+ */
+static uint32_t vftounorm10hi(uint32_t src0)
+{
+ return vfmul_v3d(vfsat_v3d(src0), 0x000303ff);
+}
+
+
/* Some typed vector structures to make things like src0.y work */
typedef int8_t int1_t;
typedef uint8_t uint1_t;
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index e4d87aa6126..63aa7cfa315 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -1393,6 +1393,50 @@ for (int i = 0; i < 32; i += 8) {
}
""")
+# v3d-specific opcodes
+
+# v3d-specific (v71) instruction that packs parts of two 2x16 floating point
+# values into r11g11b10 bits, rounding to nearest even
+binop_convert("v11fpack_v3d", tuint32, tuint32, "",
+ "v11fpack_v3d(src0, src1)")
+
+# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit integer. The
+# difference from pack_32_2x16_split is that the sources are 32-bit too: it
+# receives two 32-bit integers and packs their lower halfwords as 2x16 into a
+# 32-bit result.
+binop_horiz("vpack_v3d", 1, tuint32, 1, tuint32, 1, tuint32,
+ "(src0.x & 0xffff) | (src1.x << 16)")
+
+# v3d-specific (v71) instruction that packs parts of 2 2x16 integers into r10g10b10a2
+binop_convert("v10pack_v3d", tuint32, tuint32, "",
+ "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3ff) << 30")
+
+# v3d-specific (v71) instruction that packs 2 2x16 bit integers into 4x8 bits:
+# dst[7:0] = src0[7:0]
+# dst[15:8] = src0[23:16]
+# dst[23:16] = src1[7:0]
+# dst[31:24] = src1[23:16]
+opcode("v8pack_v3d", 0, tuint32, [0, 0], [tuint32, tuint32],
+ False, "",
+ "(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | (src1 & 0x000000ff) << 16 | (src1 & 0x00ff0000) << 8")
+
+# v3d-specific (v71) instructions to convert 2x16 floating point to 2x8 bit unorm/snorm
+unop("vftounorm8_v3d", tuint32,
+ "_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 8) << 16)")
+unop("vftosnorm8_v3d", tuint32,
+ "_mesa_half_to_snorm(src0 & 0xffff, 8) | (_mesa_half_to_snorm(src0 >> 16, 8) << 16)")
+
+# v3d-specific (v71) instructions to convert 32-bit floating point to 16 bit unorm/snorm
+unop("ftounorm16_v3d", tuint32, "_mesa_float_to_unorm16(src0)")
+unop("ftosnorm16_v3d", tuint32, "_mesa_float_to_snorm16(src0)")
+
+# v3d-specific (v71) instructions to convert 2x16 bit floating points to 2x10 bit unorm
+unop("vftounorm10lo_v3d", tuint32, "vftounorm10lo(src0)")
+
+# v3d-specific (v71) instructions to convert 2x16 bit floating points to one 2-bit
+# and one 10 bit unorm
+unop("vftounorm10hi_v3d", tuint32, "vftounorm10hi(src0)")
+
# Mali-specific opcodes
unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)"))
diff --git a/src/gallium/drivers/v3d/driinfo_v3d.h b/src/gallium/drivers/v3d/driinfo_v3d.h
index 147ad0b49bd..8f989e8aa57 100644
--- a/src/gallium/drivers/v3d/driinfo_v3d.h
+++ b/src/gallium/drivers/v3d/driinfo_v3d.h
@@ -2,4 +2,6 @@
DRI_CONF_SECTION_MISCELLANEOUS
DRI_CONF_V3D_NONMSAA_TEXTURE_SIZE_LIMIT(false)
+ DRI_CONF_V3D_MAINTAIN_IGNORABLE_SCANOUT(false)
+ DRI_CONF_V3D_IS_XSERVER_PROCESS(false)
DRI_CONF_SECTION_END
diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build
index dfa1e88097b..e47682db1aa 100644
--- a/src/gallium/drivers/v3d/meson.build
+++ b/src/gallium/drivers/v3d/meson.build
@@ -34,7 +34,6 @@ files_libv3d = files(
'v3d_query.c',
'v3d_query.h',
'v3d_query_pipe.c',
- 'v3d_query_perfcnt.c',
'v3d_resource.c',
'v3d_resource.h',
'v3d_screen.c',
@@ -47,8 +46,10 @@ files_per_version = files(
'v3dx_emit.c',
'v3dx_format_table.c',
'v3dx_job.c',
+ 'v3dx_query_perfcnt.c',
'v3dx_rcl.c',
'v3dx_state.c',
+ 'v3dx_tfu.c',
)
v3d_args = ['-DV3D_BUILD_NEON']
@@ -58,7 +59,17 @@ if dep_v3dv3.found()
v3d_args += '-DUSE_V3D_SIMULATOR'
endif
-v3d_versions = ['33', '42']
+v3d_versions = ['33', '42', '71']
+
+v3d_deps = [dep_v3dv3, dep_libdrm, dep_valgrind, idep_nir_headers]
+
+if with_platform_x11
+ v3d_deps += dep_xcb
+endif
+
+if with_platform_wayland
+ v3d_deps += dep_wayland_client
+endif
per_version_libs = []
foreach ver : v3d_versions
@@ -71,7 +82,7 @@ foreach ver : v3d_versions
],
c_args : [v3d_args, '-DV3D_VERSION=' + ver],
gnu_symbol_visibility : 'hidden',
- dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind, idep_nir_headers],
+ dependencies : v3d_deps,
)
endforeach
@@ -94,10 +105,7 @@ libv3d = static_library(
c_args : [v3d_args],
cpp_args : [v3d_args],
gnu_symbol_visibility : 'hidden',
- dependencies : [
- dep_v3dv3, dep_libdrm, dep_valgrind,
- idep_nir_headers, idep_mesautil,
- ],
+ dependencies : v3d_deps + idep_mesautil,
link_with: [per_version_libs],
)
diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c
index b7dc56a044e..ee3c14b154c 100644
--- a/src/gallium/drivers/v3d/v3d_blit.c
+++ b/src/gallium/drivers/v3d/v3d_blit.c
@@ -210,140 +210,6 @@ v3d_stencil_blit(struct pipe_context *ctx, struct pipe_blit_info *info)
info->mask &= ~PIPE_MASK_S;
}
-static bool
-v3d_tfu(struct pipe_context *pctx,
- struct pipe_resource *pdst,
- struct pipe_resource *psrc,
- unsigned int src_level,
- unsigned int base_level,
- unsigned int last_level,
- unsigned int src_layer,
- unsigned int dst_layer,
- bool for_mipmap)
-{
- struct v3d_context *v3d = v3d_context(pctx);
- struct v3d_screen *screen = v3d->screen;
- struct v3d_resource *src = v3d_resource(psrc);
- struct v3d_resource *dst = v3d_resource(pdst);
- struct v3d_resource_slice *src_base_slice = &src->slices[src_level];
- struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level];
- int msaa_scale = pdst->nr_samples > 1 ? 2 : 1;
- int width = u_minify(pdst->width0, base_level) * msaa_scale;
- int height = u_minify(pdst->height0, base_level) * msaa_scale;
- enum pipe_format pformat;
-
- if (psrc->format != pdst->format)
- return false;
- if (psrc->nr_samples != pdst->nr_samples)
- return false;
-
- /* Can't write to raster. */
- if (dst_base_slice->tiling == V3D_TILING_RASTER)
- return false;
-
- /* When using TFU for blit, we are doing exact copies (both input and
- * output format must be the same, no scaling, etc), so there is no
- * pixel format conversions. Thus we can rewrite the format to use one
- * that is TFU compatible based on its texel size.
- */
- if (for_mipmap) {
- pformat = pdst->format;
- } else {
- switch (dst->cpp) {
- case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT; break;
- case 8: pformat = PIPE_FORMAT_R16G16B16A16_FLOAT; break;
- case 4: pformat = PIPE_FORMAT_R32_FLOAT; break;
- case 2: pformat = PIPE_FORMAT_R16_FLOAT; break;
- case 1: pformat = PIPE_FORMAT_R8_UNORM; break;
- default: unreachable("unsupported format bit-size"); break;
- };
- }
-
- uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat);
- struct v3d_device_info *devinfo = &screen->devinfo;
-
- if (!v3d_X(devinfo, tfu_supports_tex_format)(tex_format, for_mipmap)) {
- assert(for_mipmap);
- return false;
- }
-
- v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false);
- v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false);
-
- struct drm_v3d_submit_tfu tfu = {
- .ios = (height << 16) | width,
- .bo_handles = {
- dst->bo->handle,
- src != dst ? src->bo->handle : 0
- },
- .in_sync = v3d->out_sync,
- .out_sync = v3d->out_sync,
- };
- uint32_t src_offset = (src->bo->offset +
- v3d_layer_offset(psrc, src_level, src_layer));
- tfu.iia |= src_offset;
- if (src_base_slice->tiling == V3D_TILING_RASTER) {
- tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER <<
- V3D33_TFU_ICFG_FORMAT_SHIFT);
- } else {
- tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE +
- (src_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
- V3D33_TFU_ICFG_FORMAT_SHIFT);
- }
-
- uint32_t dst_offset = (dst->bo->offset +
- v3d_layer_offset(pdst, base_level, dst_layer));
- tfu.ioa |= dst_offset;
- if (last_level != base_level)
- tfu.ioa |= V3D33_TFU_IOA_DIMTW;
- tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE +
- (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
- V3D33_TFU_IOA_FORMAT_SHIFT);
-
- tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT;
- tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT;
-
- switch (src_base_slice->tiling) {
- case V3D_TILING_UIF_NO_XOR:
- case V3D_TILING_UIF_XOR:
- tfu.iis |= (src_base_slice->padded_height /
- (2 * v3d_utile_height(src->cpp)));
- break;
- case V3D_TILING_RASTER:
- tfu.iis |= src_base_slice->stride / src->cpp;
- break;
- case V3D_TILING_LINEARTILE:
- case V3D_TILING_UBLINEAR_1_COLUMN:
- case V3D_TILING_UBLINEAR_2_COLUMN:
- break;
- }
-
- /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
- * OPAD field for the destination (how many extra UIF blocks beyond
- * those necessary to cover the height). When filling mipmaps, the
- * miplevel 1+ tiling state is inferred.
- */
- if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR ||
- dst_base_slice->tiling == V3D_TILING_UIF_XOR) {
- int uif_block_h = 2 * v3d_utile_height(dst->cpp);
- int implicit_padded_height = align(height, uif_block_h);
-
- tfu.icfg |= (((dst_base_slice->padded_height -
- implicit_padded_height) / uif_block_h) <<
- V3D33_TFU_ICFG_OPAD_SHIFT);
- }
-
- int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu);
- if (ret != 0) {
- fprintf(stderr, "Failed to submit TFU job: %d\n", ret);
- return false;
- }
-
- dst->writes++;
-
- return true;
-}
-
bool
v3d_generate_mipmap(struct pipe_context *pctx,
struct pipe_resource *prsc,
@@ -362,12 +228,16 @@ v3d_generate_mipmap(struct pipe_context *pctx,
if (first_layer != last_layer)
return false;
- return v3d_tfu(pctx,
- prsc, prsc,
- base_level,
- base_level, last_level,
- first_layer, first_layer,
- true);
+ struct v3d_context *v3d = v3d_context(pctx);
+ struct v3d_screen *screen = v3d->screen;
+ struct v3d_device_info *devinfo = &screen->devinfo;
+
+ return v3d_X(devinfo, tfu)(pctx,
+ prsc, prsc,
+ base_level,
+ base_level, last_level,
+ first_layer, first_layer,
+ true);
}
static void
@@ -396,11 +266,15 @@ v3d_tfu_blit(struct pipe_context *pctx, struct pipe_blit_info *info)
if (info->dst.format != info->src.format)
return;
- if (v3d_tfu(pctx, info->dst.resource, info->src.resource,
- info->src.level,
- info->dst.level, info->dst.level,
- info->src.box.z, info->dst.box.z,
- false)) {
+ struct v3d_context *v3d = v3d_context(pctx);
+ struct v3d_screen *screen = v3d->screen;
+ struct v3d_device_info *devinfo = &screen->devinfo;
+
+ if (v3d_X(devinfo, tfu)(pctx, info->dst.resource, info->src.resource,
+ info->src.level,
+ info->dst.level, info->dst.level,
+ info->src.box.z, info->dst.box.z,
+ false)) {
info->mask &= ~PIPE_MASK_RGBA;
}
}
@@ -495,7 +369,7 @@ v3d_tlb_blit(struct pipe_context *pctx, struct pipe_blit_info *info)
bool double_buffer = V3D_DBG(DOUBLE_BUFFER) && !msaa;
uint32_t tile_width, tile_height, max_bpp;
- v3d_get_tile_buffer_size(msaa, double_buffer,
+ v3d_get_tile_buffer_size(devinfo, msaa, double_buffer,
is_color_blit ? 1 : 0, surfaces, src_surf,
&tile_width, &tile_height, &max_bpp);
diff --git a/src/gallium/drivers/v3d/v3d_context.c b/src/gallium/drivers/v3d/v3d_context.c
index f12e8c92139..1dc4bd017fe 100644
--- a/src/gallium/drivers/v3d/v3d_context.c
+++ b/src/gallium/drivers/v3d/v3d_context.c
@@ -220,7 +220,8 @@ v3d_flag_dirty_sampler_state(struct v3d_context *v3d,
}
void
-v3d_get_tile_buffer_size(bool is_msaa,
+v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo,
+ bool is_msaa,
bool double_buffer,
uint32_t nr_cbufs,
struct pipe_surface **cbufs,
@@ -232,11 +233,13 @@ v3d_get_tile_buffer_size(bool is_msaa,
assert(!is_msaa || !double_buffer);
uint32_t max_cbuf_idx = 0;
+ uint32_t total_bpp = 0;
*max_bpp = 0;
for (int i = 0; i < nr_cbufs; i++) {
if (cbufs[i]) {
struct v3d_surface *surf = v3d_surface(cbufs[i]);
*max_bpp = MAX2(*max_bpp, surf->internal_bpp);
+ total_bpp += 4 * v3d_internal_bpp_words(surf->internal_bpp);
max_cbuf_idx = MAX2(i, max_cbuf_idx);
}
}
@@ -245,9 +248,11 @@ v3d_get_tile_buffer_size(bool is_msaa,
struct v3d_surface *bsurf = v3d_surface(bbuf);
assert(bbuf->texture->nr_samples <= 1 || is_msaa);
*max_bpp = MAX2(*max_bpp, bsurf->internal_bpp);
+ total_bpp += 4 * v3d_internal_bpp_words(bsurf->internal_bpp);
}
- v3d_choose_tile_size(max_cbuf_idx + 1, *max_bpp,
+ v3d_choose_tile_size(devinfo, max_cbuf_idx + 1,
+ *max_bpp, total_bpp,
is_msaa, double_buffer,
tile_width, tile_height);
}
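
The total_bpp accumulation above assumes the internal_bpp enum encodes 32/64/128-bit tile-buffer formats as 0/1/2, so a render target occupies (1 << bpp) 32-bit words, i.e. 4 << bpp bytes, of tile buffer per pixel. A sketch under that assumption:

#include <stdint.h>

static uint32_t internal_bpp_words(uint32_t internal_bpp)
{
   return 1u << internal_bpp;   /* 32 bpp -> 1 word, 64 -> 2, 128 -> 4 */
}

static uint32_t total_color_bytes_per_pixel(const uint32_t *internal_bpps,
                                            uint32_t nr_cbufs)
{
   uint32_t total = 0;
   for (uint32_t i = 0; i < nr_cbufs; i++)
      total += 4 * internal_bpp_words(internal_bpps[i]);
   return total;
}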
diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h
index 97850b0363e..eb184b4b203 100644
--- a/src/gallium/drivers/v3d/v3d_context.h
+++ b/src/gallium/drivers/v3d/v3d_context.h
@@ -265,6 +265,7 @@ struct v3d_vertex_stateobj {
unsigned num_elements;
uint8_t attrs[16 * (V3D_MAX_VS_INPUTS / 4)];
+ /* defaults can be NULL for some hw generations */
struct pipe_resource *defaults;
uint32_t defaults_offset;
};
@@ -794,7 +795,8 @@ void v3d_ensure_prim_counts_allocated(struct v3d_context *ctx);
void v3d_flag_dirty_sampler_state(struct v3d_context *v3d,
enum pipe_shader_type shader);
-void v3d_get_tile_buffer_size(bool is_msaa,
+void v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo,
+ bool is_msaa,
bool double_buffer,
uint32_t nr_cbufs,
struct pipe_surface **cbufs,
@@ -818,16 +820,52 @@ void v3d_disk_cache_store(struct v3d_context *v3d,
/* Helper to call hw ver specific functions */
#define v3d_X(devinfo, thing) ({ \
- __typeof(&v3d42_##thing) v3d_X_thing; \
- if ((devinfo)->ver >= 42) \
- v3d_X_thing = &v3d42_##thing; \
- else if ((devinfo)->ver >= 33) \
+ __typeof(&v3d33_##thing) v3d_X_thing; \
+ switch (devinfo->ver) { \
+ case 33: \
+ case 40: \
v3d_X_thing = &v3d33_##thing; \
- else \
+ break; \
+ case 42: \
+ v3d_X_thing = &v3d42_##thing; \
+ break; \
+ case 71: \
+ v3d_X_thing = &v3d71_##thing; \
+ break; \
+ default: \
unreachable("Unsupported hardware generation"); \
+ } \
v3d_X_thing; \
})
+/* FIXME: the same values are defined for both Vulkan and OpenGL. Move them
+ * to a common place, perhaps the v3d_packet files?
+ */
+#define V3D33_CLIPPER_XY_GRANULARITY 256.0f
+#define V3D42_CLIPPER_XY_GRANULARITY 256.0f
+#define V3D71_CLIPPER_XY_GRANULARITY 64.0f
+
+/* Helper to get hw-specific macro values */
+#define V3DV_X(devinfo, thing) ({ \
+ __typeof(V3D33_##thing) V3D_X_THING; \
+ switch (devinfo->ver) { \
+ case 33: \
+ case 40: \
+ V3D_X_THING = V3D33_##thing; \
+ break; \
+ case 41: \
+ case 42: \
+ V3D_X_THING = V3D42_##thing; \
+ break; \
+ case 71: \
+ V3D_X_THING = V3D71_##thing; \
+ break; \
+ default: \
+ unreachable("Unsupported hardware generation"); \
+ } \
+ V3D_X_THING; \
+})
+
#ifdef v3dX
# include "v3dx_context.h"
#else
@@ -838,6 +876,10 @@ void v3d_disk_cache_store(struct v3d_context *v3d,
# define v3dX(x) v3d42_##x
# include "v3dx_context.h"
# undef v3dX
+
+# define v3dX(x) v3d71_##x
+# include "v3dx_context.h"
+# undef v3dX
#endif
#endif /* V3D_CONTEXT_H */
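
A standalone demo of the statement-expression dispatch pattern that v3d_X()/V3DV_X() use (GNU C __typeof plus ({ ... })); the v3dNN_hello functions are illustrative stand-ins for the per-version symbols generated from v3dx_context.h:

#include <assert.h>
#include <stdio.h>

struct demo_devinfo { int ver; };

static void v3d42_hello(void) { puts("4.2 path"); }
static void v3d71_hello(void) { puts("7.1 path"); }

#define demo_X(devinfo, thing) ({                         \
   __typeof(&v3d42_##thing) demo_X_thing;                 \
   switch ((devinfo)->ver) {                              \
   case 42: demo_X_thing = &v3d42_##thing; break;         \
   case 71: demo_X_thing = &v3d71_##thing; break;         \
   default: assert(!"unsupported ver"); demo_X_thing = 0; \
   }                                                      \
   demo_X_thing;                                          \
})

int main(void)
{
   struct demo_devinfo di = { .ver = 71 };
   demo_X(&di, hello)();   /* resolves to v3d71_hello at run time */
   return 0;
}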
diff --git a/src/gallium/drivers/v3d/v3d_job.c b/src/gallium/drivers/v3d/v3d_job.c
index b022ed45073..577890a06c3 100644
--- a/src/gallium/drivers/v3d/v3d_job.c
+++ b/src/gallium/drivers/v3d/v3d_job.c
@@ -383,9 +383,11 @@ v3d_get_job_for_fbo(struct v3d_context *v3d)
job->double_buffer = false;
}
- v3d_get_tile_buffer_size(job->msaa, job->double_buffer,
+ v3d_get_tile_buffer_size(&v3d->screen->devinfo,
+ job->msaa, job->double_buffer,
job->nr_cbufs, job->cbufs, job->bbuf,
- &job->tile_width, &job->tile_height,
+ &job->tile_width,
+ &job->tile_height,
&job->internal_bpp);
/* The dirty flags are tracking what's been updated while v3d->job has
diff --git a/src/gallium/drivers/v3d/v3d_query.c b/src/gallium/drivers/v3d/v3d_query.c
index db98c89625f..83f82e44a3d 100644
--- a/src/gallium/drivers/v3d/v3d_query.c
+++ b/src/gallium/drivers/v3d/v3d_query.c
@@ -28,8 +28,11 @@ v3d_get_driver_query_group_info(struct pipe_screen *pscreen, unsigned index,
struct pipe_driver_query_group_info *info)
{
struct v3d_screen *screen = v3d_screen(pscreen);
+ struct v3d_device_info *devinfo = &screen->devinfo;
- return v3d_get_driver_query_group_info_perfcnt(screen, index, info);
+ return v3d_X(devinfo, get_driver_query_group_info_perfcnt)(screen,
+ index,
+ info);
}
int
@@ -37,8 +40,11 @@ v3d_get_driver_query_info(struct pipe_screen *pscreen, unsigned index,
struct pipe_driver_query_info *info)
{
struct v3d_screen *screen = v3d_screen(pscreen);
+ struct v3d_device_info *devinfo = &screen->devinfo;
- return v3d_get_driver_query_info_perfcnt(screen, index, info);
+ return v3d_X(devinfo, get_driver_query_info_perfcnt)(screen,
+ index,
+ info);
}
static struct pipe_query *
@@ -53,9 +59,13 @@ static struct pipe_query *
v3d_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
unsigned *query_types)
{
- return v3d_create_batch_query_perfcnt(v3d_context(pctx),
- num_queries,
- query_types);
+ struct v3d_context *v3d = v3d_context(pctx);
+ struct v3d_screen *screen = v3d->screen;
+ struct v3d_device_info *devinfo = &screen->devinfo;
+
+ return v3d_X(devinfo, create_batch_query_perfcnt)(v3d_context(pctx),
+ num_queries,
+ query_types);
}
static void
diff --git a/src/gallium/drivers/v3d/v3d_query.h b/src/gallium/drivers/v3d/v3d_query.h
index 3e1426b8d86..605ed1a12f9 100644
--- a/src/gallium/drivers/v3d/v3d_query.h
+++ b/src/gallium/drivers/v3d/v3d_query.h
@@ -42,11 +42,5 @@ struct v3d_query
};
struct pipe_query *v3d_create_query_pipe(struct v3d_context *v3d, unsigned query_type, unsigned index);
-struct pipe_query *v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries,
- unsigned *query_types);
-int v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index,
- struct pipe_driver_query_group_info *info);
-int v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index,
- struct pipe_driver_query_info *info);
#endif /* V3D_QUERY_H */
diff --git a/src/gallium/drivers/v3d/v3d_resource.c b/src/gallium/drivers/v3d/v3d_resource.c
index a0a210ccad5..8e31acb0ff0 100644
--- a/src/gallium/drivers/v3d/v3d_resource.c
+++ b/src/gallium/drivers/v3d/v3d_resource.c
@@ -439,7 +439,7 @@ v3d_resource_get_handle(struct pipe_screen *pscreen,
case WINSYS_HANDLE_TYPE_SHARED:
return v3d_bo_flink(bo, &whandle->handle);
case WINSYS_HANDLE_TYPE_KMS:
- if (screen->ro) {
+ if (screen->ro && rsc->scanout) {
if (renderonly_get_handle(rsc->scanout, whandle)) {
whandle->stride = rsc->slices[0].stride;
return true;
@@ -785,6 +785,27 @@ v3d_resource_setup(struct pipe_screen *pscreen,
return rsc;
}
+static bool
+v3d_resource_should_scanout(struct pipe_screen *pscreen,
+ const struct pipe_resource *tmpl,
+ const uint64_t *modifiers,
+ int count)
+{
+ struct v3d_screen *screen = v3d_screen(pscreen);
+
+ if (tmpl->bind & PIPE_BIND_SCANOUT) {
+ if (screen->maintain_ignorable_scanout)
+ return true;
+ if (screen->has_x_session && screen->ignore_scanout_usages) {
+ if (drm_find_modifier(DRM_FORMAT_MOD_BROADCOM_UIF,
+ modifiers, count))
+ return false;
+ }
+ return true;
+ }
+ return false;
+}
+
static struct pipe_resource *
v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
const struct pipe_resource *tmpl,
@@ -798,6 +819,8 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
struct pipe_resource *prsc = &rsc->base;
/* Use a tiled layout if we can, for better 3D performance. */
bool should_tile = true;
+ bool should_scanout = v3d_resource_should_scanout(pscreen, tmpl,
+ modifiers, count);
assert(tmpl->target != PIPE_BUFFER ||
(tmpl->format == PIPE_FORMAT_NONE ||
@@ -827,7 +850,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
/* If using the old-school SCANOUT flag, we don't know what the screen
* might support other than linear. Just force linear.
*/
- if (tmpl->bind & PIPE_BIND_SCANOUT)
+ if ((tmpl->bind & PIPE_BIND_SCANOUT) && should_scanout)
should_tile = false;
/* No user-specified modifier; determine our own. */
@@ -849,7 +872,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
v3d_setup_slices(rsc, 0, tmpl->bind & PIPE_BIND_SHARED);
- if (screen->ro && (tmpl->bind & PIPE_BIND_SCANOUT)) {
+ if (screen->ro && should_scanout) {
struct winsys_handle handle;
struct pipe_resource scanout_tmpl = {
.target = prsc->target,
@@ -979,7 +1002,7 @@ v3d_resource_from_handle(struct pipe_screen *pscreen,
}
}
- if (screen->ro) {
+ if (screen->ro && !rsc->tiled) {
/* Make sure that renderonly has a handle to our buffer in the
* display's fd, so that a later renderonly_get_handle()
* returns correct handles or GEM names.
@@ -1025,7 +1048,9 @@ v3d_update_shadow_texture(struct pipe_context *pctx,
assert(view->texture != pview->texture);
- if (shadow->writes == orig->writes && orig->bo->private)
+ if (shadow->writes == orig->writes &&
+ orig->base.sync_status == 0 &&
+ (orig->bo->private || orig->base.sync_condition))
return;
perf_debug("Updating %dx%d@%d shadow for linear texture\n",
@@ -1068,6 +1093,7 @@ v3d_update_shadow_texture(struct pipe_context *pctx,
}
shadow->writes = orig->writes;
+ orig->base.sync_status = 0;
}
static struct pipe_surface *
diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c
index bce1eeafcd9..4d2478b130d 100644
--- a/src/gallium/drivers/v3d/v3d_screen.c
+++ b/src/gallium/drivers/v3d/v3d_screen.c
@@ -47,6 +47,42 @@
#include "compiler/v3d_compiler.h"
#include "drm-uapi/drm_fourcc.h"
+#ifdef HAVE_WAYLAND_PLATFORM
+#include <wayland-client.h>
+#endif
+
+#ifdef HAVE_X11_PLATFORM
+#include <xcb/xcb.h>
+#endif
+
+static bool
+check_x_session()
+{
+ bool xcb_connection = false;
+
+#ifdef HAVE_WAYLAND_PLATFORM
+ struct wl_display *display;
+
+ display = wl_display_connect(NULL);
+
+ if (display) {
+ wl_display_disconnect(display);
+ return xcb_connection;
+ }
+#endif
+
+#ifdef HAVE_X11_PLATFORM
+ xcb_connection_t *conn;
+
+ conn = xcb_connect(NULL, NULL);
+
+ if (!xcb_connection_has_error(conn))
+ xcb_connection = true;
+ xcb_disconnect(conn);
+#endif
+ return xcb_connection;
+}
+
static const char *
v3d_screen_get_name(struct pipe_screen *pscreen)
{
@@ -255,9 +291,8 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
return V3D_MAX_ARRAY_LAYERS;
- /* Render targets. */
case PIPE_CAP_MAX_RENDER_TARGETS:
- return 4;
+ return V3D_MAX_RENDER_TARGETS(screen->devinfo.ver);
case PIPE_CAP_VENDOR_ID:
return 0x14E4;
@@ -919,6 +954,12 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config,
if (!v3d_get_device_info(screen->fd, &screen->devinfo, &v3d_ioctl))
goto fail;
+ if (screen->devinfo.ver >= 71) {
+ fprintf(stderr, "WARNING: v3d support for hw version %i is neither "
+ "a complete nor a conformant OpenGL implementation. Testing "
+ "use only.\n", screen->devinfo.ver);
+ }
+
driParseConfigFiles(config->options, config->options_info, 0, "v3d",
NULL, NULL, NULL, 0, NULL, 0);
@@ -937,6 +978,29 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config,
v3d_has_feature(screen, DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH);
screen->has_perfmon = v3d_has_feature(screen, DRM_V3D_PARAM_SUPPORTS_PERFMON);
+ screen->ignore_scanout_usages = getenv("V3D_IGNORE_SCANOUT_USAGES");
+
+ const char *is_xserver_process =
+ "v3d_is_xserver_process";
+ screen->is_xserver_process =
+ driCheckOption(config->options,
+ is_xserver_process,
+ DRI_BOOL) &&
+ driQueryOptionb(config->options,
+ is_xserver_process);
+
+ const char *maintain_ignorable_scanout_name =
+ "v3d_maintain_ignorable_scanout";
+ screen->maintain_ignorable_scanout =
+ driCheckOption(config->options,
+ maintain_ignorable_scanout_name,
+ DRI_BOOL) &&
+ driQueryOptionb(config->options,
+ maintain_ignorable_scanout_name);
+
+ screen->has_x_session = !screen->is_xserver_process &&
+ check_x_session();
+
v3d_fence_init(screen);
v3d_process_debug_variable();
diff --git a/src/gallium/drivers/v3d/v3d_screen.h b/src/gallium/drivers/v3d/v3d_screen.h
index 1da9b83c965..c0f22707075 100644
--- a/src/gallium/drivers/v3d/v3d_screen.h
+++ b/src/gallium/drivers/v3d/v3d_screen.h
@@ -83,6 +83,12 @@ struct v3d_screen {
bool has_cache_flush;
bool has_perfmon;
bool nonmsaa_texture_size_limit;
+ bool ignore_scanout_usages;
+ bool is_xserver_process;
+ bool maintain_ignorable_scanout;
+
+ /* Are we running in an X session? */
+ bool has_x_session;
struct v3d_simulator_file *sim_file;
diff --git a/src/gallium/drivers/v3d/v3d_uniforms.c b/src/gallium/drivers/v3d/v3d_uniforms.c
index 95eb838954f..1b8758bae7d 100644
--- a/src/gallium/drivers/v3d/v3d_uniforms.c
+++ b/src/gallium/drivers/v3d/v3d_uniforms.c
@@ -261,6 +261,7 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job,
struct v3d_compiled_shader *shader,
enum pipe_shader_type stage)
{
+ struct v3d_device_info *devinfo = &v3d->screen->devinfo;
struct v3d_constbuf_stateobj *cb = &v3d->constbuf[stage];
struct v3d_texture_stateobj *texstate = &v3d->tex[stage];
struct v3d_uniform_list *uinfo = &shader->prog_data.base->uniforms;
@@ -282,6 +283,9 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job,
struct v3d_cl_out *uniforms =
cl_start(&job->indirect);
+ float clipper_xy_granularity =
+ V3DV_X(devinfo, CLIPPER_XY_GRANULARITY);
+
for (int i = 0; i < uinfo->count; i++) {
uint32_t data = uinfo->data[i];
@@ -293,10 +297,10 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job,
cl_aligned_u32(&uniforms, gallium_uniforms[data]);
break;
case QUNIFORM_VIEWPORT_X_SCALE:
- cl_aligned_f(&uniforms, v3d->viewport.scale[0] * 256.0f);
+ cl_aligned_f(&uniforms, v3d->viewport.scale[0] * clipper_xy_granularity);
break;
case QUNIFORM_VIEWPORT_Y_SCALE:
- cl_aligned_f(&uniforms, v3d->viewport.scale[1] * 256.0f);
+ cl_aligned_f(&uniforms, v3d->viewport.scale[1] * clipper_xy_granularity);
break;
case QUNIFORM_VIEWPORT_Z_OFFSET:
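
A numeric illustration of the granularity switch above: gallium's viewport.scale[0] is half the viewport width in pixels, and the uniform packs it in 1/256th-of-a-pixel units on V3D 4.x but 1/64th units on 7.1:

#include <stdio.h>

int main(void)
{
   float half_width = 1920.0f / 2.0f;           /* viewport.scale[0] */
   printf("v42: %.0f\n", half_width * 256.0f);  /* 245760 */
   printf("v71: %.0f\n", half_width * 64.0f);   /* 61440  */
   return 0;
}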
diff --git a/src/gallium/drivers/v3d/v3dx_context.h b/src/gallium/drivers/v3d/v3dx_context.h
index 03d7c244ea2..c487ac3b996 100644
--- a/src/gallium/drivers/v3d/v3dx_context.h
+++ b/src/gallium/drivers/v3d/v3dx_context.h
@@ -51,3 +51,23 @@ void v3dX(get_internal_type_bpp_for_output_format)(uint32_t format,
*/
bool v3dX(tfu_supports_tex_format)(uint32_t tex_format,
bool for_mipmap);
+
+bool v3dX(tfu)(struct pipe_context *pctx,
+ struct pipe_resource *pdst,
+ struct pipe_resource *psrc,
+ unsigned int src_level,
+ unsigned int base_level,
+ unsigned int last_level,
+ unsigned int src_layer,
+ unsigned int dst_layer,
+ bool for_mipmap);
+
+int v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen,
+ unsigned index,
+ struct pipe_driver_query_group_info *info);
+int v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen,
+ unsigned index,
+ struct pipe_driver_query_info *info);
+struct pipe_query *v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d,
+ unsigned num_queries,
+ unsigned *query_types);
diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
index 0640dab1884..85083035ea6 100644
--- a/src/gallium/drivers/v3d/v3dx_draw.c
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -95,7 +95,25 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job)
#endif
assert(!job->msaa || !job->double_buffer);
-#if V3D_VERSION >= 40
+#if V3D_VERSION >= 71
+ cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
+ config.width_in_pixels = job->draw_width;
+ config.height_in_pixels = job->draw_height;
+
+ config.log2_tile_width = log2_tile_size(job->tile_width);
+ config.log2_tile_height = log2_tile_size(job->tile_height);
+
+ /* FIXME: ideally we would like this assert on the packet header (as it
+ * is generic, so it also applies to GL). We would need to expand
+ * gen_pack_header for that.
+ */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+ }
+
+#endif
+
+#if V3D_VERSION >= 40 && V3D_VERSION <= 42
cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
config.width_in_pixels = job->draw_width;
config.height_in_pixels = job->draw_height;
@@ -107,7 +125,8 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job)
config.maximum_bpp_of_all_render_targets = job->internal_bpp;
}
-#else /* V3D_VERSION < 40 */
+#endif
+#if V3D_VERSION < 40
/* "Binning mode lists start with a Tile Binning Mode Configuration
* item (120)"
*
@@ -134,7 +153,7 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job)
config.maximum_bpp_of_all_render_targets = job->internal_bpp;
}
-#endif /* V3D_VERSION < 40 */
+#endif
/* There's definitely nothing in the VCD cache we want. */
cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
@@ -377,7 +396,9 @@ v3d_emit_gs_state_record(struct v3d_job *job,
gs_bin->prog_data.gs->base.threads == 4;
shader.geometry_bin_mode_shader_start_in_final_thread_section =
gs_bin->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
shader.geometry_bin_mode_shader_propagate_nans = true;
+#endif
shader.geometry_bin_mode_shader_uniforms_address =
gs_bin_uniforms;
@@ -387,7 +408,9 @@ v3d_emit_gs_state_record(struct v3d_job *job,
gs->prog_data.gs->base.threads == 4;
shader.geometry_render_mode_shader_start_in_final_thread_section =
gs->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
shader.geometry_render_mode_shader_propagate_nans = true;
+#endif
shader.geometry_render_mode_shader_uniforms_address =
gs_render_uniforms;
}
@@ -638,10 +661,6 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
shader.number_of_varyings_in_fragment_shader =
v3d->prog.fs->prog_data.fs->num_inputs;
- shader.coordinate_shader_propagate_nans = true;
- shader.vertex_shader_propagate_nans = true;
- shader.fragment_shader_propagate_nans = true;
-
shader.coordinate_shader_code_address =
cl_address(v3d_resource(v3d->prog.cs->resource)->bo,
v3d->prog.cs->offset);
@@ -652,6 +671,11 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
cl_address(v3d_resource(v3d->prog.fs->resource)->bo,
v3d->prog.fs->offset);
+#if V3D_VERSION <= 42
+ shader.coordinate_shader_propagate_nans = true;
+ shader.vertex_shader_propagate_nans = true;
+ shader.fragment_shader_propagate_nans = true;
+
/* XXX: Use combined input/output size flag in the common
* case.
*/
@@ -659,13 +683,24 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
v3d->prog.cs->prog_data.vs->separate_segments;
shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
v3d->prog.vs->prog_data.vs->separate_segments;
-
shader.coordinate_shader_input_vpm_segment_size =
v3d->prog.cs->prog_data.vs->separate_segments ?
v3d->prog.cs->prog_data.vs->vpm_input_size : 1;
shader.vertex_shader_input_vpm_segment_size =
v3d->prog.vs->prog_data.vs->separate_segments ?
v3d->prog.vs->prog_data.vs->vpm_input_size : 1;
+#endif
+ /* On V3D 7.1 there isn't a specific flag to set whether we are using
+ * shared or separate segments. We just set the value of
+ * vpm_input_size to 0 and the output to the max needed; that should
+ * already be properly set on prog_data_vs_bin.
+ */
+#if V3D_VERSION == 71
+ shader.coordinate_shader_input_vpm_segment_size =
+ v3d->prog.cs->prog_data.vs->vpm_input_size;
+ shader.vertex_shader_input_vpm_segment_size =
+ v3d->prog.vs->prog_data.vs->vpm_input_size;
+#endif
shader.coordinate_shader_output_vpm_segment_size =
v3d->prog.cs->prog_data.vs->vpm_output_size;
@@ -724,9 +759,11 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
shader.instance_id_read_by_vertex_shader =
v3d->prog.vs->prog_data.vs->uses_iid;
+#if V3D_VERSION <= 42
shader.address_of_default_attribute_values =
cl_address(v3d_resource(vtx->defaults)->bo,
vtx->defaults_offset);
+#endif
}
bool cs_loaded_any = false;
@@ -1436,8 +1473,15 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info)
submit.cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
- /* Number of batches the dispatch will invoke (minus 1). */
- submit.cfg[4] = num_batches - 1;
+ /* Number of batches the dispatch will invoke.
+ * V3D 7.1.6 and later don't subtract 1 from the number of batches
+ */
+ if (v3d->screen->devinfo.ver < 71 ||
+ (v3d->screen->devinfo.ver == 71 && v3d->screen->devinfo.rev < 6)) {
+ submit.cfg[4] = num_batches - 1;
+ } else {
+ submit.cfg[4] = num_batches;
+ }
/* Make sure we didn't accidentally underflow. */
assert(submit.cfg[4] != ~0);
@@ -1445,7 +1489,8 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info)
v3d_job_add_bo(job, v3d_resource(v3d->prog.compute->resource)->bo);
submit.cfg[5] = (v3d_resource(v3d->prog.compute->resource)->bo->offset +
v3d->prog.compute->offset);
- submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
+ if (v3d->screen->devinfo.ver < 71)
+ submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
if (v3d->prog.compute->prog_data.base->single_seg)
submit.cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
if (v3d->prog.compute->prog_data.base->threads == 4)
@@ -1560,9 +1605,10 @@ v3d_tlb_clear(struct v3d_job *job, unsigned buffers,
/* GFXH-1461: If we were to emit a load of just depth or just stencil,
* then the clear for the other may get lost. We need to decide now
* if it would be possible to need to emit a load of just one after
- * we've set up our TLB clears.
+ * we've set up our TLB clears. This issue has been fixed since V3D 4.3.18.
*/
- if (buffers & PIPE_CLEAR_DEPTHSTENCIL &&
+ if (v3d->screen->devinfo.ver <= 42 &&
+ buffers & PIPE_CLEAR_DEPTHSTENCIL &&
(buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL &&
job->zsbuf &&
util_format_is_depth_and_stencil(job->zsbuf->texture->format)) {
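
The CSD batch-count change above can be read as a small helper: hardware before V3D 7.1.6 expects the CFG4 field as batches - 1, while 7.1.6 and later take the raw count. A sketch with a few spot checks:

#include <assert.h>
#include <stdint.h>

static uint32_t csd_cfg4_batches(int ver, int rev, uint32_t num_batches)
{
   if (ver < 71 || (ver == 71 && rev < 6))
      return num_batches - 1;
   return num_batches;
}

int main(void)
{
   assert(csd_cfg4_batches(42, 0, 16) == 15);
   assert(csd_cfg4_batches(71, 5, 16) == 15);
   assert(csd_cfg4_batches(71, 6, 16) == 16);
   return 0;
}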
diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c
index 0ad3fb68b1e..82a45e44f82 100644
--- a/src/gallium/drivers/v3d/v3dx_emit.c
+++ b/src/gallium/drivers/v3d/v3dx_emit.c
@@ -512,13 +512,17 @@ v3dX(emit_state)(struct pipe_context *pctx)
/* Note: EZ state may update based on the compiled FS,
* along with ZSA
*/
+#if V3D_VERSION <= 42
config.early_z_updates_enable =
(job->ez_state != V3D_EZ_DISABLED);
+#endif
if (v3d->zsa->base.depth_enabled) {
config.z_updates_enable =
v3d->zsa->base.depth_writemask;
+#if V3D_VERSION <= 42
config.early_z_enable =
config.early_z_updates_enable;
+#endif
config.depth_test_function =
v3d->zsa->base.depth_func;
} else {
@@ -535,13 +539,27 @@ v3dX(emit_state)(struct pipe_context *pctx)
v3d_line_smoothing_enabled(v3d) ?
V3D_LINE_RASTERIZATION_PERP_END_CAPS :
V3D_LINE_RASTERIZATION_DIAMOND_EXIT;
- }
+#if V3D_VERSION >= 71
+ /* This mirrors the logic implemented in v3dv, plus
+ * the definition of depth_clip_near/far and
+ * depth_clamp.
+ *
+ * Note: some extensions that would affect this (like
+ * ARB_depth_clamp) are not supported by v3d, but the
+ * rasterizer values already take that into
+ * account.
+ */
+ config.z_clipping_mode = v3d->rasterizer->base.depth_clip_near ||
+ v3d->rasterizer->base.depth_clip_far;
+#endif
+ }
}
if (v3d->dirty & V3D_DIRTY_RASTERIZER &&
v3d->rasterizer->base.offset_tri) {
- if (job->zsbuf &&
+ if (v3d->screen->devinfo.ver <= 42 &&
+ job->zsbuf &&
job->zsbuf->format == PIPE_FORMAT_Z16_UNORM) {
cl_emit_prepacked_sized(&job->bcl,
v3d->rasterizer->depth_offset_z16,
@@ -564,12 +582,23 @@ v3dX(emit_state)(struct pipe_context *pctx)
}
if (v3d->dirty & V3D_DIRTY_VIEWPORT) {
+#if V3D_VERSION <= 42
cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
clip.viewport_half_width_in_1_256th_of_pixel =
v3d->viewport.scale[0] * 256.0f;
clip.viewport_half_height_in_1_256th_of_pixel =
v3d->viewport.scale[1] * 256.0f;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+ clip.viewport_half_width_in_1_64th_of_pixel =
+ v3d->viewport.scale[0] * 64.0f;
+ clip.viewport_half_height_in_1_64th_of_pixel =
+ v3d->viewport.scale[1] * 64.0f;
+ }
+#endif
+
cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
clip.viewport_z_offset_zc_to_zs =
@@ -633,8 +662,10 @@ v3dX(emit_state)(struct pipe_context *pctx)
}
#endif
+ const uint32_t max_rts =
+ V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver);
if (blend->base.independent_blend_enable) {
- for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
+ for (int i = 0; i < max_rts; i++)
emit_rt_blend(v3d, job, &blend->base, i,
(1 << i),
v3d->blend_dst_alpha_one & (1 << i));
@@ -650,16 +681,16 @@ v3dX(emit_state)(struct pipe_context *pctx)
* RTs without.
*/
emit_rt_blend(v3d, job, &blend->base, 0,
- ((1 << V3D_MAX_DRAW_BUFFERS) - 1) &
+ ((1 << max_rts) - 1) &
v3d->blend_dst_alpha_one,
true);
emit_rt_blend(v3d, job, &blend->base, 0,
- ((1 << V3D_MAX_DRAW_BUFFERS) - 1) &
+ ((1 << max_rts) - 1) &
~v3d->blend_dst_alpha_one,
false);
} else {
emit_rt_blend(v3d, job, &blend->base, 0,
- (1 << V3D_MAX_DRAW_BUFFERS) - 1,
+ (1 << max_rts) - 1,
v3d->blend_dst_alpha_one);
}
}
@@ -668,8 +699,10 @@ v3dX(emit_state)(struct pipe_context *pctx)
if (v3d->dirty & V3D_DIRTY_BLEND) {
struct pipe_blend_state *blend = &v3d->blend->base;
+ const uint32_t max_rts =
+ V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver);
cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
- for (int i = 0; i < 4; i++) {
+ for (int i = 0; i < max_rts; i++) {
int rt = blend->independent_blend_enable ? i : 0;
int rt_mask = blend->rt[rt].colormask;
diff --git a/src/gallium/drivers/v3d/v3d_query_perfcnt.c b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c
similarity index 94%
rename from src/gallium/drivers/v3d/v3d_query_perfcnt.c
rename to src/gallium/drivers/v3d/v3dx_query_perfcnt.c
index e00d84e375f..431aad14b4f 100644
--- a/src/gallium/drivers/v3d/v3d_query_perfcnt.c
+++ b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c
@@ -52,8 +52,8 @@ kperfmon_destroy(struct v3d_context *v3d, struct v3d_perfmon_state *perfmon)
}
int
-v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index,
- struct pipe_driver_query_group_info *info)
+v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen, unsigned index,
+ struct pipe_driver_query_group_info *info)
{
if (!screen->has_perfmon)
return 0;
@@ -72,8 +72,8 @@ v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned inde
}
int
-v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index,
- struct pipe_driver_query_info *info)
+v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen, unsigned index,
+ struct pipe_driver_query_info *info)
{
if (!screen->has_perfmon)
return 0;
@@ -222,8 +222,8 @@ static const struct v3d_query_funcs perfcnt_query_funcs = {
};
struct pipe_query *
-v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries,
- unsigned *query_types)
+v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d, unsigned num_queries,
+ unsigned *query_types)
{
struct v3d_query_perfcnt *pquery = NULL;
struct v3d_query *query;
diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c
index 82547437c25..d3fbc9aff5d 100644
--- a/src/gallium/drivers/v3d/v3dx_rcl.c
+++ b/src/gallium/drivers/v3d/v3dx_rcl.c
@@ -23,8 +23,9 @@
#include "util/format/u_format.h"
#include "v3d_context.h"
-#include "broadcom/common/v3d_tiling.h"
#include "broadcom/common/v3d_macros.h"
+#include "broadcom/common/v3d_tiling.h"
+#include "broadcom/common/v3d_util.h"
#include "broadcom/cle/v3dx_pack.h"
#define PIPE_CLEAR_COLOR_BUFFERS (PIPE_CLEAR_COLOR0 | \
@@ -419,10 +420,16 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer)
* clearing Z/S.
*/
if (job->clear) {
+#if V3D_VERSION <= 42
cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = !job->early_zs_clear;
clear.clear_all_render_targets = true;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
+#endif
+
}
#endif /* V3D_VERSION >= 40 */
}
@@ -483,10 +490,64 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer)
}
}
-#if V3D_VERSION >= 40
+#if V3D_VERSION > 33
+/* Note that for v71, render target cfg packets have just one field that
+ * combines the internal type and clamp mode. For simplicity we keep just
+ * one helper.
+ *
+ * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
+ *
+ */
+static uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ enum pipe_format format)
+{
+#if V3D_VERSION == 42
+ if (util_format_is_pure_integer(format)) {
+ return V3D_RENDER_TARGET_CLAMP_INT;
+ } else if (util_format_is_srgb(format)) {
+ return V3D_RENDER_TARGET_CLAMP_NORM;
+ } else {
+ return V3D_RENDER_TARGET_CLAMP_NONE;
+ }
+#endif
+#if V3D_VERSION >= 71
+ switch (rt_type) {
+ case V3D_INTERNAL_TYPE_8I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
+ case V3D_INTERNAL_TYPE_8UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_8:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8;
+ case V3D_INTERNAL_TYPE_16I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
+ case V3D_INTERNAL_TYPE_16UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_16F:
+ return util_format_is_srgb(format) ?
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F;
+ case V3D_INTERNAL_TYPE_32I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
+ case V3D_INTERNAL_TYPE_32UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_32F:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
+ default:
+ unreachable("Unknown internal render target type");
+ }
+ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
+#endif
+ return 0;
+}
+#endif
+
+#if V3D_VERSION >= 71
static void
-v3d_setup_render_target(struct v3d_job *job, int cbuf,
- uint32_t *rt_bpp, uint32_t *rt_type, uint32_t *rt_clamp)
+v3d_setup_render_target(struct v3d_job *job,
+ int cbuf,
+ uint32_t *rt_bpp,
+ uint32_t *rt_type_clamp)
{
if (!job->cbufs[cbuf])
return;
@@ -497,19 +558,35 @@ v3d_setup_render_target(struct v3d_job *job, int cbuf,
struct v3d_surface *bsurf = v3d_surface(job->bbuf);
*rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp);
}
- *rt_type = surf->internal_type;
- if (util_format_is_srgb(surf->base.format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
-#if V3D_VERSION >= 42
- else if (util_format_is_pure_integer(surf->base.format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
-#endif
- else
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
+ *rt_type_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type,
+ surf->base.format);
}
+#endif
-#else /* V3D_VERSION < 40 */
+#if V3D_VERSION >= 40 && V3D_VERSION <= 42
+static void
+v3d_setup_render_target(struct v3d_job *job,
+ int cbuf,
+ uint32_t *rt_bpp,
+ uint32_t *rt_type,
+ uint32_t *rt_clamp)
+{
+ if (!job->cbufs[cbuf])
+ return;
+
+ struct v3d_surface *surf = v3d_surface(job->cbufs[cbuf]);
+ *rt_bpp = surf->internal_bpp;
+ if (job->bbuf) {
+ struct v3d_surface *bsurf = v3d_surface(job->bbuf);
+ *rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp);
+ }
+ *rt_type = surf->internal_type;
+ *rt_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type,
+ surf->base.format);
+}
+#endif
+#if V3D_VERSION < 40
static void
v3d_emit_z_stencil_config(struct v3d_job *job, struct v3d_surface *surf,
struct v3d_resource *rsc, bool is_separate_stencil)
@@ -656,7 +733,8 @@ emit_render_layer(struct v3d_job *job, uint32_t layer)
cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = NONE;
}
-#else
+#endif
+#if V3D_VERSION >= 40
for (int i = 0; i < 2; i++) {
if (i > 0)
cl_emit(&job->rcl, TILE_COORDINATES, coords);
@@ -664,16 +742,20 @@ emit_render_layer(struct v3d_job *job, uint32_t layer)
cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = NONE;
}
+
if (i == 0 || do_double_initial_tile_clear(job)) {
+#if V3D_VERSION < 71
cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = !job->early_zs_clear;
clear.clear_all_render_targets = true;
}
+#else
+ cl_emit(&job->rcl, CLEAR_RENDER_TARGETS, clear);
+#endif
}
cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
}
#endif
-
cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
v3d_rcl_emit_generic_per_tile_list(job, layer);
@@ -775,18 +857,52 @@ v3dX(emit_rcl)(struct v3d_job *job)
config.multisample_mode_4x = job->msaa;
config.double_buffer_in_non_ms_mode = job->double_buffer;
+#if V3D_VERSION <= 42
config.maximum_bpp_of_all_render_targets = job->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(job->tile_width);
+ config.log2_tile_height = log2_tile_size(job->tile_height);
+
+ /* FIXME: ideally we would like this assert on the packet header (as it
+ * is generic, so it also applies to GL). We would need to expand
+ * gen_pack_header for that.
+ */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
+
}
+#if V3D_VERSION >= 71
+ uint32_t base_addr = 0;
+
+ /* If we don't have any color RTs, we still need to emit one and flag
+ * it as unused using stride = 1
+ */
+ if (job->nr_cbufs == 0) {
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.stride = 1; /* Unused */
+ }
+ }
+#endif
for (int i = 0; i < job->nr_cbufs; i++) {
struct pipe_surface *psurf = job->cbufs[i];
- if (!psurf)
+ if (!psurf) {
+#if V3D_VERSION >= 71
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.render_target_number = i;
+ rt.stride = 1; /* Unused */
+ }
+#endif
continue;
+ }
+
struct v3d_surface *surf = v3d_surface(psurf);
struct v3d_resource *rsc = v3d_resource(psurf->texture);
UNUSED uint32_t config_pad = 0;
- uint32_t clear_pad = 0;
+ UNUSED uint32_t clear_pad = 0;
/* XXX: Set the pad for raster. */
if (surf->tiling == V3D_TILING_UIF_NO_XOR ||
@@ -819,6 +935,7 @@ v3dX(emit_rcl)(struct v3d_job *job)
}
#endif /* V3D_VERSION < 40 */
+#if V3D_VERSION <= 42
cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1,
clear) {
clear.clear_color_low_32_bits = job->clear_color[i][0];
@@ -847,9 +964,42 @@ v3dX(emit_rcl)(struct v3d_job *job)
clear.render_target_number = i;
};
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.clear_color_low_bits = job->clear_color[i][0];
+ v3d_setup_render_target(job, i, &rt.internal_bpp,
+ &rt.internal_type_and_clamping);
+ rt.stride =
+ v3d_compute_rt_row_row_stride_128_bits(job->tile_width,
+ v3d_internal_bpp_words(rt.internal_bpp));
+ rt.base_address = base_addr;
+ rt.render_target_number = i;
+
+ base_addr += (job->tile_height * rt.stride) / 8;
+ }
+
+ if (surf->internal_bpp >= V3D_INTERNAL_BPP_64) {
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+ ((uint64_t) job->clear_color[i][1]) |
+ (((uint64_t) (job->clear_color[i][2] & 0xff)) << 32);
+ rt.render_target_number = i;
+ }
+ }
+
+ if (surf->internal_bpp >= V3D_INTERNAL_BPP_128) {
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+ rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+ (((uint64_t) (job->clear_color[i][2] & 0xffffff00)) >> 8) |
+ (((uint64_t) (job->clear_color[i][3])) << 24);
+ rt.render_target_number = i;
+ }
+ }
+#endif
}
-#if V3D_VERSION >= 40
+#if V3D_VERSION >= 40 && V3D_VERSION <= 42
cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
v3d_setup_render_target(job, 0,
&rt.render_target_0_internal_bpp,
diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
index 0f1735fee66..a7fad572a2d 100644
--- a/src/gallium/drivers/v3d/v3dx_state.c
+++ b/src/gallium/drivers/v3d/v3dx_state.c
@@ -111,9 +111,10 @@ v3d_create_rasterizer_state(struct pipe_context *pctx,
#endif
}
- /* The HW treats polygon offset units based on a Z24 buffer, so we
+ /* V3D 4.x treats polygon offset units based on a Z24 buffer, so we
* need to scale up offset_units if we're only Z16.
*/
+#if V3D_VERSION <= 42
v3dx_pack(&so->depth_offset_z16, DEPTH_OFFSET, depth) {
depth.depth_offset_factor = cso->offset_scale;
depth.depth_offset_units = cso->offset_units * 256.0;
@@ -121,6 +122,7 @@ v3d_create_rasterizer_state(struct pipe_context *pctx,
depth.limit = cso->offset_clamp;
#endif
}
+#endif
return so;
}
@@ -138,8 +140,9 @@ v3d_create_blend_state(struct pipe_context *pctx,
so->base = *cso;
+ uint32_t max_rts = V3D_MAX_RENDER_TARGETS(V3D_VERSION);
if (cso->independent_blend_enable) {
- for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
+ for (int i = 0; i < max_rts; i++) {
so->blend_enables |= cso->rt[i].blend_enable << i;
/* V3D 4.x is when we got independent blend enables. */
@@ -148,7 +151,7 @@ v3d_create_blend_state(struct pipe_context *pctx,
}
} else {
if (cso->rt[0].blend_enable)
- so->blend_enables = (1 << V3D_MAX_DRAW_BUFFERS) - 1;
+ so->blend_enables = (1 << max_rts) - 1;
}
return so;
@@ -337,6 +340,20 @@ v3d_zsa_state_bind(struct pipe_context *pctx, void *hwcso)
v3d->dirty |= V3D_DIRTY_ZSA;
}
+
+static bool
+needs_default_attribute_values(void)
+{
+#if V3D_VERSION <= 42
+ /* FIXME: on Vulkan we are able to refine even further, as we know in
+ * advance when we create the pipeline whether we have an integer vertex
+ * attrib. It remains to be checked if we could do something similar here.
+ */
+ return true;
+#endif
+ return false;
+}
+
static void *
v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements,
const struct pipe_vertex_element *elements)
@@ -414,24 +431,29 @@ v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements,
}
}
- /* Set up the default attribute values in case any of the vertex
- * elements use them.
- */
- uint32_t *attrs;
- u_upload_alloc(v3d->state_uploader, 0,
- V3D_MAX_VS_INPUTS * sizeof(float), 16,
- &so->defaults_offset, &so->defaults, (void **)&attrs);
-
- for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) {
- attrs[i * 4 + 0] = 0;
- attrs[i * 4 + 1] = 0;
- attrs[i * 4 + 2] = 0;
- if (i < so->num_elements &&
- util_format_is_pure_integer(so->pipe[i].src_format)) {
- attrs[i * 4 + 3] = 1;
- } else {
- attrs[i * 4 + 3] = fui(1.0);
+ if (needs_default_attribute_values()) {
+ /* Set up the default attribute values in case any of the vertex
+ * elements use them.
+ */
+ uint32_t *attrs;
+ u_upload_alloc(v3d->state_uploader, 0,
+ V3D_MAX_VS_INPUTS * sizeof(float), 16,
+ &so->defaults_offset, &so->defaults, (void **)&attrs);
+
+ for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) {
+ attrs[i * 4 + 0] = 0;
+ attrs[i * 4 + 1] = 0;
+ attrs[i * 4 + 2] = 0;
+ if (i < so->num_elements &&
+ util_format_is_pure_integer(so->pipe[i].src_format)) {
+ attrs[i * 4 + 3] = 1;
+ } else {
+ attrs[i * 4 + 3] = fui(1.0);
+ }
}
+ } else {
+ so->defaults = NULL;
+ so->defaults_offset = 0;
}
u_upload_unmap(v3d->state_uploader);
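
A condensed sketch of the default-value layout the loop above writes: one vec4 per input, (0, 0, 0, 1), with the w component encoded per the element format. fui() is the util/u_math.h helper the hunk itself uses; the function name here is illustrative.

#include <stdbool.h>
#include <stdint.h>
#include "util/u_math.h" /* fui() */

/* Sketch: each vertex input defaults to (0, 0, 0, 1); w is a raw integer
 * 1 for pure-integer formats and the bit pattern of 1.0f otherwise. */
static void
write_default_attr(uint32_t attr[4], bool is_pure_integer)
{
        attr[0] = attr[1] = attr[2] = 0;
        attr[3] = is_pure_integer ? 1 : fui(1.0f);
}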
@@ -699,21 +721,22 @@ v3d_upload_sampler_state_variant(void *map,
break;
}
- if (variant >= V3D_SAMPLER_STATE_32) {
- sampler.border_color_word_0 = border.ui[0];
- sampler.border_color_word_1 = border.ui[1];
- sampler.border_color_word_2 = border.ui[2];
- sampler.border_color_word_3 = border.ui[3];
- } else {
- sampler.border_color_word_0 =
- _mesa_float_to_half(border.f[0]);
- sampler.border_color_word_1 =
- _mesa_float_to_half(border.f[1]);
- sampler.border_color_word_2 =
- _mesa_float_to_half(border.f[2]);
- sampler.border_color_word_3 =
- _mesa_float_to_half(border.f[3]);
+#if V3D_VERSION <= 42
+ /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions
+ * for us. In V3D 4.x we need to manually convert floating point color
+ * values to the expected format.
+ */
+ if (variant < V3D_SAMPLER_STATE_32) {
+ border.ui[0] = _mesa_float_to_half(border.f[0]);
+ border.ui[1] = _mesa_float_to_half(border.f[1]);
+ border.ui[2] = _mesa_float_to_half(border.f[2]);
+ border.ui[3] = _mesa_float_to_half(border.f[3]);
}
+#endif
+ sampler.border_color_word_0 = border.ui[0];
+ sampler.border_color_word_1 = border.ui[1];
+ sampler.border_color_word_2 = border.ui[2];
+ sampler.border_color_word_3 = border.ui[3];
}
}
}
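
A self-contained sketch of the pre-7.x conversion above, assuming only util/half_float.h's _mesa_float_to_half(); on 7.x the raw 32-bit floats are handed to the TMU unchanged.

#include <stdint.h>
#include "util/half_float.h" /* _mesa_float_to_half() */

/* Sketch: for non-32-bit sampler variants on V3D <= 4.2 each border
 * channel is stored as an f16 in the low bits of its 32-bit word. */
static void
pack_border_colors_f16(uint32_t words[4], const float rgba[4])
{
        for (unsigned i = 0; i < 4; i++)
                words[i] = _mesa_float_to_half(rgba[i]);
}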
@@ -869,7 +892,8 @@ v3d_setup_texture_shader_state_from_buffer(struct V3DX(TEXTURE_SHADER_STATE) *te
}
static void
-v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex,
+v3d_setup_texture_shader_state(const struct v3d_device_info *devinfo,
+ struct V3DX(TEXTURE_SHADER_STATE) *tex,
struct pipe_resource *prsc,
int base_level, int last_level,
int first_layer, int last_layer,
@@ -917,19 +941,29 @@ v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex,
}
tex->base_level = base_level;
+
#if V3D_VERSION >= 40
tex->max_level = last_level;
/* Note that we don't have a job to reference the texture's sBO
* at state create time, so any time this sampler view is used
* we need to add the texture to the job.
*/
- tex->texture_base_pointer =
- cl_address(NULL,
- rsc->bo->offset +
- v3d_layer_offset(prsc, 0, first_layer));
+ const uint32_t base_offset = rsc->bo->offset +
+ v3d_layer_offset(prsc, 0, first_layer);
+
+ tex->texture_base_pointer = cl_address(NULL, base_offset);
#endif
+
tex->array_stride_64_byte_aligned = rsc->cube_map_stride / 64;
+#if V3D_VERSION >= 71
+ tex->chroma_offset_x = 1;
+ tex->chroma_offset_y = 1;
+ /* See the comment in the XML field definition for the rationale of the shifts */
+ tex->texture_base_pointer_cb = base_offset >> 6;
+ tex->texture_base_pointer_cr = base_offset >> 6;
+#endif
+
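
A note on the >> 6 above, as a sketch; the 64-byte granularity is inferred from the shift and the surrounding 64-byte-aligned strides rather than quoted from the XML, so treat it as an assumption.

/* Sketch (inferred, not documented here): the 7.x Cb/Cr base pointers
 * appear to be stored in 64-byte units, so a byte address is converted
 * with a right shift by 6. */
static uint32_t
chroma_base_pointer_field(uint32_t base_offset_bytes)
{
        return base_offset_bytes >> 6; /* bytes -> 64-byte units */
}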
/* Since other platform devices may produce UIF images even
* when they're not big enough for V3D to assume they're UIF,
* we force images with level 0 as UIF to be always treated
@@ -977,7 +1011,8 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d,
v3dx_pack(map, TEXTURE_SHADER_STATE, tex) {
if (prsc->target != PIPE_BUFFER) {
- v3d_setup_texture_shader_state(&tex, prsc,
+ v3d_setup_texture_shader_state(&v3d->screen->devinfo,
+ &tex, prsc,
cso->u.tex.first_level,
cso->u.tex.last_level,
cso->u.tex.first_layer,
@@ -990,7 +1025,13 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d,
cso->u.buf.size);
}
- tex.srgb = util_format_is_srgb(cso->format);
+ bool is_srgb = util_format_is_srgb(cso->format);
+#if V3D_VERSION <= 42
+ tex.srgb = is_srgb;
+#endif
+#if V3D_VERSION >= 71
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+#endif
#if V3D_VERSION >= 40
tex.swizzle_r = v3d_translate_pipe_swizzle(so->swizzle[0]);
@@ -1040,7 +1081,10 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d,
* shader code if we wanted to read an MSAA sRGB
* texture without sRGB decode.
*/
+#if V3D_VERSION <= 42
tex.srgb = false;
+#endif
+
} else {
tex.texture_type = v3d_get_tex_format(&screen->devinfo,
cso->format);
@@ -1404,7 +1448,8 @@ v3d_create_image_view_texture_shader_state(struct v3d_context *v3d,
v3dx_pack(map, TEXTURE_SHADER_STATE, tex) {
if (prsc->target != PIPE_BUFFER) {
- v3d_setup_texture_shader_state(&tex, prsc,
+ v3d_setup_texture_shader_state(&v3d->screen->devinfo,
+ &tex, prsc,
iview->base.u.tex.level,
iview->base.u.tex.level,
iview->base.u.tex.first_layer,
diff --git a/src/gallium/drivers/v3d/v3dx_tfu.c b/src/gallium/drivers/v3d/v3dx_tfu.c
new file mode 100644
index 00000000000..d6b51390a11
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3dx_tfu.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright © 2021 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3d_context.h"
+#include "broadcom/common/v3d_tfu.h"
+
+bool
+v3dX(tfu)(struct pipe_context *pctx,
+ struct pipe_resource *pdst,
+ struct pipe_resource *psrc,
+ unsigned int src_level,
+ unsigned int base_level,
+ unsigned int last_level,
+ unsigned int src_layer,
+ unsigned int dst_layer,
+ bool for_mipmap)
+{
+ struct v3d_context *v3d = v3d_context(pctx);
+ struct v3d_screen *screen = v3d->screen;
+ struct v3d_resource *src = v3d_resource(psrc);
+ struct v3d_resource *dst = v3d_resource(pdst);
+ struct v3d_resource_slice *src_base_slice = &src->slices[src_level];
+ struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level];
+ int msaa_scale = pdst->nr_samples > 1 ? 2 : 1;
+ int width = u_minify(pdst->width0, base_level) * msaa_scale;
+ int height = u_minify(pdst->height0, base_level) * msaa_scale;
+ enum pipe_format pformat;
+
+ if (psrc->format != pdst->format)
+ return false;
+ if (psrc->nr_samples != pdst->nr_samples)
+ return false;
+
+ if (pdst->target != PIPE_TEXTURE_2D || psrc->target != PIPE_TEXTURE_2D)
+ return false;
+
+ /* Can't write to raster. */
+ if (dst_base_slice->tiling == V3D_TILING_RASTER)
+ return false;
+
+ /* When using the TFU for a blit we are doing an exact copy (input and
+ * output formats must be the same, no scaling, etc.), so no pixel format
+ * conversion takes place. Thus we can rewrite the format to one that is
+ * TFU compatible, based solely on its texel size.
+ */
+ if (for_mipmap) {
+ pformat = pdst->format;
+ } else {
+ switch (dst->cpp) {
+ case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT; break;
+ case 8: pformat = PIPE_FORMAT_R16G16B16A16_FLOAT; break;
+ case 4: pformat = PIPE_FORMAT_R32_FLOAT; break;
+ case 2: pformat = PIPE_FORMAT_R16_FLOAT; break;
+ case 1: pformat = PIPE_FORMAT_R8_UNORM; break;
+ default: unreachable("unsupported format bit-size"); break;
+ };
+ }
+
+ uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat);
+
+ if (!v3dX(tfu_supports_tex_format)(tex_format, for_mipmap)) {
+ assert(for_mipmap);
+ return false;
+ }
+
+ v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false);
+ v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false);
+
+ struct drm_v3d_submit_tfu tfu = {
+ .ios = (height << 16) | width,
+ .bo_handles = {
+ dst->bo->handle,
+ src != dst ? src->bo->handle : 0
+ },
+ .in_sync = v3d->out_sync,
+ .out_sync = v3d->out_sync,
+ };
+ uint32_t src_offset = (src->bo->offset +
+ v3d_layer_offset(psrc, src_level, src_layer));
+ tfu.iia |= src_offset;
+
+ uint32_t dst_offset = (dst->bo->offset +
+ v3d_layer_offset(pdst, base_level, dst_layer));
+ tfu.ioa |= dst_offset;
+
+ switch (src_base_slice->tiling) {
+ case V3D_TILING_UIF_NO_XOR:
+ case V3D_TILING_UIF_XOR:
+ tfu.iis |= (src_base_slice->padded_height /
+ (2 * v3d_utile_height(src->cpp)));
+ break;
+ case V3D_TILING_RASTER:
+ tfu.iis |= src_base_slice->stride / src->cpp;
+ break;
+ case V3D_TILING_LINEARTILE:
+ case V3D_TILING_UBLINEAR_1_COLUMN:
+ case V3D_TILING_UBLINEAR_2_COLUMN:
+ break;
+ }
+
+#if V3D_VERSION <= 42
+ if (src_base_slice->tiling == V3D_TILING_RASTER) {
+ tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER <<
+ V3D33_TFU_ICFG_FORMAT_SHIFT);
+ } else {
+ tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE +
+ (src_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
+ V3D33_TFU_ICFG_FORMAT_SHIFT);
+ }
+ tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT;
+
+ if (last_level != base_level)
+ tfu.ioa |= V3D33_TFU_IOA_DIMTW;
+
+ tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE +
+ (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
+ V3D33_TFU_IOA_FORMAT_SHIFT);
+
+ tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT;
+
+ /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
+ * OPAD field for the destination (how many extra UIF blocks beyond
+ * those necessary to cover the height). When filling mipmaps, the
+ * miplevel 1+ tiling state is inferred.
+ */
+ if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR ||
+ dst_base_slice->tiling == V3D_TILING_UIF_XOR) {
+ int uif_block_h = 2 * v3d_utile_height(dst->cpp);
+ int implicit_padded_height = align(height, uif_block_h);
+
+ tfu.icfg |= (((dst_base_slice->padded_height -
+ implicit_padded_height) / uif_block_h) <<
+ V3D33_TFU_ICFG_OPAD_SHIFT);
+ }
+#endif /* V3D_VERSION <= 42 */
+
+#if V3D_VERSION >= 71
+ if (src_base_slice->tiling == V3D_TILING_RASTER) {
+ tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT;
+ } else {
+ tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE +
+ (src_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
+ V3D71_TFU_ICFG_IFORMAT_SHIFT;
+ }
+ tfu.icfg |= tex_format << V3D71_TFU_ICFG_OTYPE_SHIFT;
+
+ if (last_level != base_level)
+ tfu.v71.ioc |= V3D71_TFU_IOC_DIMTW;
+
+ tfu.v71.ioc |= ((V3D71_TFU_IOC_FORMAT_LINEARTILE +
+ (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
+ V3D71_TFU_IOC_FORMAT_SHIFT);
+
+ switch (dst_base_slice->tiling) {
+ case V3D_TILING_UIF_NO_XOR:
+ case V3D_TILING_UIF_XOR:
+ tfu.v71.ioc |=
+ (dst_base_slice->padded_height / (2 * v3d_utile_height(dst->cpp))) <<
+ V3D71_TFU_IOC_STRIDE_SHIFT;
+ break;
+ case V3D_TILING_RASTER:
+ tfu.v71.ioc |= (dst_base_slice->padded_height / dst->cpp) <<
+ V3D71_TFU_IOC_STRIDE_SHIFT;
+ break;
+ default:
+ break;
+ }
+
+ tfu.v71.ioc |= (last_level - base_level) << V3D71_TFU_IOC_NUMMM_SHIFT;
+#endif /* V3D_VERSION >= 71 */
+
+ int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu);
+ if (ret != 0) {
+ fprintf(stderr, "Failed to submit TFU job: %d\n", ret);
+ return false;
+ }
+
+ dst->writes++;
+
+ return true;
+}
+
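
A worked sketch of the 7.1 IOC stride field set in the UIF cases above. v3d_utile_height() is the helper the hunk itself uses; the include path and the example utile height (4 pixels for cpp = 4, matching 64-byte utiles) are assumptions for illustration.

#include <stdint.h>
#include "v3d_tiling.h" /* v3d_utile_height(); path assumed */

/* Sketch: UIF destinations express the stride in UIF-block rows, and a
 * UIF block is two utiles tall. E.g. with cpp = 4 a utile is 4 pixels
 * tall, so padded_height = 96 packs as 96 / (2 * 4) = 12. */
static uint32_t
v71_ioc_uif_stride(uint32_t padded_height, int cpp)
{
        return padded_height / (2 * v3d_utile_height(cpp));
}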
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 1c3f77f6588..9bdefb55194 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -610,6 +610,10 @@ struct pipe_resource
unsigned bind; /**< bitmask of PIPE_BIND_x */
unsigned flags; /**< bitmask of PIPE_RESOURCE_FLAG_x */
+ /* Hack for avoiding sync on v3d */
+ unsigned sync_condition;
+ unsigned sync_status;
+
/**
* For planar images, ie. YUV EGLImage external, etc, pointer to the
* next plane.
diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c
index 32135770e9d..2534c817dcc 100644
--- a/src/loader/loader_dri3_helper.c
+++ b/src/loader/loader_dri3_helper.c
@@ -275,7 +275,7 @@ dri3_update_max_num_back(struct loader_dri3_drawable *draw)
if (draw->swap_interval == 0)
draw->max_num_back = 4;
else
- draw->max_num_back = 3;
+ draw->max_num_back = 2;
assert(draw->max_num_back <= LOADER_DRI3_MAX_BACK);
break;
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 77c38bf48d5..1eb2dac8018 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1058,6 +1058,9 @@ struct gl_texture_object
* the pipe_resource *pt above.
*/
bool needs_validation;
+
+ /* Hack for avoiding sync on v3d */
+ GLboolean SyncCondition;
};
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index d8fb1ed4317..048deaa02f6 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -273,6 +273,13 @@ set_tex_parameteri(struct gl_context *ctx,
}
switch (pname) {
+ case GL_SYNC_CONDITION:
+ if (!!texObj->SyncCondition == !!params[0])
+ return GL_FALSE;
+ texObj->SyncCondition = !!params[0];
+ return GL_TRUE;
+ case GL_SYNC_STATUS:
+ return GL_TRUE;
case GL_TEXTURE_MIN_FILTER:
if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target))
goto invalid_dsa;
@@ -930,6 +937,17 @@ _mesa_texture_parameter_invalidate(struct gl_context *ctx,
{
if (texparam_invalidates_sampler_views(pname))
st_texture_release_all_sampler_views(st_context(ctx), texObj);
+
+ switch (pname) {
+ case GL_SYNC_CONDITION:
+ texObj->pt->sync_condition = texObj->SyncCondition;
+ break;
+ case GL_SYNC_STATUS:
+ texObj->pt->sync_status = 1;
+ break;
+ default:
+ ; /* nothing */
+ }
}
void
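
Since set_tex_parameteri() is the common backend of glTexParameteri(), the downstream no-sync hack above would presumably be driven from a client like this. This is a hedged usage sketch, not documented API: the reuse of the core sync-object pnames as texture parameters is specific to this patch.

#include <GLES3/gl3.h>

/* Hypothetical client-side use of the downstream hack: tag a texture as
 * not requiring implicit sync. Not standard GL behaviour. */
static void
mark_texture_no_sync(GLuint tex)
{
        glBindTexture(GL_TEXTURE_2D, tex);
        glTexParameteri(GL_TEXTURE_2D, GL_SYNC_CONDITION, GL_TRUE);
}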
diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf
index 24cc2888755..2bc2748e7fe 100644
--- a/src/util/00-mesa-defaults.conf
+++ b/src/util/00-mesa-defaults.conf
@@ -77,6 +77,7 @@ TODO: document the other workarounds.
<!-- using vulkan wsi for xservers causes deadlocks -->
<application name="Xwayland" executable="Xwayland">
<option name="disable_xcb_surface" value="true" />
+ <option name="v3d_is_xserver_process" value="true" />
</application>
<application name="Unigine Heaven (32-bit)" executable="heaven_x86">
@@ -750,6 +751,7 @@ TODO: document the other workarounds.
<application name="mutter" executable="mutter">
<option name="adaptive_sync" value="false" />
<option name="v3d_nonmsaa_texture_size_limit" value="true" />
+ <option name="v3d_maintain_ignorable_scanout" value="true" />
</application>
<application name="muffin" executable="muffin">
<option name="adaptive_sync" value="false" />
@@ -801,6 +803,7 @@ TODO: document the other workarounds.
</application>
<application name="Xorg" executable="Xorg">
<option name="v3d_nonmsaa_texture_size_limit" value="true" />
+ <option name="v3d_is_xserver_process" value="true" />
</application>
<application name="gfxbench" executable="testfw_app">
diff --git a/src/util/driconf.h b/src/util/driconf.h
index ab7aa2c6553..70fa9f7b41b 100644
--- a/src/util/driconf.h
+++ b/src/util/driconf.h
@@ -517,6 +517,14 @@
DRI_CONF_OPT_B(v3d_nonmsaa_texture_size_limit, def, \
"Report the non-MSAA-only texture size limit")
+#define DRI_CONF_V3D_IS_XSERVER_PROCESS(def) \
+ DRI_CONF_OPT_B(v3d_is_xserver_process, def, \
+ "Identifies if the application is the Xserver.")
+
+#define DRI_CONF_V3D_MAINTAIN_IGNORABLE_SCANOUT(def) \
+ DRI_CONF_OPT_B(v3d_maintain_ignorable_scanout, def, \
+ "Maintain SCANOUT usage on resource allocations when the environment allows ignoring SCANOUT usage.")
+
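
A hedged sketch of how these boolean options would typically be consumed driver-side, via the standard xmlconfig query helper; the option-cache setup is elided and the wrapper name is mine.

#include <stdbool.h>
#include "util/xmlconfig.h" /* driOptionCache, driQueryOptionb() */

/* Sketch: query a boolean driconf option once the cache is initialized;
 * the string must match the name passed to DRI_CONF_OPT_B above. */
static bool
v3d_is_xserver(const driOptionCache *cache)
{
        return driQueryOptionb(cache, "v3d_is_xserver_process");
}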
/**
* \brief virgl specific configuration options
*/