nixfiles/hosts/raspberry-pi5/profiles/rbp2-001-add-raspberrypi5-support.patch

diff --git a/include/drm-uapi/v3d_drm.h b/include/drm-uapi/v3d_drm.h
index 3dfc0af8756..1a7d7a689de 100644
--- a/include/drm-uapi/v3d_drm.h
+++ b/include/drm-uapi/v3d_drm.h
@@ -319,6 +319,11 @@ struct drm_v3d_submit_tfu {
/* Pointer to an array of ioctl extensions*/
__u64 extensions;
+
+ struct {
+ __u32 ioc;
+ __u32 pad;
+ } v71;
};
/* Submits a compute shader for dispatch. This job will block on any
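Note: the new v71 sub-struct above lets userspace pass the TFU IOC register value on V3D 7.x. A minimal sketch of how a driver might fill it, assuming an open V3D render-node fd and a previously composed IOC word (neither is part of this patch):

    /* Hedged sketch: submit a TFU job on V3D 7.x. 'fd' and 'ioc_word'
     * are hypothetical; DRM_IOCTL_V3D_SUBMIT_TFU is the existing TFU
     * submit ioctl from this header.
     */
    struct drm_v3d_submit_tfu tfu = { 0 };
    tfu.v71.ioc = ioc_word;
    int ret = drmIoctl(fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu);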
diff --git a/src/broadcom/cle/meson.build b/src/broadcom/cle/meson.build
index 31a0d5bfa94..8ac32b313e4 100644
--- a/src/broadcom/cle/meson.build
+++ b/src/broadcom/cle/meson.build
@@ -23,7 +23,8 @@ v3d_versions = [
[21, 21],
[33, 33],
[41, 33],
- [42, 33]
+ [42, 33],
+ [71, 33]
]
v3d_xml_files = []
diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml
index a0242b5f1c2..624353ca2bf 100644
--- a/src/broadcom/cle/v3d_packet_v33.xml
+++ b/src/broadcom/cle/v3d_packet_v33.xml
@@ -1,4 +1,4 @@
-<vcxml gen="3.3" min_ver="33" max_ver="42">
+<vcxml gen="3.3" min_ver="33" max_ver="71">
<enum name="Compare Function" prefix="V3D_COMPARE_FUNC">
<value name="NEVER" value="0"/>
@@ -167,13 +167,36 @@
<value name="depth_16" value="2"/>
</enum>
- <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41">
+ <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41" max_ver="42">
<value name="none" value="0"/> <!-- no clamping -->
<value name="norm" value="1"/> <!-- [0,1] for f16 -->
<value name="pos" value="2"/> <!-- [0, for f16 -->
<value name="int" value="3" min_ver="42"/> <!-- clamp to integer RT's range -->
</enum>
+ <enum name="Render Target Type Clamp" prefix="V3D_RENDER_TARGET_TYPE_CLAMP" min_ver="71">
+ <value name="8i" value="0"/> <!-- no clamping -->
+ <value name="16i" value="1"/> <!-- no clamping -->
+ <value name="32i" value="2"/> <!-- no clamping -->
+ <value name="8ui" value="4"/> <!-- no clamping -->
+ <value name="16ui" value="5"/> <!-- no clamping -->
+ <value name="32ui" value="6"/> <!-- no clamping -->
+ <value name="8" value="8"/> <!-- no clamping -->
+ <value name="16f" value="9"/> <!-- no clamping -->
+ <value name="32f" value="10"/> <!-- no clamping -->
+ <value name="8i_clamped" value="16"/> <!-- clamp to integer RT's range -->
+ <value name="16i_clamped" value="17"/> <!-- clamp to integer RT's range -->
+ <value name="32i_clamped" value="18"/> <!-- clamp to integer RT's range -->
+ <value name="8ui_clamped" value="20"/> <!-- clamp to integer RT's range -->
+ <value name="16ui_clamped" value="21"/> <!-- clamp to integer RT's range -->
+ <value name="32ui_clamped" value="22"/> <!-- clamp to integer RT's range -->
+ <value name="16f_clamp_norm" value="24"/> <!-- [0,1] for f16 -->
+ <value name="16f_clamp_pos" value="25"/> <!-- [0, for f16 -->
+ <value name="16f_clamp_pq" value="26"/> <!-- PQ lin range, colour to [0, 125], alpha to [0, 1] for f16 -->
+ <value name="16f_clamp_hlg" value="27"/> <!-- HLG lin range, colour to [0, 12], alpha to [0, 1] for f16 -->
+ <value name="invalid" value="32"/>
+ </enum>
+
<!---
CL cache flush commands are not fully documented and subject to a
number of hardware issues that make them unreliable. Specifically:
@@ -263,13 +286,27 @@
<value name="r8ui" value="36"/>
<value name="srgbx8" value="37" max_ver="33"/>
<value name="rgbx8" value="38" max_ver="33"/>
- <value name="bstc" value="39" min_ver="41"/>
+ <value name="bstc8" value="39" min_ver="41"/>
<value name="d32f" value="40" min_ver="41"/>
<value name="d24" value="41" min_ver="41"/>
<value name="d16" value="42" min_ver="41"/>
<value name="d24s8" value="43" min_ver="41"/>
<value name="s8" value="44" min_ver="41"/>
<value name="rgba5551" value="45" min_ver="41"/>
+ <value name="bstc8_srgb" value="46" min_ver="71"/>
+ <value name="bstc10" value="47" min_ver="71"/>
+ <value name="bstc10_srgb" value="48" min_ver="71"/>
+ <value name="bstc10_pq" value="49" min_ver="71"/>
+ <value name="rgba10x6" value="50" min_ver="71"/>
+ <value name="bstc10_hlg" value="55" min_ver="71"/>
+ <value name="rgba10x6_hlg" value="56" min_ver="71"/>
+ <value name="rgb10_a2_hlg" value="57" min_ver="71"/>
+ <value name="bstc10_pq_bt1886" value="58" min_ver="71"/>
+ <value name="rgba10x6_pq_bt1886" value="59" min_ver="71"/>
+ <value name="rgb10_a2_pq_bt1886" value="60" min_ver="71"/>
+ <value name="bstc10_hlg_bt1886" value="61" min_ver="71"/>
+ <value name="rgba10x6_hlg_bt1886" value="62" min_ver="71"/>
+ <value name="rgb10_a2_hlg_bt1886" value="63" min_ver="71"/>
</enum>
<enum name="Z/S Output Image Format" prefix="V3D_OUTPUT_IMAGE_FORMAT_ZS" max_ver="33">
@@ -314,6 +351,12 @@
<value name="perp end caps" value="1"/>
</enum>
+ <enum name="Z Clip Mode" prefix="V3D_Z_CLIP_MODE">
+ <value name="NONE" value="0"/>
+ <value name="MIN_ONE_TO_ONE" value="1"/>
+ <value name="ZERO_TO_ONE" value="2"/>
+ </enum>
+
<packet code="0" name="Halt"/>
<packet code="1" name="NOP"/>
<packet code="4" name="Flush"/>
@@ -381,11 +424,13 @@
<field name="Last Tile of Frame" size="1" start="0" type="bool"/>
</packet>
- <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41">
+ <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41" max_ver="42">
<field name="Clear Z/Stencil Buffer" size="1" start="1" type="bool"/>
<field name="Clear all Render Targets" size="1" start="0" type="bool"/>
</packet>
+ <packet code="25" shortname="clear_rt" name="Clear Render Targets" cl="R" min_ver="71"/>
+
<packet code="26" shortname="load" name="Reload Tile Color Buffer" cl="R" max_ver="33">
<field name="Disable Color Buffer load" size="8" start="8" type="uint"/>
<field name="Enable Z load" size="1" start="7" type="bool"/>
@@ -443,6 +488,10 @@
<value name="Render target 1" value="1"/>
<value name="Render target 2" value="2"/>
<value name="Render target 3" value="3"/>
+ <value name="Render target 4" value="4" min_ver="71"/>
+ <value name="Render target 5" value="5" min_ver="71"/>
+ <value name="Render target 6" value="6" min_ver="71"/>
+ <value name="Render target 7" value="7" min_ver="71"/>
<value name="None" value="8"/>
<value name="Z" value="9"/>
<value name="Stencil" value="10"/>
@@ -789,7 +838,7 @@
<field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
</packet>
- <packet code="84" name="Blend Cfg" min_ver="41">
+ <packet code="84" name="Blend Cfg" min_ver="41" max_ver="42">
<field name="Render Target Mask" size="4" start="24" type="uint"/>
<field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
<field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
@@ -799,6 +848,16 @@
<field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
</packet>
+ <packet code="84" name="Blend Cfg" min_ver="71">
+ <field name="Render Target Mask" size="8" start="24" type="uint"/>
+ <field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
+ <field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
+ <field name="Color blend mode" size="4" start="12" type="Blend Mode"/>
+ <field name="Alpha blend dst factor" size="4" start="8" type="Blend Factor"/>
+ <field name="Alpha blend src factor" size="4" start="4" type="Blend Factor"/>
+ <field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
+ </packet>
+
<packet code="86" shortname="blend_ccolor" name="Blend Constant Color">
<field name="Alpha (F16)" size="16" start="48" type="uint"/>
<field name="Blue (F16)" size="16" start="32" type="uint"/>
@@ -828,7 +887,12 @@
<field name="address" size="32" start="0" type="address"/>
</packet>
- <packet code="96" name="Cfg Bits">
+ <packet code="93" name="Depth Bounds Test Limits" min_ver="71">
+ <field name="Lower Test Limit" size="32" start="0" type="float"/>
+ <field name="Upper Test Limit" size="32" start="32" type="float"/>
+ </packet>
+
+ <packet code="96" name="Cfg Bits" max_ver="42">
<field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
<field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
<field name="Blend enable" size="1" start="19" type="bool"/>
@@ -846,6 +910,25 @@
<field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
</packet>
+ <packet code="96" name="Cfg Bits" min_ver="71">
+ <field name="Z Clipping mode" size="2" start="22" type="Z Clip Mode"/>
+ <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
+ <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
+ <field name="Blend enable" size="1" start="19" type="bool"/>
+ <field name="Stencil enable" size="1" start="18" type="bool"/>
+ <field name="Z updates enable" size="1" start="15" type="bool"/>
+ <field name="Depth-Test Function" size="3" start="12" type="Compare Function"/>
+ <field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/>
+ <field name="Z Clamp Mode" size="1" start="10" type="bool"/>
+ <field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/>
+ <field name="Depth Bounds Test Enable" size="1" start="5" type="bool"/>
+ <field name="Line Rasterization" size="1" start="4" type="uint"/>
+ <field name="Enable Depth Offset" size="1" start="3" type="bool"/>
+ <field name="Clockwise Primitives" size="1" start="2" type="bool"/>
+ <field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/>
+ <field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
+ </packet>
+
<packet code="97" shortname="zero_all_flatshade_flags" name="Zero All Flat Shade Flags"/>
<packet code="98" shortname="flatshade_flags" name="Flat Shade Flags">
@@ -907,16 +990,26 @@
<field name="Minimum Zw" size="32" start="0" type="float"/>
</packet>
- <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B">
+ <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" max_ver="42">
<field name="Viewport Half-Height in 1/256th of pixel" size="32" start="32" type="float"/>
<field name="Viewport Half-Width in 1/256th of pixel" size="32" start="0" type="float"/>
</packet>
+ <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" min_ver="71">
+ <field name="Viewport Half-Height in 1/64th of pixel" size="32" start="32" type="float"/>
+ <field name="Viewport Half-Width in 1/64th of pixel" size="32" start="0" type="float"/>
+ </packet>
+
<packet shortname="clipper_z" name="Clipper Z Scale and Offset" code="111" cl="B">
<field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
<field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
</packet>
+ <packet shortname="clipper_z_no_guardband" name="Clipper Z Scale and Offset no guardband" code="112" cl="B" min_ver="71">
+ <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
+ <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
+ </packet>
+
<packet name="Number of Layers" code="119" min_ver="41">
<field name="Number of Layers" size="8" start="0" type="uint" minus_one="true"/>
</packet>
@@ -947,7 +1040,7 @@
<field name="sub-id" size="1" start="0" type="uint" default="0"/>
</packet>
- <packet code="120" name="Tile Binning Mode Cfg" min_ver="41">
+ <packet code="120" name="Tile Binning Mode Cfg" min_ver="41" max_ver="42">
<field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/>
<field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/>
@@ -971,6 +1064,35 @@
</field>
</packet>
+ <packet code="120" name="Tile Binning Mode Cfg" min_ver="71">
+ <field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/>
+ <field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/>
+
+ <field name="Log2 Tile Height" size="3" start="11" type="uint">
+ <value name="tile height 8 pixels" value="0"/>
+ <value name="tile height 16 pixels" value="1"/>
+ <value name="tile height 32 pixels" value="2"/>
+ <value name="tile height 64 pixels" value="3"/>
+ </field>
+ <field name="Log2 Tile Width" size="3" start="8" type="uint">
+ <value name="tile width 8 pixels" value="0"/>
+ <value name="tile width 16 pixels" value="1"/>
+ <value name="tile width 32 pixels" value="2"/>
+ <value name="tile width 64 pixels" value="3"/>
+ </field>
+
+ <field name="tile allocation block size" size="2" start="4" type="uint">
+ <value name="tile allocation block size 64b" value="0"/>
+ <value name="tile allocation block size 128b" value="1"/>
+ <value name="tile allocation block size 256b" value="2"/>
+ </field>
+ <field name="tile allocation initial block size" size="2" start="2" type="uint">
+ <value name="tile allocation initial block size 64b" value="0"/>
+ <value name="tile allocation initial block size 128b" value="1"/>
+ <value name="tile allocation initial block size 256b" value="2"/>
+ </field>
+ </packet>
+
<packet code="120" name="Tile Binning Mode Cfg (Part2)" cl="B" max_ver="33">
<field name="Tile Allocation Memory Address" size="32" start="32" type="address"/>
<field name="Tile Allocation Memory Size" size="32" start="0" type="uint"/>
@@ -1002,7 +1124,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="0"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41" max_ver="42">
<field name="Pad" size="12" start="52" type="uint"/>
<field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
@@ -1018,7 +1140,11 @@
<field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
<field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
- <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/>
+ <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP">
+ <value name="Render target maximum 32bpp" value="0"/>
+ <value name="Render target maximum 64bpp" value="1"/>
+ <value name="Render target maximum 128bpp" value="2"/>
+ </field>
<field name="Image Height (pixels)" size="16" start="24" type="uint"/>
<field name="Image Width (pixels)" size="16" start="8" type="uint"/>
@@ -1027,6 +1153,43 @@
<field name="sub-id" size="4" start="0" type="uint" default="0"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="71">
+ <field name="Pad" size="6" start="58" type="uint"/>
+
+ <field name="Log2 Tile Height" size="3" start="55" type="uint">
+ <value name="tile height 8 pixels" value="0"/>
+ <value name="tile height 16 pixels" value="1"/>
+ <value name="tile height 32 pixels" value="2"/>
+ <value name="tile height 64 pixels" value="3"/>
+ </field>
+ <field name="Log2 Tile Width" size="3" start="52" type="uint">
+ <value name="tile width 8 pixels" value="0"/>
+ <value name="tile width 16 pixels" value="1"/>
+ <value name="tile width 32 pixels" value="2"/>
+ <value name="tile width 64 pixels" value="3"/>
+ </field>
+
+ <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
+ <field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/>
+
+ <field name="Early-Z disable" size="1" start="46" type="bool"/>
+
+ <field name="Early-Z Test and Update Direction" size="1" start="45" type="uint">
+ <value name="Early-Z direction LT/LE" value="0"/>
+ <value name="Early-Z direction GT/GE" value="1"/>
+ </field>
+
+ <field name="Depth-buffer disable" size="1" start="44" type="bool"/>
+ <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
+ <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
+
+ <field name="Image Height (pixels)" size="16" start="24" type="uint"/>
+ <field name="Image Width (pixels)" size="16" start="8" type="uint"/>
+ <field name="Number of Render Targets" size="4" start="4" type="uint" minus_one="true"/>
+
+ <field name="sub-id" size="3" start="0" type="uint" default="0"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" max_ver="33">
<field name="Address" size="32" start="32" type="address"/>
@@ -1048,7 +1211,8 @@
<field name="sub-id" size="4" start="0" type="uint" default="2"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41">
+ <!-- On 4.1 the real name would be "Tile Rendering Mode Cfg (Render Target Configs)" -->
+ <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41" max_ver="42">
<field name="Pad" size="28" start="36" type="uint"/>
@@ -1099,7 +1263,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="3"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41" max_ver="42">
<field name="unused" size="16" start="48" type="uint"/>
<field name="Z Clear Value" size="32" start="16" type="float"/>
@@ -1108,6 +1272,15 @@
<field name="sub-id" size="4" start="0" type="uint" default="2"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="71">
+ <field name="unused" size="16" start="48" type="uint"/>
+
+ <field name="Z Clear Value" size="32" start="16" type="float"/>
+
+ <field name="Stencil Clear Value" size="8" start="8" type="uint"/>
+ <field name="sub-id" size="4" start="0" type="uint" default="1"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" max_ver="33">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
@@ -1117,7 +1290,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="4"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41" max_ver="42">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
<field name="Clear Color low 32 bits" size="32" start="8" type="uint"/>
@@ -1126,6 +1299,19 @@
<field name="sub-id" size="4" start="0" type="uint" default="3"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part1)" cl="R" min_ver="71">
+
+ <field name="Clear Color low bits" size="32" start="32" type="uint"/>
+ <field name="Internal Type and Clamping" size="5" start="27" type="Render Target Type Clamp"/>
+ <field name="Internal BPP" size="2" start="25" type="Internal BPP"/>
+
+ <field name="Stride" size="7" start="18" type="uint" minus_one="true"/>
+ <!-- In multiples of 512 bits -->
+ <field name="Base Address" size="11" start="7" type="uint"/>
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="2"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" max_ver="33">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
@@ -1135,7 +1321,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="5"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41" max_ver="42">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
<field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/>
@@ -1144,6 +1330,13 @@
<field name="sub-id" size="4" start="0" type="uint" default="4"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part2)" cl="R" min_ver="71">
+ <field name="Clear Color mid bits" size="40" start="24" type="uint"/>
+
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="3"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" max_ver="33">
<field name="pad" size="11" start="53" type="uint"/>
<field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
@@ -1155,7 +1348,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="6"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41" max_ver="42">
<field name="pad" size="11" start="53" type="uint"/>
<field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
<!-- image height is for Y flipping -->
@@ -1166,6 +1359,13 @@
<field name="sub-id" size="4" start="0" type="uint" default="5"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part3)" cl="R" min_ver="71">
+ <field name="Clear Color top bits" size="56" start="8" type="uint"/>
+
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="4"/>
+ </packet>
+
<packet code="124" shortname="tile_coords" name="Tile Coordinates">
<field name="tile row number" size="12" start="12" type="uint"/>
<field name="tile column number" size="12" start="0" type="uint"/>
@@ -1240,7 +1440,7 @@
<field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
</struct>
- <struct name="GL Shader State Record" min_ver="41">
+ <struct name="GL Shader State Record" min_ver="41" max_ver="42">
<field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
<field name="Enable clipping" size="1" start="1" type="bool"/>
@@ -1299,6 +1499,63 @@
<field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
</struct>
+ <struct name="GL Shader State Record" min_ver="71">
+ <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
+ <field name="Enable clipping" size="1" start="1" type="bool"/>
+
+ <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/>
+ <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/>
+ <field name="Base Instance ID read by coordinate shader" size="1" start="4" type="bool"/>
+ <field name="Vertex ID read by vertex shader" size="1" start="5" type="bool"/>
+ <field name="Instance ID read by vertex shader" size="1" start="6" type="bool"/>
+ <field name="Base Instance ID read by vertex shader" size="1" start="7" type="bool"/>
+
+ <field name="Fragment shader does Z writes" size="1" start="8" type="bool"/>
+ <field name="Turn off early-z test" size="1" start="9" type="bool"/>
+
+ <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="12" type="bool"/>
+ <field name="Enable Sample Rate Shading" size="1" start="13" type="bool"/>
+ <field name="Any shader reads hardware-written Primitive ID" size="1" start="14" type="bool"/>
+ <field name="Insert Primitive ID as first varying to fragment shader" size="1" start="15" type="bool"/>
+ <field name="Turn off scoreboard" size="1" start="16" type="bool"/>
+ <field name="Do scoreboard wait on first thread switch" size="1" start="17" type="bool"/>
+ <field name="Disable implicit point/line varyings" size="1" start="18" type="bool"/>
+ <field name="No prim pack" size="1" start="19" type="bool"/>
+ <field name="Never defer FEP depth writes" size="1" start="20" type="bool"/>
+
+ <field name="Number of varyings in Fragment Shader" size="8" start="3b" type="uint"/>
+
+ <field name="Coordinate Shader output VPM segment size" size="4" start="4b" type="uint"/>
+ <field name="Min Coord Shader output segments required in play in addition to VCM cache size" size="4" start="36" type="uint"/>
+
+ <field name="Coordinate Shader input VPM segment size" size="4" start="5b" type="uint"/>
+ <field name="Min Coord Shader input segments required in play" size="4" start="44" type="uint" minus_one="true"/>
+
+ <field name="Vertex Shader output VPM segment size" size="4" start="6b" type="uint"/>
+ <field name="Min Vertex Shader output segments required in play in addition to VCM cache size" size="4" start="52" type="uint"/>
+
+ <field name="Vertex Shader input VPM segment size" size="4" start="7b" type="uint"/>
+ <field name="Min Vertex Shader input segments required in play" size="4" start="60" type="uint" minus_one="true"/>
+
+ <field name="Fragment Shader Code Address" size="29" start="67" type="address"/>
+ <field name="Fragment Shader 4-way threadable" size="1" start="64" type="bool"/>
+ <field name="Fragment Shader start in final thread section" size="1" start="65" type="bool"/>
+ <field name="Fragment Shader Propagate NaNs" size="1" start="66" type="bool"/>
+ <field name="Fragment Shader Uniforms Address" size="32" start="12b" type="address"/>
+
+ <field name="Vertex Shader Code Address" size="29" start="131" type="address"/>
+ <field name="Vertex Shader 4-way threadable" size="1" start="128" type="bool"/>
+ <field name="Vertex Shader start in final thread section" size="1" start="129" type="bool"/>
+ <field name="Vertex Shader Propagate NaNs" size="1" start="130" type="bool"/>
+ <field name="Vertex Shader Uniforms Address" size="32" start="20b" type="address"/>
+
+ <field name="Coordinate Shader Code Address" size="29" start="195" type="address"/>
+ <field name="Coordinate Shader 4-way threadable" size="1" start="192" type="bool"/>
+ <field name="Coordinate Shader start in final thread section" size="1" start="193" type="bool"/>
+ <field name="Coordinate Shader Propagate NaNs" size="1" start="194" type="bool"/>
+ <field name="Coordinate Shader Uniforms Address" size="32" start="28b" type="address"/>
+ </struct>
+
<struct name="Geometry Shader State Record" min_ver="41">
<field name="Geometry Bin Mode Shader Code Address" size="29" start="3" type="address"/>
<field name="Geometry Bin Mode Shader 4-way threadable" size="1" start="0" type="bool"/>
@@ -1543,7 +1800,7 @@
<field name="Offset Format 8" size="1" start="0" type="bool"/>
</struct>
- <struct name="TMU Config Parameter 2" min_ver="42">
+ <struct name="TMU Config Parameter 2" min_ver="42" max_ver="42">
<field name="Pad" size="7" start="25" type="uint"/>
<field name="LOD Query" size="1" start="24" type="bool"/>
<field name="Op" size="4" start="20" type="TMU Op"/>
@@ -1558,6 +1815,23 @@
<field name="Offset Format 8" size="1" start="0" type="bool"/>
</struct>
+ <struct name="TMU Config Parameter 2" min_ver="71">
+ <field name="Pad" size="5" start="27" type="uint"/>
+ <field name="Write conversion" size="1" start="26" type="bool"/>
+ <field name="DIM query" size="1" start="25" type="bool"/>
+ <field name="LOD Query" size="1" start="24" type="bool"/>
+ <field name="Op" size="4" start="20" type="TMU Op"/>
+ <field name="Offset R" size="4" start="16" type="int"/>
+ <field name="Offset T" size="4" start="12" type="int"/>
+ <field name="Offset S" size="4" start="8" type="int"/>
+ <field name="Gather Mode" size="1" start="7" type="bool"/>
+ <field name="Gather Component" size="2" start="5" type="uint"/>
+ <field name="Coefficient Mode" size="1" start="4" type="bool"/>
+ <field name="Sample Number" size="2" start="2" type="uint"/>
+ <field name="Disable AutoLOD" size="1" start="1" type="bool"/>
+ <field name="Offset Format 8" size="1" start="0" type="bool"/>
+ </struct>
+
<struct name="Texture Shader State" max_ver="33">
<field name="UIF XOR disable" size="1" start="255" type="bool"/>
<field name="Level 0 is strictly UIF" size="1" start="254" type="bool"/>
@@ -1611,7 +1885,7 @@
<field name="Filter" size="4" start="0" type="TMU Filter"/>
</struct>
- <struct name="Texture Shader State" min_ver="41">
+ <struct name="Texture Shader State" min_ver="41" max_ver="42">
<field name="Pad" size="56" start="136" type="uint"/>
<field name="UIF XOR disable" size="1" start="135" type="bool"/>
<field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
@@ -1652,6 +1926,82 @@
<field name="Flip texture X Axis" size="1" start="0" type="bool"/>
</struct>
+ <struct name="Texture Shader State" min_ver="71">
+ <field name="Pad" size="2" start="190" type="uint"/>
+ <!-- When we use an address type, there is an implicit requirement
+ that the address is a 32-bit value that is encoded starting at a 32-bit
+ aligned bit offset into the packet. If the address field has less than
+ 32 bits, it is assumed that the address is aligned. For example, a
+ 26-bit address field is expected to be 64-byte aligned (6 lsb bits
+ are 0) and will be encoded into a packet starting at bit
+ offset 6 into a 32-bit dword (since bits 0..5 of the address are
+ implicitly 0 and don't need to be explicitly encoded).
+
+ Unfortunately, the CB address below doesn't match this requirement:
+ it starts at bit 138, which is 10 bits into a 32-bit dword, but it
+ represents a 64-byte aligned address (6 lsb bits are 0), so we cannot
+ encode it as an address type. To fix this we encode these addresses
+ as uint types which has two implications:
+ 1. the driver is responsible for manually adding the buffer objects
+ for these addresses to the job BO list.
+ 2. the driver needs to pass an actual 26-bit address value by manually
+ shifting the 6 lsb bits (that are implicitly 0).
+ -->
+ <field name="texture_base pointer_Cr" size="26" start="164" type="uint"/>
+ <field name="texture base pointer Cb" size="26" start="138" type="uint"/>
+ <field name="Chroma offset y" size="1" start="137" type="uint"/>
+ <field name="Chroma offset x" size="1" start="136" type="uint"/>
+
+ <field name="UIF XOR disable" size="1" start="135" type="bool"/>
+ <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
+ <field name="Level 0 XOR enable" size="1" start="132" type="bool"/>
+ <field name="Level 0 UB_PAD" size="4" start="128" type="uint"/>
+
+ <field name="Base Level" size="4" start="124" type="uint"/>
+ <field name="Max Level" size="4" start="120" type="uint"/>
+
+ <field name="Swizzle A" size="3" start="117" type="uint">
+ <value name="Swizzle Zero" value="0"/>
+ <value name="Swizzle One" value="1"/>
+ <value name="Swizzle Red" value="2"/>
+ <value name="Swizzle Green" value="3"/>
+ <value name="Swizzle Blue" value="4"/>
+ <value name="Swizzle Alpha" value="5"/>
+ </field>
+
+ <field name="Swizzle B" size="3" start="114" type="uint"/>
+ <field name="Swizzle G" size="3" start="111" type="uint"/>
+ <field name="Swizzle R" size="3" start="108" type="uint"/>
+ <field name="Extended" size="1" start="107" type="bool"/>
+
+ <field name="Texture type" size="7" start="100" type="uint"/>
+ <field name="Image Depth" size="14" start="86" type="uint"/>
+ <field name="Image Height" size="14" start="72" type="uint"/>
+ <field name="Image Width" size="14" start="58" type="uint"/>
+
+ <!-- V3D 7.1.2 doesn't have the RB swap bit and has Array Stride starting
+ at bit 32. However, 7.1.5 included the RB swap bit at bit 32 and has
+ Array Stride starting at bit 33, which is backwards incompatible.
+ We use the definition from 7.1.5.
+ -->
+ <field name="Array Stride (64-byte aligned)" size="24" start="33" type="uint"/>
+ <field name="R/B swap" size="1" start="32" type="bool"/>
+
+ <field name="Texture base pointer" size="32" start="0" type="address"/>
+
+ <field name="Reverse" size="1" start="5" type="bool"/>
+ <field name="Transfer func" size="3" start="2" type="uint">
+ <value name="Transfer Func None" value="0"/>
+ <value name="Transfer Func sRGB" value="1"/>
+ <value name="Transfer Func PQ" value="2"/>
+ <value name="Transfer Func HLG" value="3"/>
+ <value name="Transfer Func PQ BT1886" value="4"/>
+ <value name="Transfer Func HLG BT1886" value="5"/>
+ </field>
+ <field name="Flip texture Y Axis" size="1" start="1" type="bool"/>
+ <field name="Flip texture X Axis" size="1" start="0" type="bool"/>
+ </struct>
+
<struct name="Sampler State" min_ver="41">
<field name="Border color word 3" size="32" start="160" type="uint"/>
<field name="Border color word 2" size="32" start="128" type="uint"/>
diff --git a/src/broadcom/cle/v3dx_pack.h b/src/broadcom/cle/v3dx_pack.h
index 5762e5aaa70..e5a1eb26698 100644
--- a/src/broadcom/cle/v3dx_pack.h
+++ b/src/broadcom/cle/v3dx_pack.h
@@ -37,6 +37,8 @@
# include "cle/v3d_packet_v41_pack.h"
#elif (V3D_VERSION == 42)
# include "cle/v3d_packet_v42_pack.h"
+#elif (V3D_VERSION == 71)
+# include "cle/v3d_packet_v71_pack.h"
#else
# error "Need to add a pack header include for this v3d version"
#endif
diff --git a/src/broadcom/clif/clif_private.h b/src/broadcom/clif/clif_private.h
index 6ace62b0310..cda407a00bf 100644
--- a/src/broadcom/clif/clif_private.h
+++ b/src/broadcom/clif/clif_private.h
@@ -101,6 +101,8 @@ bool v3d41_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
const uint8_t *cl, uint32_t *size, bool reloc_mode);
bool v3d42_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
const uint8_t *cl, uint32_t *size, bool reloc_mode);
+bool v3d71_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
+ const uint8_t *cl, uint32_t *size, bool reloc_mode);
static inline void
out(struct clif_dump *clif, const char *fmt, ...)
diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c
index 272190eb2e5..7bc2b662cfc 100644
--- a/src/broadcom/common/v3d_device_info.c
+++ b/src/broadcom/common/v3d_device_info.c
@@ -36,6 +36,9 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
struct drm_v3d_get_param ident1 = {
.param = DRM_V3D_PARAM_V3D_CORE0_IDENT1,
};
+ struct drm_v3d_get_param hub_ident3 = {
+ .param = DRM_V3D_PARAM_V3D_HUB_IDENT3,
+ };
int ret;
ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0);
@@ -62,10 +65,13 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
int qups = (ident1.value >> 8) & 0xf;
devinfo->qpu_count = nslc * qups;
+ devinfo->has_accumulators = devinfo->ver < 71;
+
switch (devinfo->ver) {
case 33:
case 41:
case 42:
+ case 71:
break;
default:
fprintf(stderr,
@@ -75,5 +81,14 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
return false;
}
- return true;
+ ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &hub_ident3);
+ if (ret != 0) {
+ fprintf(stderr, "Couldn't get V3D core HUB IDENT3: %s\n",
+ strerror(errno));
+ return false;
+ }
+
+ devinfo->rev = (hub_ident3.value >> 8) & 0xff;
+
+ return true;
}
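For reference, HUB IDENT3 packs the IP revision in bits 15:8, so a 7.1.5 part reports ver = 71 and rev = 5. The revision matters because, as the XML comment earlier in this patch notes, 7.1.5 moved the Array Stride bit relative to 7.1.2. A hedged sketch of the kind of check this enables (the helper name is hypothetical, not from this patch):

    /* True on 7.1.5+ parts, where the R/B swap bit occupies bit 32 of
     * the Texture Shader State and Array Stride starts at bit 33.
     */
    static bool has_rb_swap_bit(const struct v3d_device_info *devinfo)
    {
            return devinfo->ver > 71 ||
                   (devinfo->ver == 71 && devinfo->rev >= 5);
    }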
diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h
index 97abd9b8d9f..8dfc7858727 100644
--- a/src/broadcom/common/v3d_device_info.h
+++ b/src/broadcom/common/v3d_device_info.h
@@ -34,11 +34,17 @@ struct v3d_device_info {
/** Simple V3D version: major * 10 + minor */
uint8_t ver;
+ /** V3D revision number */
+ uint8_t rev;
+
/** Size of the VPM, in bytes. */
int vpm_size;
/* NSLC * QUPS from the core's IDENT registers. */
int qpu_count;
+
+ /* If the hw has accumulator registers */
+ bool has_accumulators;
};
typedef int (*v3d_ioctl_fun)(int fd, unsigned long request, void *arg);
diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h
index 46f38bd7484..354c8784914 100644
--- a/src/broadcom/common/v3d_limits.h
+++ b/src/broadcom/common/v3d_limits.h
@@ -42,7 +42,8 @@
#define V3D_MAX_SAMPLES 4
-#define V3D_MAX_DRAW_BUFFERS 4
+#define V3D_MAX_DRAW_BUFFERS 8
+#define V3D_MAX_RENDER_TARGETS(ver) (ver < 71 ? 4 : 8)
#define V3D_MAX_POINT_SIZE 512.0f
#define V3D_MAX_LINE_WIDTH 32
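The distinction here: V3D_MAX_DRAW_BUFFERS is the compile-time upper bound used to size arrays, while V3D_MAX_RENDER_TARGETS(ver) is the per-generation hardware limit. A quick illustrative sanity check:

    /* Both expressions are constant, so this can be checked statically. */
    _Static_assert(V3D_MAX_RENDER_TARGETS(42) == 4, "V3D 4.x exposes 4 RTs");
    _Static_assert(V3D_MAX_RENDER_TARGETS(71) == 8, "V3D 7.x exposes 8 RTs");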
diff --git a/src/broadcom/common/v3d_macros.h b/src/broadcom/common/v3d_macros.h
index fe89398208a..b4291fb5350 100644
--- a/src/broadcom/common/v3d_macros.h
+++ b/src/broadcom/common/v3d_macros.h
@@ -41,6 +41,9 @@
#elif (V3D_VERSION == 42)
# define V3DX(x) V3D42_##x
# define v3dX(x) v3d42_##x
+#elif (V3D_VERSION == 71)
+# define V3DX(x) V3D71_##x
+# define v3dX(x) v3d71_##x
#else
# error "Need to add prefixing macros for this v3d version"
#endif
diff --git a/src/broadcom/common/v3d_performance_counters.h b/src/broadcom/common/v3d_performance_counters.h
index 08d750c2cbe..a8f0cff8784 100644
--- a/src/broadcom/common/v3d_performance_counters.h
+++ b/src/broadcom/common/v3d_performance_counters.h
@@ -28,6 +28,110 @@
#define V3D_PERFCNT_NAME 1
#define V3D_PERFCNT_DESCRIPTION 2
+#ifndef V3D_VERSION
+# error "The V3D_VERSION macro must be defined"
+#endif
+
+#if (V3D_VERSION >= 71)
+
+static const char *v3d_performance_counters[][3] = {
+ {"CORE", "cycle-count", "[CORE] Cycle counter"},
+ {"CORE", "core-active", "[CORE] Bin/Render/Compute active cycles"},
+ {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"},
+ {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"},
+ {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
+ {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
+ {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
+ {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"},
+ {"FEP", "FEP-valid-quads", "[FEP] Valid quads"},
+ {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"},
+ {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"},
+ {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"},
+ {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"},
+ {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"},
+ {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"},
+ {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"},
+ {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"},
+ {"PTB", "PTB-primitives-discarded-reversed", "[PTB] Primitives that are discarded because they are reversed"},
+ {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"},
+ {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"},
+ {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"},
+ {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"},
+ {"TMU", "TMU-active-cycles", "[TMU] Active cycles"},
+ {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"},
+ {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"},
+ {"TMU", "TMU-cache-x4-active-cycles", "[TMU] Cache active cycles for x4 access"},
+ {"TMU", "TMU-cache-x4-stalled-cycles", "[TMU] Cache stalled cycles for x4 access"},
+ {"TMU", "TMU-total-text-quads-x4-access", "[TMU] Total texture cache x4 access"},
+ {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"},
+ {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"},
+ {"L2T", "L2T-local", "[L2T] Local mode access"},
+ {"L2T", "L2T-writeback", "[L2T] Writeback"},
+ {"L2T", "L2T-zero", "[L2T] Zero"},
+ {"L2T", "L2T-merge", "[L2T] Merge"},
+ {"L2T", "L2T-fill", "[L2T] Fill"},
+ {"L2T", "L2T-stalls-no-wid", "[L2T] Stalls because no WID available"},
+ {"L2T", "L2T-stalls-no-rid", "[L2T] Stalls because no RID available"},
+ {"L2T", "L2T-stalls-queue-full", "[L2T] Stalls because internal queue full"},
+ {"L2T", "L2T-stalls-wrightback", "[L2T] Stalls because writeback in flight"},
+ {"L2T", "L2T-stalls-mem", "[L2T] Stalls because AXI blocks read"},
+ {"L2T", "L2T-stalls-fill", "[L2T] Stalls because fill pending for victim cache-line"},
+ {"L2T", "L2T-hitq", "[L2T] Sent request via hit queue"},
+ {"L2T", "L2T-hitq-full", "[L2T] Sent request via main queue because hit queue is full"},
+ {"L2T", "L2T-stalls-read-data", "[L2T] Stalls because waiting for data from SDRAM"},
+ {"L2T", "L2T-TMU-read-hits", "[L2T] TMU read hits"},
+ {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"},
+ {"L2T", "L2T-VCD-read-hits", "[L2T] VCD read hits"},
+ {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"},
+ {"L2T", "L2T-SLC-read-hits", "[L2T] SLC read hits (all slices)"},
+ {"L2T", "L2T-SLC-read-miss", "[L2T] SLC read misses (all slices)"},
+ {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"},
+ {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"},
+ {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"},
+ {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"},
+ {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"},
+ {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"},
+ {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"},
+ {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"},
+ {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"},
+ {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"},
+ {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"},
+ {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"},
+ {"CORE", "core-memory-writes", "[CORE] Total memory writes"},
+ {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"},
+ {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"},
+ {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"},
+ {"CORE", "core-memory-reads", "[CORE] Total memory reads"},
+ {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"},
+ {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"},
+ {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"},
+ {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"},
+ {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"},
+ {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"},
+ {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"},
+ {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"},
+ {"AXI", "AXI-read-trans", "[AXI] Read transaction count"},
+ {"AXI", "AXI-write-trans", "[AXI] Write transaction count"},
+ {"AXI", "AXI-read-wait-cycles", "[AXI] Read total wait cycles"},
+ {"AXI", "AXI-write-wait-cycles", "[AXI] Write total wait cycles"},
+ {"AXI", "AXI-max-outstanding-reads", "[AXI] Maximium outstanding read transactions"},
+ {"AXI", "AXI-max-outstanding-writes", "[AXI] Maximum outstanding write transactions"},
+ {"QPU", "QPU-wait-bubble", "[QPU] Pipeline bubble in qcycles due all threads waiting"},
+ {"QPU", "QPU-ic-miss-bubble", "[QPU] Pipeline bubble in qcycles due instruction-cache miss"},
+ {"QPU", "QPU-active", "[QPU] Executed shader instruction"},
+ {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"},
+ {"QPU", "QPU-stalls", "[QPU] Stalled qcycles executing shader instruction"},
+ {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"},
+ {"QPU", "QPU-stalls-TMU", "[QPU] Stalled qcycles waiting for TMU"},
+ {"QPU", "QPU-stalls-TLB", "[QPU] Stalled qcycles waiting for TLB"},
+ {"QPU", "QPU-stalls-VPM", "[QPU] Stalled qcycles waiting for VPM"},
+ {"QPU", "QPU-stalls-uniforms", "[QPU] Stalled qcycles waiting for uniforms"},
+ {"QPU", "QPU-stalls-SFU", "[QPU] Stalled qcycles waiting for SFU"},
+ {"QPU", "QPU-stalls-other", "[QPU] Stalled qcycles waiting for any other reason (vary/W/Z)"},
+};
+
+#elif (V3D_VERSION >= 41)
+
static const char *v3d_performance_counters[][3] = {
{"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
{"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
@@ -118,4 +222,8 @@ static const char *v3d_performance_counters[][3] = {
{"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
};
+#else
+static const char *v3d_performance_counters[][3] = { };
+#endif
+
#endif
diff --git a/src/broadcom/common/v3d_tfu.h b/src/broadcom/common/v3d_tfu.h
index 80da224ca2d..572d0074794 100644
--- a/src/broadcom/common/v3d_tfu.h
+++ b/src/broadcom/common/v3d_tfu.h
@@ -48,4 +48,27 @@
#define V3D33_TFU_ICFG_FORMAT_UIF_NO_XOR 14
#define V3D33_TFU_ICFG_FORMAT_UIF_XOR 15
+/* Disable level 0 write, just write following mipmaps */
+#define V3D71_TFU_IOC_DIMTW (1 << 0)
+#define V3D71_TFU_IOC_FORMAT_SHIFT 12
+#define V3D71_TFU_IOC_FORMAT_LINEARTILE 3
+#define V3D71_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4
+#define V3D71_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5
+#define V3D71_TFU_IOA_FORMAT_UIF_NO_XOR 6
+#define V3D71_TFU_IOA_FORMAT_UIF_XOR 7
+
+#define V3D71_TFU_IOC_STRIDE_SHIFT 16
+#define V3D71_TFU_IOC_NUMMM_SHIFT 4
+
+#define V3D71_TFU_ICFG_OTYPE_SHIFT 16
+#define V3D71_TFU_ICFG_IFORMAT_SHIFT 23
+#define V3D71_TFU_ICFG_FORMAT_RASTER 0
+#define V3D71_TFU_ICFG_FORMAT_SAND_128 1
+#define V3D71_TFU_ICFG_FORMAT_SAND_256 2
+#define V3D71_TFU_ICFG_FORMAT_LINEARTILE 11
+#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12
+#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13
+#define V3D71_TFU_ICFG_FORMAT_UIF_NO_XOR 14
+#define V3D71_TFU_ICFG_FORMAT_UIF_XOR 15
+
#endif
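These defines describe the layout of the 7.1 TFU IOC register. A hedged sketch of composing an IOC word from them, under the assumption that num_mipmaps and out_stride stand in for values the driver computed elsewhere (the exact field semantics beyond these shifts are not spelled out in this header):

    /* UIF (no XOR) output writing 'num_mipmaps' levels with the given
     * output stride; DIMTW skips the level-0 write so only the lower
     * mipmaps are regenerated.
     */
    uint32_t num_mipmaps = 4, out_stride = 16;
    uint32_t ioc = (V3D71_TFU_IOA_FORMAT_UIF_NO_XOR << V3D71_TFU_IOC_FORMAT_SHIFT) |
                   (num_mipmaps << V3D71_TFU_IOC_NUMMM_SHIFT) |
                   (out_stride << V3D71_TFU_IOC_STRIDE_SHIFT) |
                   V3D71_TFU_IOC_DIMTW;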
diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c
index 57872a923d3..8a50d279985 100644
--- a/src/broadcom/common/v3d_util.c
+++ b/src/broadcom/common/v3d_util.c
@@ -87,10 +87,37 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
return best_wgs_per_sg;
}
+#define V3D71_TLB_COLOR_SIZE (16 * 1024)
+#define V3D71_TLB_DEPTH_SIZE (16 * 1024)
+#define V3D71_TLB_AUX_DEPTH_SIZE (8 * 1024)
+
+static bool
+tile_size_valid(uint32_t pixel_count, uint32_t color_bpp, uint32_t depth_bpp)
+{
+ /* First, we check if we can fit this tile size allocating the depth
+ * TLB memory to color.
+ */
+ if (pixel_count * depth_bpp <= V3D71_TLB_AUX_DEPTH_SIZE &&
+ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE + V3D71_TLB_DEPTH_SIZE) {
+ return true;
+ }
+
+ /* Otherwise the tile must fit in the main TLB buffers */
+ return pixel_count * depth_bpp <= V3D71_TLB_DEPTH_SIZE &&
+ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE;
+}
+
void
-v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
- bool msaa, bool double_buffer,
- uint32_t *width, uint32_t *height)
+v3d_choose_tile_size(const struct v3d_device_info *devinfo,
+ uint32_t color_attachment_count,
+ /* V3D 4.x max internal bpp of all RTs */
+ uint32_t max_internal_bpp,
+ /* V3D 7.x accumulated bpp for all RTs (in bytes) */
+ uint32_t total_color_bpp,
+ bool msaa,
+ bool double_buffer,
+ uint32_t *width,
+ uint32_t *height)
{
static const uint8_t tile_sizes[] = {
64, 64,
@@ -103,19 +130,65 @@ v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
};
uint32_t idx = 0;
- if (color_attachment_count > 2)
- idx += 2;
- else if (color_attachment_count > 1)
- idx += 1;
+ if (devinfo->ver >= 71) {
+ /* In V3D 7.x, we use the actual bpp used by color attachments to compute
+ * the tile size instead of the maximum bpp. This may allow us to choose a
+ * larger tile size than we would in 4.x in scenarios with multiple RTs
+ * with different bpps.
+ *
+ * Also, the TLB has an auxiliary buffer of 8KB that will be automatically
+ * used for depth instead of the main 16KB depth TLB buffer when the depth
+ * tile fits in the auxiliary buffer, allowing the hardware to allocate
+ * the 16KB from the main depth TLB to the color TLB. If we can do that,
+ * then we are effectively doubling the memory we have for color and we
+ * can also select a larger tile size. This is necessary to support
+ * the most expensive configuration: 8x128bpp RTs + MSAA.
+ *
+ * FIXME: the docs state that depth TLB memory can be used for color
+ * if depth testing is not used by setting the 'depth disable' bit in the
+ * rendering configuration. However, this comes with a requirement that
+ * occlusion queries must not be active. We need to clarify if this means
+ * active at the point at which we emit a tile rendering configuration
+ * item, meaning that we have a query spanning a full render pass
+ * (this is something we can tell before we emit the rendering
+ * configuration item) or active in the subpass for which we are enabling
+ * the bit (which we can't tell until later, when we record commands for
+ * the subpass). If it is the latter, then we cannot use this feature.
+ *
+ * FIXME: pending handling double_buffer.
+ */
+ const uint32_t color_bpp = total_color_bpp * (msaa ? 4 : 1);
+ const uint32_t depth_bpp = 4 * (msaa ? 4 : 1);
+ do {
+ const uint32_t tile_w = tile_sizes[idx * 2];
+ const uint32_t tile_h = tile_sizes[idx * 2 + 1];
+ if (tile_size_valid(tile_w * tile_h, color_bpp, depth_bpp))
+ break;
+ idx++;
+ } while (idx < ARRAY_SIZE(tile_sizes) / 2);
+
+ /* FIXME: pending handling double_buffer */
+ assert(!double_buffer);
+ } else {
+ /* On V3D 4.x tile size is selected based on the number of RTs, the
+ * maximum bpp across all of them and whether 4x MSAA is used.
+ */
+ if (color_attachment_count > 4)
+ idx += 3;
+ else if (color_attachment_count > 2)
+ idx += 2;
+ else if (color_attachment_count > 1)
+ idx += 1;
- /* MSAA and double-buffer are mutually exclusive */
- assert(!msaa || !double_buffer);
- if (msaa)
- idx += 2;
- else if (double_buffer)
- idx += 1;
+ /* MSAA and double-buffer are mutually exclusive */
+ assert(!msaa || !double_buffer);
+ if (msaa)
+ idx += 2;
+ else if (double_buffer)
+ idx += 1;
- idx += max_color_bpp;
+ idx += max_internal_bpp;
+ }
assert(idx < ARRAY_SIZE(tile_sizes) / 2);
@@ -170,3 +243,30 @@ v3d_hw_prim_type(enum mesa_prim prim_type)
unreachable("Unsupported primitive type");
}
}
+
+uint32_t
+v3d_internal_bpp_words(uint32_t internal_bpp)
+{
+ switch (internal_bpp) {
+ case 0 /* V3D_INTERNAL_BPP_32 */:
+ return 1;
+ case 1 /* V3D_INTERNAL_BPP_64 */:
+ return 2;
+ case 2 /* V3D_INTERNAL_BPP_128 */:
+ return 4;
+ default:
+ unreachable("Unsupported internal BPP");
+ }
+}
+
+uint32_t
+v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
+ uint32_t bpp)
+{
+ /* The stride is expressed in multiples of 128 bits and covers 2 rows:
+ * one row is (tile_width * bpp) 32-bit words, i.e. that count divided
+ * by 4 in 128-bit units, and covering two rows doubles it, hence the
+ * net division by 2 instead of 4.
+ */
+
+ return (tile_width * bpp) / 2;
+}
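Worked numbers for the TLB budget that tile_size_valid enforces, using the most expensive configuration mentioned in the comment above (8 RTs at 128bpp with 4x MSAA): total_color_bpp = 8 * 16 = 128 bytes per pixel, so color_bpp = 128 * 4 = 512 bytes and depth_bpp = 4 * 4 = 16 bytes. An 8x8 tile (64 pixels) then needs 64 * 16 = 1 KiB of depth, which fits the 8 KiB auxiliary buffer, and 64 * 512 = 32 KiB of color, which exactly fills the 16 KiB color TLB plus the 16 KiB depth TLB reallocated to color:

    /* Illustrative only: tile_size_valid is static to v3d_util.c. */
    assert(tile_size_valid(8 * 8, 512, 16));    /* 8x8 tile fits */
    assert(!tile_size_valid(16 * 8, 512, 16));  /* 16x8 tile does not */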
diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h
index eb802b77f67..d02d41dd089 100644
--- a/src/broadcom/common/v3d_util.h
+++ b/src/broadcom/common/v3d_util.h
@@ -24,6 +24,7 @@
#ifndef V3D_UTIL_H
#define V3D_UTIL_H
+#include "util/macros.h"
#include "common/v3d_device_info.h"
#include "pipe/p_defines.h"
@@ -36,9 +37,14 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
uint32_t wg_size);
void
-v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
- bool msaa, bool double_buffer,
- uint32_t *width, uint32_t *height);
+v3d_choose_tile_size(const struct v3d_device_info *devinfo,
+ uint32_t color_attachment_count,
+ uint32_t max_internal_bpp,
+ uint32_t total_color_bpp,
+ bool msaa,
+ bool double_buffer,
+ uint32_t *width,
+ uint32_t *height);
uint32_t
v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle);
@@ -46,4 +52,30 @@ v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle);
uint32_t
v3d_hw_prim_type(enum mesa_prim prim_type);
+uint32_t
+v3d_internal_bpp_words(uint32_t internal_bpp);
+
+/* Some configuration packets want the size as its log2, biased so
+ * that 0 means size 8.
+ */
+static inline uint8_t
+log2_tile_size(uint32_t size)
+{
+ switch (size) {
+ case 8:
+ return 0;
+ case 16:
+ return 1;
+ case 32:
+ return 2;
+ case 64:
+ return 3;
+ default:
+ unreachable("Unsupported tile width/height");
+ }
+}
+
+uint32_t
+v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
+ uint32_t bpp);
#endif
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index ad461dbe24c..4536d3bc67b 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -1005,32 +1005,36 @@ emit_fragcoord_input(struct v3d_compile *c, int attr)
static struct qreg
emit_smooth_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg w, struct qreg r5)
+ struct qreg vary, struct qreg w, struct qreg c_reg)
{
- return vir_FADD(c, vir_FMUL(c, vary, w), r5);
+ return vir_FADD(c, vir_FMUL(c, vary, w), c_reg);
}
static struct qreg
emit_noperspective_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg r5)
+ struct qreg vary, struct qreg c_reg)
{
- return vir_FADD(c, vir_MOV(c, vary), r5);
+ return vir_FADD(c, vir_MOV(c, vary), c_reg);
}
static struct qreg
emit_flat_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg r5)
+ struct qreg vary, struct qreg c_reg)
{
vir_MOV_dest(c, c->undef, vary);
- return vir_MOV(c, r5);
+ return vir_MOV(c, c_reg);
}
static struct qreg
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
int8_t input_idx, uint8_t swizzle, int array_index)
{
- struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
- struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
+ struct qreg c_reg; /* C coefficient */
+
+ if (c->devinfo->has_accumulators)
+ c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
+ else
+ c_reg = vir_reg(QFILE_REG, 0);
struct qinst *ldvary = NULL;
struct qreg vary;
@@ -1041,7 +1045,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
vary = vir_emit_def(c, ldvary);
} else {
vir_NOP(c)->qpu.sig.ldvary = true;
- vary = r3;
+ vary = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
}
/* Store the input value before interpolation so we can implement
@@ -1050,7 +1054,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
if (input_idx >= 0) {
assert(var);
c->interp[input_idx].vp = vary;
- c->interp[input_idx].C = vir_MOV(c, r5);
+ c->interp[input_idx].C = vir_MOV(c, c_reg);
c->interp[input_idx].mode = var->data.interpolation;
}
@@ -1060,7 +1064,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
*/
if (!var) {
assert(input_idx < 0);
- return emit_smooth_varying(c, vary, c->payload_w, r5);
+ return emit_smooth_varying(c, vary, c->payload_w, c_reg);
}
int i = c->num_inputs++;
@@ -1075,20 +1079,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
if (var->data.centroid) {
BITSET_SET(c->centroid_flags, i);
result = emit_smooth_varying(c, vary,
- c->payload_w_centroid, r5);
+ c->payload_w_centroid, c_reg);
} else {
- result = emit_smooth_varying(c, vary, c->payload_w, r5);
+ result = emit_smooth_varying(c, vary, c->payload_w, c_reg);
}
break;
case INTERP_MODE_NOPERSPECTIVE:
BITSET_SET(c->noperspective_flags, i);
- result = emit_noperspective_varying(c, vary, r5);
+ result = emit_noperspective_varying(c, vary, c_reg);
break;
case INTERP_MODE_FLAT:
BITSET_SET(c->flat_shade_flags, i);
- result = emit_flat_varying(c, vary, r5);
+ result = emit_flat_varying(c, vary, c_reg);
break;
default:
@@ -1685,6 +1689,22 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
result = vir_VFPACK(c, src[0], src[1]);
break;
+ case nir_op_vpack_v3d:
+ result = vir_VPACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_v11fpack_v3d:
+ result = vir_V11FPACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_v10pack_v3d:
+ result = vir_V10PACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_v8pack_v3d:
+ result = vir_V8PACK(c, src[0], src[1]);
+ break;
+
case nir_op_unpack_half_2x16_split_x:
result = vir_FMOV(c, src[0]);
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
@@ -1715,6 +1735,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));
break;
}
+ case nir_op_vftounorm8_v3d:
+ result = vir_VFTOUNORM8(c, src[0]);
+ break;
+
+ case nir_op_vftosnorm8_v3d:
+ result = vir_VFTOSNORM8(c, src[0]);
+ break;
+
+ case nir_op_vftounorm10lo_v3d:
+ result = vir_VFTOUNORM10LO(c, src[0]);
+ break;
+
+ case nir_op_vftounorm10hi_v3d:
+ result = vir_VFTOUNORM10HI(c, src[0]);
+ break;
+
+ case nir_op_ftounorm16_v3d:
+ result = vir_FTOUNORM16(c, src[0]);
+ break;
+
+ case nir_op_ftosnorm16_v3d:
+ result = vir_FTOSNORM16(c, src[0]);
+ break;
default:
fprintf(stderr, "unknown NIR ALU inst: ");
@@ -2440,15 +2483,17 @@ ntq_setup_outputs(struct v3d_compile *c)
switch (var->data.location) {
case FRAG_RESULT_COLOR:
- c->output_color_var[0] = var;
- c->output_color_var[1] = var;
- c->output_color_var[2] = var;
- c->output_color_var[3] = var;
+ for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
+ c->output_color_var[i] = var;
break;
case FRAG_RESULT_DATA0:
case FRAG_RESULT_DATA1:
case FRAG_RESULT_DATA2:
case FRAG_RESULT_DATA3:
+ case FRAG_RESULT_DATA4:
+ case FRAG_RESULT_DATA5:
+ case FRAG_RESULT_DATA6:
+ case FRAG_RESULT_DATA7:
c->output_color_var[var->data.location -
FRAG_RESULT_DATA0] = var;
break;
@@ -4321,7 +4366,11 @@ nir_to_vir(struct v3d_compile *c)
{
switch (c->s->info.stage) {
case MESA_SHADER_FRAGMENT:
- c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ if (c->devinfo->ver < 71)
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ else
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3));
+
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
@@ -4361,8 +4410,13 @@ nir_to_vir(struct v3d_compile *c)
V3D_QPU_WADDR_SYNC));
}
- c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
- c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ if (c->devinfo->ver <= 42) {
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ } else if (c->devinfo->ver >= 71) {
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ }
/* Set up the division between gl_LocalInvocationIndex and
* wg_in_mem in the payload reg.
@@ -4541,8 +4595,8 @@ vir_check_payload_w(struct v3d_compile *c)
vir_for_each_inst_inorder(inst, c) {
for (int i = 0; i < vir_get_nsrc(inst); i++) {
- if (inst->src[i].file == QFILE_REG &&
- inst->src[i].index == 0) {
+ if (inst->src[i].file == c->payload_w.file &&
+ inst->src[i].index == c->payload_w.index) {
c->uses_center_w = true;
return;
}
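/* Illustrative sketch, not part of the upstream patch: the payload layout
 * change the hunks above encode. On V3D 4.x the fragment shader W payload
 * arrives in rf0, while on V3D 7.x it arrives in rf3, because rf0 is now
 * the landing register for the implicit ldvary/ldunif writes. This is also
 * why vir_check_payload_w compares against c->payload_w instead of a
 * hardcoded rf0.
 */
static inline uint32_t
payload_w_reg_index(const struct v3d_device_info *devinfo)
{
        return devinfo->ver < 71 ? 0 : 3;
}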
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 3b32b48f86f..4f767296860 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -155,12 +155,13 @@ static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
enum v3d_qpu_mux mux)
{
+ assert(state->devinfo->ver < 71);
switch (mux) {
case V3D_QPU_MUX_A:
add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
break;
case V3D_QPU_MUX_B:
- if (!n->inst->qpu.sig.small_imm) {
+ if (!n->inst->qpu.sig.small_imm_b) {
add_read_dep(state,
state->last_rf[n->inst->qpu.raddr_b], n);
}
@@ -171,6 +172,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
}
}
+
+static void
+process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
+ uint8_t raddr, bool is_small_imm)
+{
+ assert(state->devinfo->ver >= 71);
+
+ if (!is_small_imm)
+ add_read_dep(state, state->last_rf[raddr], n);
+}
+
static bool
tmu_write_is_sequence_terminator(uint32_t waddr)
{
@@ -285,6 +297,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* If the input and output segments are shared, then all VPM reads to
* a location need to happen before all writes. We handle this by
* serializing all VPM operations for now.
+ *
+ * FIXME: we are assuming that the segments are shared. That is
+ * correct right now because we only ever use shared segments, but
+ * technically it is configurable.
*/
bool separate_vpm_segment = false;
@@ -305,15 +321,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* XXX: LOAD_IMM */
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
- process_mux_deps(state, n, inst->alu.add.a);
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
- process_mux_deps(state, n, inst->alu.add.b);
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.add.a.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.add.a.raddr,
+ inst->sig.small_imm_a);
+ }
+ }
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.add.b.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.add.b.raddr,
+ inst->sig.small_imm_b);
+ }
+ }
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
- process_mux_deps(state, n, inst->alu.mul.a);
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
- process_mux_deps(state, n, inst->alu.mul.b);
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.mul.a.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.mul.a.raddr,
+ inst->sig.small_imm_c);
+ }
+ }
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.mul.b.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.mul.b.raddr,
+ inst->sig.small_imm_d);
+ }
+ }
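/* A minimal sketch (the helper name is ours) of the source-to-signal
 * mapping the code above relies on for V3D 7.x: small_imm_a/b mark the
 * add ALU's a/b sources as immediates and small_imm_c/d do the same for
 * the mul ALU.
 */
static bool
source_is_small_imm(const struct v3d_qpu_sig *sig, bool is_add, int src)
{
        if (is_add)
                return src == 0 ? sig->small_imm_a : sig->small_imm_b;
        return src == 0 ? sig->small_imm_c : sig->small_imm_d;
}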
switch (inst->alu.add.op) {
case V3D_QPU_A_VPMSETUP:
@@ -386,6 +426,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
add_write_dep(state, &state->last_r[4], n);
if (v3d_qpu_writes_r5(devinfo, inst))
add_write_dep(state, &state->last_r[5], n);
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst))
+ add_write_dep(state, &state->last_rf[0], n);
/* If we add any more dependencies here we should consider whether we
* also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
@@ -500,6 +542,10 @@ struct choose_scoreboard {
int ldvary_count;
int pending_ldtmu_count;
bool first_ldtmu_after_thrsw;
+
+ /* V3D 7.x */
+ int last_implicit_rf0_write_tick;
+ bool has_rf0_flops_conflict;
};
static bool
@@ -524,7 +570,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard,
}
static bool
-reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
+reads_too_soon(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst, uint8_t raddr)
+{
+ switch (raddr) {
+ case 0: /* ldvary delayed write of C coefficient to rf0 */
+ if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static bool
+reads_too_soon_after_write(const struct v3d_device_info *devinfo,
+ struct choose_scoreboard *scoreboard,
struct qinst *qinst)
{
const struct v3d_qpu_instr *inst = &qinst->qpu;
@@ -536,24 +599,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
if (inst->alu.add.op != V3D_QPU_A_NOP) {
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
- return true;
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr))
+ return true;
+ }
}
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
- return true;
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr))
+ return true;
+ }
}
}
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
- return true;
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr))
+ return true;
+ }
}
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
- return true;
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
+ return true;
+ }
}
}
@@ -577,6 +660,21 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo,
v3d_qpu_writes_r4(devinfo, inst))
return true;
+ if (devinfo->ver <= 42)
+ return false;
+
+ /* Don't schedule anything that writes rf0 right after ldvary, since
+ * that would clash with the ldvary's delayed rf0 write (the exception
+ * is another ldvary, since its implicit rf0 write would also have
+ * one cycle of delay and would not clash).
+ */
+ if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick &&
+ (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
+ (v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+ !inst->sig.ldvary))) {
+ return true;
+ }
+
return false;
}
@@ -604,29 +702,36 @@ pixel_scoreboard_too_soon(struct v3d_compile *c,
}
static bool
-qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
+qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
uint32_t waddr) {
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return false;
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
- inst->raddr_a == waddr)
- return true;
+ if (devinfo->ver < 71) {
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
+ inst->raddr_a == waddr)
+ return true;
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
- !inst->sig.small_imm && (inst->raddr_b == waddr))
- return true;
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
+ !inst->sig.small_imm_b && (inst->raddr_b == waddr))
+ return true;
+ } else {
+ if (v3d71_qpu_reads_raddr(inst, waddr))
+ return true;
+ }
return false;
}
static bool
-mux_read_stalls(struct choose_scoreboard *scoreboard,
- const struct v3d_qpu_instr *inst)
+read_stalls(const struct v3d_device_info *devinfo,
+ struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst)
{
return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
- qpu_instruction_uses_rf(inst,
+ qpu_instruction_uses_rf(devinfo, inst,
scoreboard->last_stallable_sfu_reg);
}
@@ -692,7 +797,8 @@ enum {
V3D_PERIPHERAL_TMU_WAIT = (1 << 6),
V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7),
V3D_PERIPHERAL_TSY = (1 << 8),
- V3D_PERIPHERAL_TLB = (1 << 9),
+ V3D_PERIPHERAL_TLB_READ = (1 << 9),
+ V3D_PERIPHERAL_TLB_WRITE = (1 << 10),
};
static uint32_t
@@ -717,8 +823,10 @@ qpu_peripherals(const struct v3d_device_info *devinfo,
if (v3d_qpu_uses_sfu(inst))
result |= V3D_PERIPHERAL_SFU;
- if (v3d_qpu_uses_tlb(inst))
- result |= V3D_PERIPHERAL_TLB;
+ if (v3d_qpu_reads_tlb(inst))
+ result |= V3D_PERIPHERAL_TLB_READ;
+ if (v3d_qpu_writes_tlb(inst))
+ result |= V3D_PERIPHERAL_TLB_WRITE;
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
@@ -749,32 +857,75 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
if (devinfo->ver < 41)
return false;
- /* V3D 4.1+ allow WRTMUC signal with TMU register write (other than
- * tmuc).
+ /* V3D 4.x can't do more than one peripheral access except in a
+ * few cases:
*/
- if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
- b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
- return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
+ if (devinfo->ver <= 42) {
+ /* WRTMUC signal with TMU register write (other than tmuc). */
+ if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
+ }
+ if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ a_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
+ }
+
+ /* TMU read with VPM read/write. */
+ if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
+ (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
+ b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+ return true;
+ }
+ if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
+ (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
+ a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+ return true;
+ }
+
+ return false;
}
- if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE &&
- b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) {
- return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
+ /* V3D 7.x can't have more than one of these restricted peripherals */
+ const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE |
+ V3D_PERIPHERAL_TMU_WRTMUC_SIG |
+ V3D_PERIPHERAL_TSY |
+ V3D_PERIPHERAL_TLB_READ |
+ V3D_PERIPHERAL_SFU |
+ V3D_PERIPHERAL_VPM_READ |
+ V3D_PERIPHERAL_VPM_WRITE;
+
+ const uint32_t a_restricted = a_peripherals & restricted;
+ const uint32_t b_restricted = b_peripherals & restricted;
+ if (a_restricted && b_restricted) {
+ /* WRTMUC signal with TMU register write (other than tmuc) is
+ * allowed though.
+ */
+ if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ b_restricted == V3D_PERIPHERAL_TMU_WRITE &&
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
+ (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ a_restricted == V3D_PERIPHERAL_TMU_WRITE &&
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) {
+ return false;
+ }
}
- /* V3D 4.1+ allows TMU read with VPM read/write. */
- if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
- (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
- b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
- return true;
+ /* Only one TMU read per instruction */
+ if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) &&
+ (b_peripherals & V3D_PERIPHERAL_TMU_READ)) {
+ return false;
}
- if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
- (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
- a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
- return true;
+
+ /* Only one TLB access per instruction */
+ if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
+ V3D_PERIPHERAL_TLB_READ)) &&
+ (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
+ V3D_PERIPHERAL_TLB_READ))) {
+ return false;
}
- return false;
+ return true;
}
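/* A restatement sketch (helper name is ours) of the core V3D 7.x rule
 * above: a merge is rejected whenever each instruction uses some
 * restricted peripheral, with the WRTMUC + non-tmuc TMU register write
 * pairing as the single exception handled in the function.
 */
static bool
restricted_peripheral_conflict(uint32_t a_peripherals,
                               uint32_t b_peripherals,
                               uint32_t restricted)
{
        return (a_peripherals & restricted) && (b_peripherals & restricted);
}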
/* Compute a bitmask of which rf registers are used between
@@ -790,42 +941,67 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
uint64_t raddrs_used = 0;
if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
raddrs_used |= (1ll << a->raddr_a);
- if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
+ if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
raddrs_used |= (1ll << a->raddr_b);
if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
raddrs_used |= (1ll << b->raddr_a);
- if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
+ if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
raddrs_used |= (1ll << b->raddr_b);
return raddrs_used;
}
-/* Take two instructions and attempt to merge their raddr fields
- * into one merged instruction. Returns false if the two instructions
- * access more than two different rf registers between them, or more
- * than one rf register and one small immediate.
+/* Takes two instructions and attempts to merge their raddr fields (including
+ * small immediates) into one merged instruction. For V3D 4.x, it returns
+ * false if the two instructions access more than two different rf registers
+ * between them, or more than one rf register and one small immediate. For
+ * V3D 7.x, it returns false if both instructions use small immediates.
*/
static bool
qpu_merge_raddrs(struct v3d_qpu_instr *result,
const struct v3d_qpu_instr *add_instr,
- const struct v3d_qpu_instr *mul_instr)
+ const struct v3d_qpu_instr *mul_instr,
+ const struct v3d_device_info *devinfo)
{
+ if (devinfo->ver >= 71) {
+ assert(add_instr->sig.small_imm_a +
+ add_instr->sig.small_imm_b <= 1);
+ assert(add_instr->sig.small_imm_c +
+ add_instr->sig.small_imm_d == 0);
+ assert(mul_instr->sig.small_imm_a +
+ mul_instr->sig.small_imm_b == 0);
+ assert(mul_instr->sig.small_imm_c +
+ mul_instr->sig.small_imm_d <= 1);
+
+ result->sig.small_imm_a = add_instr->sig.small_imm_a;
+ result->sig.small_imm_b = add_instr->sig.small_imm_b;
+ result->sig.small_imm_c = mul_instr->sig.small_imm_c;
+ result->sig.small_imm_d = mul_instr->sig.small_imm_d;
+
+ return (result->sig.small_imm_a +
+ result->sig.small_imm_b +
+ result->sig.small_imm_c +
+ result->sig.small_imm_d) <= 1;
+ }
+
+ assert(devinfo->ver <= 42);
+
uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
int naddrs = util_bitcount64(raddrs_used);
if (naddrs > 2)
return false;
- if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
+ if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) {
if (naddrs > 1)
return false;
- if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
+ if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b)
if (add_instr->raddr_b != mul_instr->raddr_b)
return false;
- result->sig.small_imm = true;
- result->raddr_b = add_instr->sig.small_imm ?
+ result->sig.small_imm_b = true;
+ result->raddr_b = add_instr->sig.small_imm_b ?
add_instr->raddr_b : mul_instr->raddr_b;
}
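/* Usage sketch with hypothetical instructions: on V3D 7.x each source
 * slot has its own raddr, so the merge above can only fail when the
 * combined instruction would carry more than one small immediate.
 */
static bool
try_merge_example(const struct v3d_device_info *devinfo,
                  const struct v3d_qpu_instr *add_instr,
                  const struct v3d_qpu_instr *mul_instr)
{
        struct v3d_qpu_instr merged = *add_instr;
        return qpu_merge_raddrs(&merged, add_instr, mul_instr, devinfo);
}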
@@ -836,23 +1012,23 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
raddrs_used &= ~(1ll << raddr_a);
result->raddr_a = raddr_a;
- if (!result->sig.small_imm) {
+ if (!result->sig.small_imm_b) {
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
raddr_a == add_instr->raddr_b) {
- if (add_instr->alu.add.a == V3D_QPU_MUX_B)
- result->alu.add.a = V3D_QPU_MUX_A;
- if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B)
+ result->alu.add.a.mux = V3D_QPU_MUX_A;
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B &&
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
- result->alu.add.b = V3D_QPU_MUX_A;
+ result->alu.add.b.mux = V3D_QPU_MUX_A;
}
}
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
raddr_a == mul_instr->raddr_b) {
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
- result->alu.mul.a = V3D_QPU_MUX_A;
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B)
+ result->alu.mul.a.mux = V3D_QPU_MUX_A;
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B &&
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
- result->alu.mul.b = V3D_QPU_MUX_A;
+ result->alu.mul.b.mux = V3D_QPU_MUX_A;
}
}
}
@@ -863,20 +1039,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
result->raddr_b = raddr_b;
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
raddr_b == add_instr->raddr_a) {
- if (add_instr->alu.add.a == V3D_QPU_MUX_A)
- result->alu.add.a = V3D_QPU_MUX_B;
- if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A)
+ result->alu.add.a.mux = V3D_QPU_MUX_B;
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A &&
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
- result->alu.add.b = V3D_QPU_MUX_B;
+ result->alu.add.b.mux = V3D_QPU_MUX_B;
}
}
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
raddr_b == mul_instr->raddr_a) {
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
- result->alu.mul.a = V3D_QPU_MUX_B;
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A)
+ result->alu.mul.a.mux = V3D_QPU_MUX_B;
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A &&
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
- result->alu.mul.b = V3D_QPU_MUX_B;
+ result->alu.mul.b.mux = V3D_QPU_MUX_B;
}
}
@@ -909,7 +1085,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op)
}
static void
-qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
+qpu_convert_add_to_mul(const struct v3d_device_info *devinfo,
+ struct v3d_qpu_instr *inst)
{
STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
assert(inst->alu.add.op != V3D_QPU_A_NOP);
@@ -927,11 +1104,85 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
inst->flags.auf = V3D_QPU_UF_NONE;
inst->alu.mul.output_pack = inst->alu.add.output_pack;
- inst->alu.mul.a_unpack = inst->alu.add.a_unpack;
- inst->alu.mul.b_unpack = inst->alu.add.b_unpack;
+
+ inst->alu.mul.a.unpack = inst->alu.add.a.unpack;
+ inst->alu.mul.b.unpack = inst->alu.add.b.unpack;
inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
- inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
- inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ if (devinfo->ver >= 71) {
+ assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d);
+ assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1);
+ if (inst->sig.small_imm_a) {
+ inst->sig.small_imm_c = true;
+ inst->sig.small_imm_a = false;
+ } else if (inst->sig.small_imm_b) {
+ inst->sig.small_imm_d = true;
+ inst->sig.small_imm_b = false;
+ }
+ }
+}
+
+static bool
+can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op)
+{
+ switch (op) {
+ case V3D_QPU_M_MOV:
+ case V3D_QPU_M_FMOV:
+ return devinfo->ver >= 71;
+ default:
+ return false;
+ }
+}
+
+static enum v3d_qpu_mul_op
+mul_op_as_add_op(enum v3d_qpu_mul_op op)
+{
+ switch (op) {
+ case V3D_QPU_M_MOV:
+ return V3D_QPU_A_MOV;
+ case V3D_QPU_M_FMOV:
+ return V3D_QPU_A_FMOV;
+ default:
+ unreachable("unexpected mov opcode");
+ }
+}
+
+static void
+qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
+{
+ STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul));
+ assert(inst->alu.mul.op != V3D_QPU_M_NOP);
+ assert(inst->alu.add.op == V3D_QPU_A_NOP);
+
+ memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add));
+ inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op);
+ inst->alu.mul.op = V3D_QPU_M_NOP;
+
+ inst->flags.ac = inst->flags.mc;
+ inst->flags.apf = inst->flags.mpf;
+ inst->flags.auf = inst->flags.muf;
+ inst->flags.mc = V3D_QPU_COND_NONE;
+ inst->flags.mpf = V3D_QPU_PF_NONE;
+ inst->flags.muf = V3D_QPU_UF_NONE;
+
+ inst->alu.add.output_pack = inst->alu.mul.output_pack;
+ inst->alu.add.a.unpack = inst->alu.mul.a.unpack;
+ inst->alu.add.b.unpack = inst->alu.mul.b.unpack;
+ inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+ inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b);
+ assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1);
+ if (inst->sig.small_imm_c) {
+ inst->sig.small_imm_a = true;
+ inst->sig.small_imm_c = false;
+ } else if (inst->sig.small_imm_d) {
+ inst->sig.small_imm_b = true;
+ inst->sig.small_imm_d = false;
+ }
}
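/* Round-trip sketch (illustrative only): converting a MUL-side MOV to
 * the ADD slot and back should preserve the condition flags and keep a
 * small immediate attached to the same logical source (c<->a, d<->b).
 */
static void
convert_round_trip(const struct v3d_device_info *devinfo,
                   struct v3d_qpu_instr *inst)
{
        qpu_convert_mul_to_add(inst);           /* MUL MOV -> ADD MOV */
        qpu_convert_add_to_mul(devinfo, inst);  /* and back again */
}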
static bool
@@ -970,20 +1221,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(b->alu.add.op)) {
mul_inst = *b;
- qpu_convert_add_to_mul(&mul_inst);
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
merge.alu.mul = mul_inst.alu.mul;
- merge.flags.mc = b->flags.ac;
- merge.flags.mpf = b->flags.apf;
- merge.flags.muf = b->flags.auf;
+ merge.flags.mc = mul_inst.flags.mc;
+ merge.flags.mpf = mul_inst.flags.mpf;
+ merge.flags.muf = mul_inst.flags.muf;
add_instr = a;
mul_instr = &mul_inst;
} else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(a->alu.add.op)) {
mul_inst = *a;
- qpu_convert_add_to_mul(&mul_inst);
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
merge = mul_inst;
merge.alu.add = b->alu.add;
@@ -999,22 +1250,62 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
}
}
+ struct v3d_qpu_instr add_inst;
if (b->alu.mul.op != V3D_QPU_M_NOP) {
- if (a->alu.mul.op != V3D_QPU_M_NOP)
- return false;
- merge.alu.mul = b->alu.mul;
+ if (a->alu.mul.op == V3D_QPU_M_NOP) {
+ merge.alu.mul = b->alu.mul;
+
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
+
+ mul_instr = b;
+ add_instr = a;
+ }
+ /* If a's mul op is used but its add op is not, then see if we
+ * can convert either a's mul op or b's mul op to an add op
+ * so we can merge.
+ */
+ else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, b->alu.mul.op)) {
+ add_inst = *b;
+ qpu_convert_mul_to_add(&add_inst);
- merge.flags.mc = b->flags.mc;
- merge.flags.mpf = b->flags.mpf;
- merge.flags.muf = b->flags.muf;
+ merge.alu.add = add_inst.alu.add;
- mul_instr = b;
- add_instr = a;
+ merge.flags.ac = add_inst.flags.ac;
+ merge.flags.apf = add_inst.flags.apf;
+ merge.flags.auf = add_inst.flags.auf;
+
+ mul_instr = a;
+ add_instr = &add_inst;
+ } else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, a->alu.mul.op)) {
+ add_inst = *a;
+ qpu_convert_mul_to_add(&add_inst);
+
+ merge = add_inst;
+ merge.alu.mul = b->alu.mul;
+
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
+
+ mul_instr = b;
+ add_instr = &add_inst;
+ } else {
+ return false;
+ }
}
+ /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
+ * they have restrictions on the number of raddrs that can be addressed
+ * in a single instruction. In V3D 7.x, we don't have that restriction,
+ * but we are still limited to a single small immediate per instruction.
+ */
if (add_instr && mul_instr &&
- !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
- return false;
+ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
+ return false;
}
merge.sig.thrsw |= b->sig.thrsw;
@@ -1025,7 +1316,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
merge.sig.ldtmu |= b->sig.ldtmu;
merge.sig.ldvary |= b->sig.ldvary;
merge.sig.ldvpm |= b->sig.ldvpm;
- merge.sig.small_imm |= b->sig.small_imm;
merge.sig.ldtlb |= b->sig.ldtlb;
merge.sig.ldtlbu |= b->sig.ldtlbu;
merge.sig.ucb |= b->sig.ucb;
@@ -1108,7 +1398,7 @@ retry:
* regfile A or B that was written to by the previous
* instruction."
*/
- if (reads_too_soon_after_write(scoreboard, n->inst))
+ if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
continue;
if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
@@ -1122,10 +1412,11 @@ retry:
if (pixel_scoreboard_too_soon(c, scoreboard, inst))
continue;
- /* ldunif and ldvary both write r5, but ldunif does so a tick
- * sooner. If the ldvary's r5 wasn't used, then ldunif might
+ /* ldunif and ldvary both write the same register (r5 for v42
+ * and below, rf0 for v71), but ldunif does so a tick sooner.
+ * If the ldvary's register wasn't used, then ldunif might
* otherwise get scheduled so ldunif and ldvary try to update
- * r5 in the same tick.
+ * the register in the same tick.
*/
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
@@ -1204,11 +1495,20 @@ retry:
* ldvary now if the follow-up fixup would place
* it in the delay slots of a thrsw, which is not
* allowed and would prevent the fixup from being
- * successful.
+ * successful. In V3D 7.x we can allow this to happen
+ * as long as it is not the last delay slot.
*/
- if (inst->sig.ldvary &&
- scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
- continue;
+ if (inst->sig.ldvary) {
+ if (c->devinfo->ver <= 42 &&
+ scoreboard->last_thrsw_tick + 2 >=
+ scoreboard->tick - 1) {
+ continue;
+ }
+ if (c->devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 ==
+ scoreboard->tick - 1) {
+ continue;
+ }
}
/* We can emit a new tmu lookup with a previous ldtmu
@@ -1243,7 +1543,7 @@ retry:
int prio = get_instruction_priority(c->devinfo, inst);
- if (mux_read_stalls(scoreboard, inst)) {
+ if (read_stalls(c->devinfo, scoreboard, inst)) {
/* Don't merge an instruction that stalls */
if (prev_inst)
continue;
@@ -1340,6 +1640,62 @@ update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
}
}
+static void
+set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst,
+ const struct v3d_device_info *devinfo)
+{
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick &&
+ v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
+ !inst->sig_magic) {
+ scoreboard->has_rf0_flops_conflict = true;
+ }
+}
+
+static void
+update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst,
+ const struct v3d_device_info *devinfo)
+{
+ if (devinfo->ver < 71)
+ return;
+
+ /* Thread switch restrictions:
+ *
+ * At the point of a thread switch or thread end (when the actual
+ * thread switch or thread end happens, not when the signalling
+ * instruction is processed):
+ *
+ * - If the most recent write to rf0 was from a ldunif, ldunifa, or
+ * ldvary instruction in which another signal also wrote to the
+ * register file, and the final instruction of the thread section
+ * contained a signal which wrote to the register file, then the
+ * value of rf0 is undefined at the start of the new section
+ *
+ * Here we use the scoreboard to track if our last rf0 implicit write
+ * happens at the same time that another signal writes the register
+ * file (has_rf0_flops_conflict). We will use that information when
+ * scheduling thrsw instructions to avoid putting anything in their
+ * last delay slot which has a signal that writes to the register file.
+ */
+
+ /* Reset tracking if we have an explicit rf0 write or we are starting
+ * a new thread section.
+ */
+ if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
+ scoreboard->tick - scoreboard->last_thrsw_tick == 3) {
+ scoreboard->last_implicit_rf0_write_tick = -10;
+ scoreboard->has_rf0_flops_conflict = false;
+ }
+
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) {
+ scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ?
+ scoreboard->tick + 1 : scoreboard->tick;
+ }
+
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+}
+
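/* A small sketch of the timing assumption encoded above: ldvary performs
 * its implicit rf0 write one tick after the instruction that carries the
 * signal, while ldunif/ldunifa write rf0 in the same tick.
 */
static int
implicit_rf0_write_tick(const struct v3d_qpu_instr *inst, int tick)
{
        return inst->sig.ldvary ? tick + 1 : tick;
}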
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
const struct qinst *qinst,
@@ -1383,6 +1739,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
if (inst->sig.ldvary)
scoreboard->last_ldvary_tick = scoreboard->tick;
+ update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo);
+
update_scoreboard_tmu_tracking(scoreboard, qinst);
}
@@ -1580,7 +1938,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
if (slot > 0 && qinst->uniform != ~0)
return false;
- if (v3d_qpu_waits_vpm(inst))
+ if (c->devinfo->ver <= 42 && v3d_qpu_waits_vpm(inst))
return false;
if (inst->sig.ldvary)
@@ -1588,35 +1946,67 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
/* GFXH-1625: TMUWT not allowed in the final instruction. */
- if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
+ if (c->devinfo->ver <= 42 && slot == 2 &&
+ inst->alu.add.op == V3D_QPU_A_TMUWT) {
return false;
+ }
- /* No writing physical registers at the end. */
- bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
- bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
- if ((!add_is_nop && !inst->alu.add.magic_write) ||
- (!mul_is_nop && !inst->alu.mul.magic_write)) {
- return false;
+ if (c->devinfo->ver <= 42) {
+ /* No writing physical registers at the end. */
+ bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
+ bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
+ if ((!add_is_nop && !inst->alu.add.magic_write) ||
+ (!mul_is_nop && !inst->alu.mul.magic_write)) {
+ return false;
+ }
+
+ if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
+ !inst->sig_magic) {
+ return false;
+ }
}
- if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
- !inst->sig_magic) {
- return false;
+ if (c->devinfo->ver >= 71) {
+ /* The thread end instruction must not write to the
+ * register file via the add/mul ALUs.
+ */
+ if (slot == 0 &&
+ (!inst->alu.add.magic_write ||
+ !inst->alu.mul.magic_write)) {
+ return false;
+ }
}
if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
return false;
- /* RF0-2 might be overwritten during the delay slots by
- * fragment shader setup.
- */
- if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
- return false;
+ if (c->devinfo->ver <= 42) {
+ /* RF0-2 might be overwritten during the delay slots by
+ * fragment shader setup.
+ */
+ if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
+ return false;
- if (inst->raddr_b < 3 &&
- !inst->sig.small_imm &&
- v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
- return false;
+ if (inst->raddr_b < 3 &&
+ !inst->sig.small_imm_b &&
+ v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
+ return false;
+ }
+ }
+
+ if (c->devinfo->ver >= 71) {
+ /* RF2-3 might be overwritten during the delay slots by
+ * fragment shader setup.
+ */
+ if (v3d71_qpu_reads_raddr(inst, 2) ||
+ v3d71_qpu_reads_raddr(inst, 3)) {
+ return false;
+ }
+
+ if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) ||
+ v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) {
+ return false;
+ }
}
}
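/* Consolidation sketch (helper name is ours) of the V3D 7.x checks
 * above: rf2/rf3 may be overwritten by fragment shader setup during the
 * thread-end delay slots, so no slot may read or explicitly write them.
 */
static bool
thrend_touches_rf2_rf3(const struct v3d_device_info *devinfo,
                       const struct v3d_qpu_instr *inst)
{
        return v3d71_qpu_reads_raddr(inst, 2) ||
               v3d71_qpu_reads_raddr(inst, 3) ||
               v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 2) ||
               v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 3);
}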
@@ -1632,6 +2022,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
*/
static bool
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
+ struct choose_scoreboard *scoreboard,
const struct qinst *qinst,
uint32_t slot)
{
@@ -1642,8 +2033,12 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
if (slot > 0 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu))
return false;
- if (slot > 0 && qinst->qpu.sig.ldvary)
- return false;
+ if (qinst->qpu.sig.ldvary) {
+ if (c->devinfo->ver <= 42 && slot > 0)
+ return false;
+ if (c->devinfo->ver >= 71 && slot == 2)
+ return false;
+ }
/* unifa and the following 3 instructions can't overlap a
* thread switch/end. The docs further clarify that this means
@@ -1662,6 +2057,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
return false;
+ /* See comment when we set has_rf0_flops_conflict for details */
+ if (c->devinfo->ver >= 71 &&
+ slot == 2 &&
+ v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) &&
+ !qinst->qpu.sig_magic) {
+ if (scoreboard->has_rf0_flops_conflict)
+ return false;
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick)
+ return false;
+ }
+
return true;
}
@@ -1694,7 +2100,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
* also apply to instructions scheduled after the thrsw that we want
* to place in its delay slots.
*/
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot))
return false;
/* TLB access is disallowed until scoreboard wait is executed, which
@@ -1767,8 +2173,10 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard
bool is_thrend)
{
for (int slot = 0; slot < instructions_in_sequence; slot++) {
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard,
+ qinst, slot)) {
return false;
+ }
if (is_thrend &&
!qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
@@ -1969,10 +2377,11 @@ emit_branch(struct v3d_compile *c,
assert(scoreboard->last_branch_tick + 3 < branch_tick);
assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
- /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
+ /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after
* setmsf.
*/
bool is_safe_msf_branch =
+ c->devinfo->ver >= 71 ||
inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
@@ -2056,46 +2465,72 @@ emit_branch(struct v3d_compile *c,
}
static bool
-alu_reads_register(struct v3d_qpu_instr *inst,
+alu_reads_register(const struct v3d_device_info *devinfo,
+ struct v3d_qpu_instr *inst,
bool add, bool magic, uint32_t index)
{
uint32_t num_src;
- enum v3d_qpu_mux mux_a, mux_b;
-
- if (add) {
+ if (add)
num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
- mux_a = inst->alu.add.a;
- mux_b = inst->alu.add.b;
- } else {
+ else
num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
- mux_a = inst->alu.mul.a;
- mux_b = inst->alu.mul.b;
- }
- for (int i = 0; i < num_src; i++) {
- if (magic) {
- if (i == 0 && mux_a == index)
- return true;
- if (i == 1 && mux_b == index)
- return true;
+ if (devinfo->ver <= 42) {
+ enum v3d_qpu_mux mux_a, mux_b;
+ if (add) {
+ mux_a = inst->alu.add.a.mux;
+ mux_b = inst->alu.add.b.mux;
} else {
- if (i == 0 && mux_a == V3D_QPU_MUX_A &&
- inst->raddr_a == index) {
- return true;
- }
- if (i == 0 && mux_a == V3D_QPU_MUX_B &&
- inst->raddr_b == index) {
- return true;
- }
- if (i == 1 && mux_b == V3D_QPU_MUX_A &&
- inst->raddr_a == index) {
- return true;
- }
- if (i == 1 && mux_b == V3D_QPU_MUX_B &&
- inst->raddr_b == index) {
- return true;
+ mux_a = inst->alu.mul.a.mux;
+ mux_b = inst->alu.mul.b.mux;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (magic) {
+ if (i == 0 && mux_a == index)
+ return true;
+ if (i == 1 && mux_b == index)
+ return true;
+ } else {
+ if (i == 0 && mux_a == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 0 && mux_a == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
}
}
+
+ return false;
+ }
+
+ assert(devinfo->ver >= 71);
+ assert(!magic);
+
+ uint32_t raddr_a, raddr_b;
+ if (add) {
+ raddr_a = inst->alu.add.a.raddr;
+ raddr_b = inst->alu.add.b.raddr;
+ } else {
+ raddr_a = inst->alu.mul.a.raddr;
+ raddr_b = inst->alu.mul.b.raddr;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (i == 0 && raddr_a == index)
+ return true;
+ if (i == 1 && raddr_b == index)
+ return true;
}
return false;
@@ -2130,6 +2565,8 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
struct qblock *block,
struct v3d_qpu_instr *inst)
{
+ const struct v3d_device_info *devinfo = c->devinfo;
+
/* We only call this if we have successfully merged an ldvary into a
* previous instruction.
*/
@@ -2142,9 +2579,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
* the ldvary destination, if it does, then moving the ldvary before
* it would overwrite it.
*/
- if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
+ if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
return false;
- if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
+ if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
return false;
/* The implicit ldvary destination may not be written to by a signal
@@ -2180,13 +2617,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
}
/* The previous instruction cannot have a conflicting signal */
- if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
+ if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
return false;
uint32_t sig;
struct v3d_qpu_sig new_sig = prev->qpu.sig;
new_sig.ldvary = true;
- if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig))
+ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
return false;
/* The previous instruction cannot use flags since ldvary uses the
@@ -2199,9 +2636,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
/* We can't put an ldvary in the delay slots of a thrsw. We should've
* prevented this when pairing up the ldvary with another instruction
- * and flagging it for a fixup.
+ * and flagging it for a fixup. In V3D 7.x this is limited only to the
+ * second delay slot.
*/
- assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
+ assert((devinfo->ver <= 42 &&
+ scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
+ (devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
/* Move the ldvary to the previous instruction and remove it from the
* current one.
@@ -2215,14 +2656,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
inst->sig_magic = false;
inst->sig_addr = 0;
- /* By moving ldvary to the previous instruction we make it update
- * r5 in the current one, so nothing else in it should write r5.
- * This should've been prevented by our dependency tracking, which
+ /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
+ if (devinfo->ver >= 71) {
+ scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+ }
+
+ /* By moving ldvary to the previous instruction we make it update r5
+ * (rf0 for ver >= 71) in the current one, so nothing else in it
+ * should write this register.
+ *
+ * This should've been prevented by our dependency tracking, which
* would not allow ldvary to be paired up with an instruction that
- * writes r5 (since our dependency tracking doesn't know that the
- * ldvary write r5 happens in the next instruction).
+ * writes r5/rf0 (since our dependency tracking doesn't know that the
+ * ldvary write to r5/rf0 happens in the next instruction).
*/
- assert(!v3d_qpu_writes_r5(c->devinfo, inst));
+ assert(!v3d_qpu_writes_r5(devinfo, inst));
+ assert(devinfo->ver <= 42 ||
+ (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+ !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));
return true;
}
@@ -2313,7 +2765,7 @@ schedule_instructions(struct v3d_compile *c,
}
}
}
- if (mux_read_stalls(scoreboard, inst))
+ if (read_stalls(c->devinfo, scoreboard, inst))
c->qpu_inst_stalled_count++;
}
@@ -2538,6 +2990,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
scoreboard.last_setmsf_tick = -10;
scoreboard.last_stallable_sfu_tick = -10;
scoreboard.first_ldtmu_after_thrsw = true;
+ scoreboard.last_implicit_rf0_write_tick = -10;
if (debug) {
fprintf(stderr, "Pre-schedule instructions\n");
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
index 2cc7a0eb0ae..0466ee5d0b6 100644
--- a/src/broadcom/compiler/qpu_validate.c
+++ b/src/broadcom/compiler/qpu_validate.c
@@ -41,6 +41,7 @@ struct v3d_qpu_validate_state {
int last_sfu_write;
int last_branch_ip;
int last_thrsw_ip;
+ int first_tlb_z_write;
/* Set when we've found the last-THRSW signal, or if we were started
* in single-segment mode.
@@ -110,11 +111,58 @@ static void
qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
{
const struct v3d_device_info *devinfo = state->c->devinfo;
+
+ if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write)
+ state->first_tlb_z_write = state->ip;
+
const struct v3d_qpu_instr *inst = &qinst->qpu;
+ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
+ state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write &&
+ inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
+ fail_instr(state, "Implicit branch MSF read after TLB Z write");
+ }
+
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return;
+ if (inst->alu.add.op == V3D_QPU_A_SETMSF &&
+ state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write) {
+ fail_instr(state, "SETMSF after TLB Z write");
+ }
+
+ if (state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write &&
+ inst->alu.add.op == V3D_QPU_A_MSF) {
+ fail_instr(state, "MSF read after TLB Z write");
+ }
+
+ if (devinfo->ver < 71) {
+ if (inst->sig.small_imm_a || inst->sig.small_imm_c ||
+ inst->sig.small_imm_d) {
+ fail_instr(state, "small imm a/c/d added after V3D 7.1");
+ }
+ } else {
+ if ((inst->sig.small_imm_a || inst->sig.small_imm_b) &&
+ !vir_is_add(qinst)) {
+ fail_instr(state, "small imm a/b used but no ADD inst");
+ }
+ if ((inst->sig.small_imm_c || inst->sig.small_imm_d) &&
+ !vir_is_mul(qinst)) {
+ fail_instr(state, "small imm c/d used but no MUL inst");
+ }
+ if (inst->sig.small_imm_a + inst->sig.small_imm_b +
+ inst->sig.small_imm_c + inst->sig.small_imm_d > 1) {
+ fail_instr(state, "only one small immediate can be "
+ "enabled per instruction");
+ }
+ }
+
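/* Equivalent helper sketch for the sum used above: on V3D 7.1 at most
 * one of the four per-slot small-immediate signals may be set.
 */
static inline int
num_small_immediates(const struct v3d_qpu_sig *sig)
{
        return sig->small_imm_a + sig->small_imm_b +
               sig->small_imm_c + sig->small_imm_d;
}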
/* LDVARY writes r5 two instructions later and LDUNIF writes
* r5 one instruction later, which is illegal to have
* together.
@@ -194,8 +242,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
"SFU write started during THRSW delay slots ");
}
- if (inst->sig.ldvary)
- fail_instr(state, "LDVARY during THRSW delay slots");
+ if (inst->sig.ldvary) {
+ if (devinfo->ver <= 42)
+ fail_instr(state, "LDVARY during THRSW delay slots");
+ if (devinfo->ver >= 71 &&
+ state->ip - state->last_thrsw_ip == 2) {
+ fail_instr(state, "LDVARY in 2nd THRSW delay slot");
+ }
+ }
}
(void)qpu_magic_waddr_matches; /* XXX */
@@ -222,7 +276,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
vpm_writes +
tlb_writes +
tsy_writes +
- inst->sig.ldtmu +
+ (devinfo->ver <= 42 ? inst->sig.ldtmu : 0) +
inst->sig.ldtlb +
inst->sig.ldvpm +
inst->sig.ldtlbu > 1) {
@@ -262,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if ((inst->alu.add.op != V3D_QPU_A_NOP &&
!inst->alu.add.magic_write)) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver <= 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71) {
+ if (state->last_thrsw_ip - state->ip == 0) {
+ fail_instr(state,
+ "ADD RF write at THREND");
+ }
+ if (inst->alu.add.waddr == 2 ||
+ inst->alu.add.waddr == 3) {
+ fail_instr(state,
+ "RF2-3 write after THREND");
+ }
+ }
}
if ((inst->alu.mul.op != V3D_QPU_M_NOP &&
!inst->alu.mul.magic_write)) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver <= 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71) {
+ if (state->last_thrsw_ip - state->ip == 0) {
+ fail_instr(state,
+ "MUL RF write at THREND");
+ }
+
+ if (inst->alu.mul.waddr == 2 ||
+ inst->alu.mul.waddr == 3) {
+ fail_instr(state,
+ "RF2-3 write after THREND");
+ }
+ }
}
if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
!inst->sig_magic) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver <= 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71 &&
+ (inst->sig_addr == 2 ||
+ inst->sig_addr == 3)) {
+ fail_instr(state, "RF2-3 write after THREND");
+ }
}
/* GFXH-1625: No TMUWT in the last instruction */
@@ -321,6 +406,7 @@ qpu_validate(struct v3d_compile *c)
.last_sfu_write = -10,
.last_thrsw_ip = -10,
.last_branch_ip = -10,
+ .first_tlb_z_write = INT_MAX,
.ip = 0,
.last_thrsw_found = !c->last_thrsw,
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 9f4129870e1..b437b5f5168 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -613,6 +613,11 @@ struct v3d_ra_node_info {
struct {
uint32_t priority;
uint8_t class_bits;
+ bool is_program_end;
+ bool unused;
+
+ /* V3D 7.x */
+ bool is_ldunif_dst;
} *info;
uint32_t alloc_count;
};
@@ -1150,8 +1155,8 @@ bool vir_is_raw_mov(struct qinst *inst);
bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst);
bool vir_is_add(struct qinst *inst);
bool vir_is_mul(struct qinst *inst);
-bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
-bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
+bool vir_writes_r3_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst);
+bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst);
struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
uint8_t vir_channels_written(struct qinst *inst);
struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
@@ -1184,7 +1189,7 @@ bool v3d_nir_lower_line_smooth(nir_shader *shader);
bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
bool v3d_nir_lower_scratch(nir_shader *s);
bool v3d_nir_lower_txf_ms(nir_shader *s);
-bool v3d_nir_lower_image_load_store(nir_shader *s);
+bool v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c);
bool v3d_nir_lower_load_store_bitsize(nir_shader *s);
void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components);
@@ -1425,6 +1430,20 @@ VIR_SFU(LOG)
VIR_SFU(SIN)
VIR_SFU(RSQRT2)
+VIR_A_ALU2(VPACK)
+VIR_A_ALU2(V8PACK)
+VIR_A_ALU2(V10PACK)
+VIR_A_ALU2(V11FPACK)
+
+VIR_M_ALU1(FTOUNORM16)
+VIR_M_ALU1(FTOSNORM16)
+
+VIR_M_ALU1(VFTOUNORM8)
+VIR_M_ALU1(VFTOSNORM8)
+
+VIR_M_ALU1(VFTOUNORM10LO)
+VIR_M_ALU1(VFTOUNORM10HI)
+
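/* Illustrative dispatch (the wrapper itself is hypothetical; the helpers
 * are the ones declared above) showing how ntq_emit_alu reaches the new
 * V3D 7.x pack operations from their NIR opcodes.
 */
static inline struct qreg
vir_emit_v71_pack(struct v3d_compile *c, nir_op op,
                  struct qreg a, struct qreg b)
{
        switch (op) {
        case nir_op_vpack_v3d:    return vir_VPACK(c, a, b);
        case nir_op_v8pack_v3d:   return vir_V8PACK(c, a, b);
        case nir_op_v10pack_v3d:  return vir_V10PACK(c, a, b);
        case nir_op_v11fpack_v3d: return vir_V11FPACK(c, a, b);
        default: unreachable("not a v71 pack opcode");
        }
}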
static inline struct qinst *
vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
struct qreg dest, struct qreg src)
diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
index 2900a29817f..bbb55be4a14 100644
--- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
+++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
@@ -40,6 +40,10 @@
* calculations and load/store using the TMU general memory access path.
*/
+static const unsigned bits_8[4] = {8, 8, 8, 8};
+static const unsigned bits_16[4] = {16, 16, 16, 16};
+static const unsigned bits_1010102[4] = {10, 10, 10, 2};
+
bool
v3d_gl_format_is_return_32(enum pipe_format format)
{
@@ -59,6 +63,8 @@ v3d_gl_format_is_return_32(enum pipe_format format)
/* Packs a 32-bit vector of colors in the range [0, (1 << bits[i]) - 1] to a
* 32-bit SSA value, with as many channels as necessary to store all the bits
+ *
+ * This is the generic helper, using only common nir operations.
*/
static nir_ssa_def *
pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
@@ -91,8 +97,185 @@ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
return nir_vec(b, results, DIV_ROUND_UP(offset, 32));
}
+/* Utility wrapper, as half_2x16_split is mapped to vfpack and sometimes it
+ * is just easier to read vfpack in the code, especially while using the PRM
+ * as a reference.
+ */
+static nir_ssa_def *
+nir_vfpack(nir_builder *b, nir_ssa_def *p1, nir_ssa_def *p2)
+{
+ return nir_pack_half_2x16_split(b, p1, p2);
+}
+
+static inline nir_ssa_def *
+pack_11f11f10f(nir_builder *b, nir_ssa_def *color)
+{
+ nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ /* FIXME: we noted that we could just use p2 again as the second
+ * element to pack, and the CTS tests still pass. Using undef is
+ * just slightly more correct.
+ */
+ nir_ssa_def *undef = nir_ssa_undef(b, 1, color->bit_size);
+ nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef);
+
+ return nir_v11fpack_v3d(b, p1, p2);
+}
+
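/* Layout reference for the helper above, following the standard
 * PIPE_FORMAT_R11G11B10_FLOAT packing: R occupies bits 0-10, G bits
 * 11-21 and B bits 22-31, which is what v11fpack assembles from the two
 * half-float pairs built with vfpack.
 */
static inline uint32_t
r11g11b10_layout(uint32_t r11f, uint32_t g11f, uint32_t b10f)
{
        return (b10f << 22) | (g11f << 11) | r11f;
}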
+static inline nir_ssa_def *
+pack_r10g10b10a2_uint(nir_builder *b, nir_ssa_def *color)
+{
+ nir_ssa_def *p1 = nir_vpack_v3d(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ nir_ssa_def *p2 = nir_vpack_v3d(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+
+ return nir_v10pack_v3d(b, p1, p2);
+}
+
+static inline nir_ssa_def *
+pack_r10g10b10a2_unorm(nir_builder *b, nir_ssa_def *color)
+{
+ nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ p1 = nir_vftounorm10lo_v3d(b, p1);
+
+ nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ p2 = nir_vftounorm10hi_v3d(b, p2);
+
+ return nir_v10pack_v3d(b, p1, p2);
+}
+
+enum hw_conversion {
+ NONE,
+ TO_SNORM,
+ TO_UNORM
+};
+
+static inline nir_ssa_def *
+pack_8bit(nir_builder *b, nir_ssa_def *color,
+ unsigned num_components,
+ enum hw_conversion conversion)
+{
+ /* Note that usually you should not use this method (that relies on
+ * custom packing) for 1 component if we are not doing any
+ * conversion. But we also support that case, and let the caller
+ * decide which method to use.
+ */
+ nir_ssa_def *p1;
+ nir_ssa_def *p2;
+
+ if (conversion == NONE) {
+ p1 = nir_vpack_v3d(b, nir_channel(b, color, 0),
+ nir_channel(b, color, num_components == 1 ? 0 : 1));
+ } else {
+ p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, num_components == 1 ? 0 : 1));
+ p1 = (conversion == TO_UNORM) ?
+ nir_vftounorm8_v3d(b, p1) : nir_vftosnorm8_v3d(b, p1);
+ }
+ if (num_components == 4) {
+ if (conversion == NONE) {
+ p2 = nir_vpack_v3d(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ } else {
+ p2 = nir_vfpack(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ p2 = (conversion == TO_UNORM) ?
+ nir_vftounorm8_v3d(b, p2) : nir_vftosnorm8_v3d(b, p2);
+ }
+ } else {
+ /* As mentioned in the comment above, using an undef here
+ * would be more correct. But in this case we get worse
+ * values, and even a slightly worse instruction count in
+ * some CTS tests, so we just reuse the first packing.
+ */
+ p2 = p1;
+ }
+
+ return nir_v8pack_v3d(b, p1, p2);
+}
+
+static inline nir_ssa_def *
+pack_16bit(nir_builder *b, nir_ssa_def *color,
+ unsigned num_components,
+ enum hw_conversion conversion)
+{
+ nir_ssa_def *results[2];
+ nir_ssa_def *channels[4];
+
+ /* Note that usually you should not use this method (that relies on
+ * custom packing) if we are not doing any conversion. But we also
+ * support that case, and let the caller decide which method to use.
+ */
+
+ for (unsigned i = 0; i < num_components; i++) {
+ channels[i] = nir_channel(b, color, i);
+ switch (conversion) {
+ case TO_SNORM:
+ channels[i] = nir_ftosnorm16_v3d(b, channels[i]);
+ break;
+ case TO_UNORM:
+ channels[i] = nir_ftounorm16_v3d(b, channels[i]);
+ break;
+ default:
+ break;
+ }
+ }
+
+ switch (num_components) {
+ case 1:
+ results[0] = channels[0];
+ break;
+ case 4:
+ results[1] = nir_vpack_v3d(b, channels[2], channels[3]);
+ FALLTHROUGH;
+ case 2:
+ results[0] = nir_vpack_v3d(b, channels[0], channels[1]);
+ break;
+ }
+
+ return nir_vec(b, results, DIV_ROUND_UP(num_components, 2));
+}
+
+static inline nir_ssa_def *
+pack_xbit(nir_builder *b, nir_ssa_def *color,
+ unsigned num_components,
+ const struct util_format_channel_description *r_chan)
+{
+ bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED);
+ enum hw_conversion conversion = NONE;
+ if (r_chan->normalized) {
+ conversion =
+ (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM;
+ }
+
+ switch (r_chan->size) {
+ case 8:
+ if (conversion == NONE && num_components < 2)
+ return pack_bits(b, color, bits_8, num_components, pack_mask);
+ else
+ return pack_8bit(b, color, num_components, conversion);
+ break;
+ case 16:
+ /* pack_mask implies that the generic packing method would
+ * need to include extra operations to handle negative values,
+ * so in that case, even without a conversion, it is better to
+ * use the custom hw packing operations.
+ */
+ if (conversion == NONE && !pack_mask)
+ return pack_bits(b, color, bits_16, num_components, pack_mask);
+ else
+ return pack_16bit(b, color, num_components, conversion);
+ break;
+ default:
+ unreachable("unrecognized bits");
+ }
+}
+
static bool
-v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
+v3d_nir_lower_image_store_v42(nir_builder *b, nir_intrinsic_instr *instr)
{
enum pipe_format format = nir_intrinsic_format(instr);
assert(format != PIPE_FORMAT_NONE);
@@ -118,9 +301,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
*/
formatted = color;
} else {
- static const unsigned bits_8[4] = {8, 8, 8, 8};
- static const unsigned bits_16[4] = {16, 16, 16, 16};
- static const unsigned bits_1010102[4] = {10, 10, 10, 2};
const unsigned *bits;
switch (r_chan->size) {
@@ -171,6 +351,52 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
return true;
}
+
+static bool
+v3d_nir_lower_image_store_v71(nir_builder *b, nir_intrinsic_instr *instr)
+{
+ enum pipe_format format = nir_intrinsic_format(instr);
+ assert(format != PIPE_FORMAT_NONE);
+ const struct util_format_description *desc =
+ util_format_description(format);
+ const struct util_format_channel_description *r_chan = &desc->channel[0];
+ unsigned num_components = util_format_get_nr_components(format);
+ b->cursor = nir_before_instr(&instr->instr);
+
+ nir_ssa_def *color = nir_channels(b,
+ nir_ssa_for_src(b, instr->src[3], 4),
+ (1 << num_components) - 1);
+ nir_ssa_def *formatted = NULL;
+ if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+ formatted = nir_format_pack_r9g9b9e5(b, color);
+ } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
+ formatted = pack_11f11f10f(b, color);
+ } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) {
+ formatted = pack_r10g10b10a2_uint(b, color);
+ } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) {
+ formatted = pack_r10g10b10a2_unorm(b, color);
+ } else if (r_chan->size == 32) {
+ /* For 32-bit formats, we just have to move the vector
+ * across (possibly reducing the number of channels).
+ */
+ formatted = color;
+ } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) {
+ assert(r_chan->size == 16);
+ formatted = nir_format_float_to_half(b, color);
+ formatted = pack_bits(b, formatted, bits_16, num_components,
+ false);
+ } else {
+ assert(r_chan->size == 8 || r_chan->size == 16);
+ formatted = pack_xbit(b, color, num_components, r_chan);
+ }
+
+ nir_instr_rewrite_src(&instr->instr, &instr->src[3],
+ nir_src_for_ssa(formatted));
+ instr->num_components = formatted->num_components;
+
+ return true;
+}
+
static bool
v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr)
{
@@ -215,11 +441,17 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b,
nir_intrinsic_instr *intr =
nir_instr_as_intrinsic(instr);
+ struct v3d_compile *c = (struct v3d_compile *) _state;
+
switch (intr->intrinsic) {
case nir_intrinsic_image_load:
return v3d_nir_lower_image_load(b, intr);
case nir_intrinsic_image_store:
- return v3d_nir_lower_image_store(b, intr);
+ if (c->devinfo->ver >= 71)
+ return v3d_nir_lower_image_store_v71(b, intr);
+ else
+ return v3d_nir_lower_image_store_v42(b, intr);
+ break;
default:
return false;
}
@@ -228,9 +460,9 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b,
}
bool
-v3d_nir_lower_image_load_store(nir_shader *s)
+v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c)
{
return nir_shader_instructions_pass(s, v3d_nir_lower_image_load_store_cb,
nir_metadata_block_index |
- nir_metadata_dominance, NULL);
+ nir_metadata_dominance, c);
}
diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c
index 69929a145aa..8a314c8b5a9 100644
--- a/src/broadcom/compiler/v3d_nir_lower_io.c
+++ b/src/broadcom/compiler/v3d_nir_lower_io.c
@@ -600,9 +600,13 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
* The correct fix for this as recommended by Broadcom
* is to convert to .8 fixed-point with ffloor().
*/
- pos = nir_f2i32(b, nir_ffloor(b, pos));
- v3d_nir_store_output(b, state->vp_vpm_offset + i,
- offset_reg, pos);
+ if (c->devinfo->ver <= 42)
+ pos = nir_f2i32(b, nir_ffloor(b, pos));
+ else
+ pos = nir_f2i32(b, nir_fround_even(b, pos));
+
+ v3d_nir_store_output(b, state->vp_vpm_offset + i,
+ offset_reg, pos);
}
}
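Both branches above convert the coordinate to .8 fixed point; the only difference is the rounding step (ffloor up to 4.2, round-to-nearest-even from 7.1). A standalone sketch with hypothetical helper names, assuming pos has already been scaled by 256:

#include <math.h>
#include <stdint.h>

/* Only the final rounding differs between the two paths. */
static int32_t
fixed8_v42(float pos)
{
        return (int32_t)floorf(pos);        /* ffloor, per Broadcom */
}

static int32_t
fixed8_v71(float pos)
{
        /* nearbyintf rounds to nearest even in the default FP
         * environment, matching nir_fround_even.
         */
        return (int32_t)nearbyintf(pos);
}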
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 499215454c0..192872f368c 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -113,10 +113,10 @@ vir_is_raw_mov(struct qinst *inst)
return false;
}
- if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
+ if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) {
return false;
}
@@ -156,8 +156,12 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
}
bool
-vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
+vir_writes_r3_implicitly(const struct v3d_device_info *devinfo,
+ struct qinst *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
for (int i = 0; i < vir_get_nsrc(inst); i++) {
switch (inst->src[i].file) {
case QFILE_VPM:
@@ -178,8 +182,12 @@ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
}
bool
-vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
+vir_writes_r4_implicitly(const struct v3d_device_info *devinfo,
+ struct qinst *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
switch (inst->dst.file) {
case QFILE_MAGIC:
switch (inst->dst.index) {
@@ -209,15 +217,15 @@ vir_set_unpack(struct qinst *inst, int src,
if (vir_is_add(inst)) {
if (src == 0)
- inst->qpu.alu.add.a_unpack = unpack;
+ inst->qpu.alu.add.a.unpack = unpack;
else
- inst->qpu.alu.add.b_unpack = unpack;
+ inst->qpu.alu.add.b.unpack = unpack;
} else {
assert(vir_is_mul(inst));
if (src == 0)
- inst->qpu.alu.mul.a_unpack = unpack;
+ inst->qpu.alu.mul.a.unpack = unpack;
else
- inst->qpu.alu.mul.b_unpack = unpack;
+ inst->qpu.alu.mul.b.unpack = unpack;
}
}
@@ -737,6 +745,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c,
/* Set us up for shared input/output segments. This is apparently
* necessary for our VCM setup to avoid varying corruption.
+ *
+ * FIXME: initial testing on V3D 7.1 seems to work fine when using
+ * separate segments, so we could reevaluate this in the future if
+ * there is any advantage to using separate segments.
*/
prog_data->separate_segments = false;
prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
@@ -1572,7 +1584,7 @@ v3d_attempt_compile(struct v3d_compile *c)
NIR_PASS(_, c->s, v3d_nir_lower_io, c);
NIR_PASS(_, c->s, v3d_nir_lower_txf_ms);
- NIR_PASS(_, c->s, v3d_nir_lower_image_load_store);
+ NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c);
NIR_PASS(_, c->s, nir_opt_idiv_const, 8);
nir_lower_idiv_options idiv_options = {
diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c
index 5c47bbdc1b0..ab5d4043039 100644
--- a/src/broadcom/compiler/vir_dump.c
+++ b/src/broadcom/compiler/vir_dump.c
@@ -270,8 +270,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
vir_print_reg(c, inst, inst->dst);
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack));
- unpack[0] = instr->alu.add.a_unpack;
- unpack[1] = instr->alu.add.b_unpack;
+ unpack[0] = instr->alu.add.a.unpack;
+ unpack[1] = instr->alu.add.b.unpack;
} else {
fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op));
fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc));
@@ -282,8 +282,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
vir_print_reg(c, inst, inst->dst);
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack));
- unpack[0] = instr->alu.mul.a_unpack;
- unpack[1] = instr->alu.mul.b_unpack;
+ unpack[0] = instr->alu.mul.a.unpack;
+ unpack[1] = instr->alu.mul.b.unpack;
}
for (int i = 0; i < nsrc; i++) {
diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c
index 2fd6430a0f4..2907de9049f 100644
--- a/src/broadcom/compiler/vir_live_variables.c
+++ b/src/broadcom/compiler/vir_live_variables.c
@@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c)
flags_inst = NULL;
}
- /* Payload registers: r0/1/2 contain W, centroid W,
- * and Z at program start. Register allocation will
- * force their nodes to R0/1/2.
+ /* Payload registers: for fragment shaders, W,
+ * centroid W, and Z are initialized in r0/r1/r2
+ * up to v42, or in r1/r2/r3 from v71.
+ *
+ * For compute shaders, the payload is in r0/r2 up
+ * to v42, and in r3/r2 from v71.
+ *
+ * Register allocation will force their nodes to those
+ * registers.
*/
if (inst->src[0].file == QFILE_REG) {
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
+ uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0;
+ uint32_t max_payload_r = c->devinfo->ver >= 71 ? 3 : 2;
+ if (inst->src[0].index >= min_payload_r &&
+ inst->src[0].index <= max_payload_r) {
c->temp_start[inst->dst.index] = 0;
- break;
}
}
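With the range test fixed to use && (with ||, the unsigned comparison is always true), the payload check is a closed interval that shifts up by one register on 7.x. A minimal sketch of the same predicate as a hypothetical standalone helper:

#include <stdbool.h>
#include <stdint.h>

/* Fragment-shader payload lives in rf0..rf2 up to V3D 4.2 and in
 * rf1..rf3 from V3D 7.1.
 */
static bool
is_payload_reg(int ver, uint32_t index)
{
        uint32_t min_payload_r = ver >= 71 ? 1 : 0;
        uint32_t max_payload_r = ver >= 71 ? 3 : 2;

        return index >= min_payload_r && index <= max_payload_r;
}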
diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c
index da121c2a5bd..1260838ca05 100644
--- a/src/broadcom/compiler/vir_opt_copy_propagate.c
+++ b/src/broadcom/compiler/vir_opt_copy_propagate.c
@@ -35,7 +35,7 @@
#include "v3d_compiler.h"
static bool
-is_copy_mov(struct qinst *inst)
+is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst)
{
if (!inst)
return false;
@@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst)
return false;
}
- switch (inst->src[0].file) {
- case QFILE_MAGIC:
- /* No copy propagating from R3/R4/R5 -- the MOVs from those
- * are there to register allocate values produced into R3/4/5
- * to other regs (though hopefully r3/4/5).
- */
- switch (inst->src[0].index) {
- case V3D_QPU_WADDR_R3:
- case V3D_QPU_WADDR_R4:
- case V3D_QPU_WADDR_R5:
- return false;
+ if (devinfo->ver <= 42) {
+ switch (inst->src[0].file) {
+ case QFILE_MAGIC:
+ /* No copy propagating from R3/R4/R5 -- the MOVs from
+ * those are there to register allocate values produced
+ * into R3/4/5 to other regs (though hopefully r3/4/5).
+ */
+ switch (inst->src[0].index) {
+ case V3D_QPU_WADDR_R3:
+ case V3D_QPU_WADDR_R4:
+ case V3D_QPU_WADDR_R5:
+ return false;
+ default:
+ break;
+ }
+ break;
+
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ case 0:
+ case 1:
+ case 2:
+ /* MOVs from rf0/1/2 are only to track the live
+ * intervals for W/centroid W/Z.
+ */
+ return false;
+ }
+ break;
+
default:
break;
}
- break;
-
- case QFILE_REG:
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
- /* MOVs from rf0/1/2 are only to track the live
+ } else {
+ assert(devinfo->ver >= 71);
+ switch (inst->src[0].file) {
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ /* MOVs from rf1/2/3 are only to track the live
* intervals for W/centroid W/Z.
+ *
+ * Note: rf0 can be implicitly written by ldvary
+ * (no temp involved), so it is not an SSA value and
+ * could clash with writes to other temps that are
+ * also allocated to rf0. In theory, that would mean
+ * that we can't copy propagate from it, but we handle
+ * this at register allocation time, preventing temps
+ * from being allocated to rf0 while the rf0 value from
+ * ldvary is still live.
*/
- return false;
- }
- break;
+ case 1:
+ case 2:
+ case 3:
+ return false;
+ }
+ break;
- default:
- break;
+ default:
+ break;
+ }
}
return true;
@@ -104,14 +133,14 @@ vir_has_unpack(struct qinst *inst, int chan)
if (vir_is_add(inst)) {
if (chan == 0)
- return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE;
else
- return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE;
} else {
if (chan == 0)
- return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE;
else
- return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE;
}
}
@@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
*/
struct qinst *mov = movs[inst->src[i].index];
if (!mov) {
- if (!is_copy_mov(c->defs[inst->src[i].index]))
+ if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index]))
continue;
mov = c->defs[inst->src[i].index];
@@ -161,7 +190,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
continue;
/* these ops can't represent abs. */
- if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) {
+ if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) {
switch (inst->qpu.alu.add.op) {
case V3D_QPU_A_VFPACK:
case V3D_QPU_A_FROUND:
@@ -189,7 +218,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
inst->src[i] = mov->src[0];
if (vir_has_unpack(mov, 0)) {
- enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack;
+ enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack;
vir_set_unpack(inst, i, unpack);
}
@@ -245,7 +274,7 @@ vir_opt_copy_propagate(struct v3d_compile *c)
apply_kills(c, movs, inst);
- if (is_copy_mov(inst))
+ if (is_copy_mov(c->devinfo, inst))
movs[inst->dst.index] = inst;
}
}
diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c
index c7896d57f2b..6b61ed6a39a 100644
--- a/src/broadcom/compiler/vir_opt_redundant_flags.c
+++ b/src/broadcom/compiler/vir_opt_redundant_flags.c
@@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b)
a->qpu.flags.mpf != b->qpu.flags.mpf ||
a->qpu.alu.add.op != b->qpu.alu.add.op ||
a->qpu.alu.mul.op != b->qpu.alu.mul.op ||
- a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack ||
- a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack ||
+ a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack ||
+ a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack ||
a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack ||
- a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack ||
- a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack ||
+ a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack ||
+ a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack ||
a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) {
return false;
}
diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c
index 47d7722968d..ed5bc011964 100644
--- a/src/broadcom/compiler/vir_opt_small_immediates.c
+++ b/src/broadcom/compiler/vir_opt_small_immediates.c
@@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c)
/* The small immediate value sits in the raddr B field, so we
* can't have 2 small immediates in one instruction (unless
* they're the same value, but that should be optimized away
- * elsewhere).
+ * elsewhere). Since 7.x we can encode small immediates in
+ * any raddr field, but each instruction can still only use
+ * one.
*/
bool uses_small_imm = false;
for (int i = 0; i < vir_get_nsrc(inst); i++) {
@@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c)
*/
struct v3d_qpu_sig new_sig = inst->qpu.sig;
uint32_t sig_packed;
- new_sig.small_imm = true;
+ if (c->devinfo->ver <= 42) {
+ new_sig.small_imm_b = true;
+ } else {
+ if (vir_is_add(inst)) {
+ if (i == 0)
+ new_sig.small_imm_a = true;
+ else
+ new_sig.small_imm_b = true;
+ } else {
+ if (i == 0)
+ new_sig.small_imm_c = true;
+ else
+ new_sig.small_imm_d = true;
+ }
+ }
+
if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed))
continue;
@@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c)
vir_dump_inst(c, inst);
fprintf(stderr, "\n");
}
- inst->qpu.sig.small_imm = true;
+ inst->qpu.sig.small_imm_a = new_sig.small_imm_a;
+ inst->qpu.sig.small_imm_b = new_sig.small_imm_b;
+ inst->qpu.sig.small_imm_c = new_sig.small_imm_c;
+ inst->qpu.sig.small_imm_d = new_sig.small_imm_d;
inst->qpu.raddr_b = packed;
inst->src[i].file = QFILE_SMALL_IMM;
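On 7.x each raddr slot carries its own small-immediate signal bit, so the flag is a pure function of which ALU half and which operand the immediate feeds. A sketch mirroring the selection above, with plain bools standing in for the sig bitfields:

#include <stdbool.h>

/* a = add src 0, b = add src 1 (the only slot before 7.x),
 * c = mul src 0, d = mul src 1.
 */
struct small_imm_sig {
        bool small_imm_a, small_imm_b, small_imm_c, small_imm_d;
};

static struct small_imm_sig
small_imm_flag(int ver, bool is_add, int src)
{
        struct small_imm_sig sig = {0};

        if (ver <= 42)
                sig.small_imm_b = true; /* raddr_b is the only option */
        else if (is_add)
                *(src == 0 ? &sig.small_imm_a : &sig.small_imm_b) = true;
        else
                *(src == 0 ? &sig.small_imm_c : &sig.small_imm_d) = true;

        return sig;
}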
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index b22f915d1df..8eac2b75bd7 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -28,41 +28,73 @@
#define ACC_INDEX 0
#define ACC_COUNT 6
-#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
-#define PHYS_COUNT 64
+/* RA nodes used to track RF registers with implicit writes */
+#define IMPLICIT_RF_COUNT 1
+
+#define PHYS_COUNT 64
+
+static uint8_t
+get_phys_index(const struct v3d_device_info *devinfo)
+{
+ if (devinfo->has_accumulators)
+ return ACC_INDEX + ACC_COUNT;
+ else
+ return 0;
+}
+
+/* ACC as accumulator */
#define CLASS_BITS_PHYS (1 << 0)
#define CLASS_BITS_ACC (1 << 1)
#define CLASS_BITS_R5 (1 << 4)
-#define CLASS_BITS_ANY (CLASS_BITS_PHYS | \
- CLASS_BITS_ACC | \
- CLASS_BITS_R5)
+
+static uint8_t
+get_class_bit_any(const struct v3d_device_info *devinfo)
+{
+ if (devinfo->has_accumulators)
+ return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5);
+ else
+ return CLASS_BITS_PHYS;
+}
+
+static uint8_t
+filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
+{
+ if (!devinfo->has_accumulators) {
+ assert(class_bits & CLASS_BITS_PHYS);
+ class_bits = CLASS_BITS_PHYS;
+ }
+ return class_bits;
+}
static inline uint32_t
-temp_to_node(uint32_t temp)
+temp_to_node(struct v3d_compile *c, uint32_t temp)
{
- return temp + ACC_COUNT;
+ return temp + (c->devinfo->has_accumulators ? ACC_COUNT :
+ IMPLICIT_RF_COUNT);
}
static inline uint32_t
-node_to_temp(uint32_t node)
+node_to_temp(struct v3d_compile *c, uint32_t node)
{
- assert(node >= ACC_COUNT);
- return node - ACC_COUNT;
+ assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
+ (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT));
+ return node - (c->devinfo->has_accumulators ? ACC_COUNT :
+ IMPLICIT_RF_COUNT);
}
static inline uint8_t
-get_temp_class_bits(struct v3d_ra_node_info *nodes,
+get_temp_class_bits(struct v3d_compile *c,
uint32_t temp)
{
- return nodes->info[temp_to_node(temp)].class_bits;
+ return c->nodes.info[temp_to_node(c, temp)].class_bits;
}
static inline void
-set_temp_class_bits(struct v3d_ra_node_info *nodes,
+set_temp_class_bits(struct v3d_compile *c,
uint32_t temp, uint8_t class_bits)
{
- nodes->info[temp_to_node(temp)].class_bits = class_bits;
+ c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits;
}
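The fixed nodes (accumulators up to 4.2, the single implicit-rf0 node on 7.x) sit at the front of the interference graph, so the temp/node mapping is just an offset. A standalone sketch of that round trip, using the constants above (demo_ names are hypothetical):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define ACC_COUNT         6
#define IMPLICIT_RF_COUNT 1

static uint32_t
fixed_node_count(bool has_accumulators)
{
        return has_accumulators ? ACC_COUNT : IMPLICIT_RF_COUNT;
}

/* temp <-> node is a pure offset past the fixed nodes. */
static uint32_t
demo_temp_to_node(bool has_acc, uint32_t temp)
{
        return temp + fixed_node_count(has_acc);
}

static uint32_t
demo_node_to_temp(bool has_acc, uint32_t node)
{
        assert(node >= fixed_node_count(has_acc));
        return node - fixed_node_count(has_acc);
}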
static struct ra_class *
@@ -71,11 +103,13 @@ choose_reg_class(struct v3d_compile *c, uint8_t class_bits)
if (class_bits == CLASS_BITS_PHYS) {
return c->compiler->reg_class_phys[c->thread_index];
} else if (class_bits == (CLASS_BITS_R5)) {
+ assert(c->devinfo->has_accumulators);
return c->compiler->reg_class_r5[c->thread_index];
} else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) {
+ assert(c->devinfo->has_accumulators);
return c->compiler->reg_class_phys_or_acc[c->thread_index];
} else {
- assert(class_bits == CLASS_BITS_ANY);
+ assert(class_bits == get_class_bit_any(c->devinfo));
return c->compiler->reg_class_any[c->thread_index];
}
}
@@ -84,7 +118,7 @@ static inline struct ra_class *
choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp)
{
assert(temp < c->num_temps && temp < c->nodes.alloc_count);
- return choose_reg_class(c, get_temp_class_bits(&c->nodes, temp));
+ return choose_reg_class(c, get_temp_class_bits(c, temp));
}
static inline bool
@@ -313,7 +347,7 @@ v3d_choose_spill_node(struct v3d_compile *c)
for (unsigned i = 0; i < c->num_temps; i++) {
if (BITSET_TEST(c->spillable, i)) {
- ra_set_node_spill_cost(c->g, temp_to_node(i),
+ ra_set_node_spill_cost(c->g, temp_to_node(c, i),
spill_costs[i]);
}
}
@@ -331,7 +365,8 @@ ensure_nodes(struct v3d_compile *c)
c->nodes.info = reralloc_array_size(c,
c->nodes.info,
sizeof(c->nodes.info[0]),
- c->nodes.alloc_count + ACC_COUNT);
+ c->nodes.alloc_count +
+ MAX2(ACC_COUNT, IMPLICIT_RF_COUNT));
}
/* Creates the interference node for a new temp. We use this to keep the node
@@ -343,11 +378,15 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
ensure_nodes(c);
int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
- assert(node == temp + ACC_COUNT);
+ assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT :
+ node == temp + IMPLICIT_RF_COUNT);
/* We fill the node priority after we are done inserting spills */
c->nodes.info[node].class_bits = class_bits;
c->nodes.info[node].priority = 0;
+ c->nodes.info[node].is_ldunif_dst = false;
+ c->nodes.info[node].is_program_end = false;
+ c->nodes.info[node].unused = false;
}
/* The spill offset for this thread takes a bit of setup, so do it once at
@@ -395,8 +434,10 @@ v3d_setup_spill_base(struct v3d_compile *c)
*/
if (c->spilling) {
int temp_class = CLASS_BITS_PHYS;
- if (i != c->spill_base.index)
+ if (c->devinfo->has_accumulators &&
+ i != c->spill_base.index) {
temp_class |= CLASS_BITS_ACC;
+ }
add_node(c, i, temp_class);
}
}
@@ -436,7 +477,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
*/
assert(c->disable_ldunif_opt);
struct qreg offset = vir_uniform_ui(c, spill_offset);
- add_node(c, offset.index, CLASS_BITS_ANY);
+ add_node(c, offset.index, get_class_bit_any(c->devinfo));
/* We always enable per-quad on spills/fills to ensure we spill
* any channels involved with helper invocations.
@@ -455,14 +496,16 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
* temp will be used immediately so just like the uniform above we
* can allow accumulators.
*/
+ int temp_class =
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC);
if (!fill_dst) {
struct qreg dst = vir_TMUWT(c);
assert(dst.file == QFILE_TEMP);
- add_node(c, dst.index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
+ add_node(c, dst.index, temp_class);
} else {
*fill_dst = vir_LDTMU(c);
assert(fill_dst->file == QFILE_TEMP);
- add_node(c, fill_dst->index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
+ add_node(c, fill_dst->index, temp_class);
}
/* Temps across the thread switch we injected can't be assigned to
@@ -482,7 +525,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
c->temp_start[i] < ip && c->temp_end[i] >= ip :
c->temp_start[i] <= ip && c->temp_end[i] > ip;
if (thrsw_cross) {
- ra_set_node_class(c->g, temp_to_node(i),
+ ra_set_node_class(c->g, temp_to_node(c, i),
choose_reg_class(c, CLASS_BITS_PHYS));
}
}
@@ -509,8 +552,7 @@ v3d_emit_tmu_spill(struct v3d_compile *c,
* same register class bits as the original.
*/
if (inst == position) {
- uint8_t class_bits = get_temp_class_bits(&c->nodes,
- inst->dst.index);
+ uint8_t class_bits = get_temp_class_bits(c, inst->dst.index);
inst->dst = vir_get_temp(c);
add_node(c, inst->dst.index, class_bits);
} else {
@@ -542,7 +584,8 @@ interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end)
}
static void
-v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
+v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes,
+ int spill_temp)
{
c->spill_start_num_temps = c->num_temps;
c->spilling = true;
@@ -554,8 +597,20 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
spill_offset = c->spill_size;
c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
- if (spill_offset == 0)
+ if (spill_offset == 0) {
v3d_setup_spill_base(c);
+
+ /* Don't allocate our spill base to rf0 to avoid
+ * conflicts with instructions doing implicit writes
+ * to that register.
+ */
+ if (!c->devinfo->has_accumulators) {
+ ra_add_node_interference(
+ c->g,
+ temp_to_node(c, c->spill_base.index),
+ implicit_rf_nodes[0]);
+ }
+ }
}
struct qinst *last_thrsw = c->last_thrsw;
@@ -574,7 +629,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
reconstruct_op = orig_def->qpu.alu.add.op;
}
- uint32_t spill_node = temp_to_node(spill_temp);
+ uint32_t spill_node = temp_to_node(c, spill_temp);
/* We must disable the ldunif optimization if we are spilling uniforms */
bool had_disable_ldunif_opt = c->disable_ldunif_opt;
@@ -635,7 +690,8 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
* instruction immediately after, so
* we can use any register class for it.
*/
- add_node(c, unif.index, CLASS_BITS_ANY);
+ add_node(c, unif.index,
+ get_class_bit_any(c->devinfo));
} else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
struct qreg temp =
reconstruct_temp(c, reconstruct_op);
@@ -644,8 +700,10 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
* instruction immediately after so we
* can use ACC.
*/
- add_node(c, temp.index, CLASS_BITS_PHYS |
- CLASS_BITS_ACC);
+ int temp_class =
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
+ CLASS_BITS_ACC);
+ add_node(c, temp.index, temp_class);
} else {
/* If we have a postponed spill, we
* don't need a fill as the temp would
@@ -739,12 +797,12 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
* update node priorities based one new liveness data.
*/
uint32_t sb_temp = c->spill_base.index;
- uint32_t sb_node = temp_to_node(sb_temp);
+ uint32_t sb_node = temp_to_node(c, sb_temp);
for (uint32_t i = 0; i < c->num_temps; i++) {
if (c->temp_end[i] == -1)
continue;
- uint32_t node_i = temp_to_node(i);
+ uint32_t node_i = temp_to_node(c, i);
c->nodes.info[node_i].priority =
c->temp_end[i] - c->temp_start[i];
@@ -752,7 +810,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
j < c->num_temps; j++) {
if (interferes(c->temp_start[i], c->temp_end[i],
c->temp_start[j], c->temp_end[j])) {
- uint32_t node_j = temp_to_node(j);
+ uint32_t node_j = temp_to_node(c, j);
ra_add_node_interference(c->g, node_i, node_j);
}
}
@@ -771,9 +829,11 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
}
struct v3d_ra_select_callback_data {
+ uint32_t phys_index;
uint32_t next_acc;
uint32_t next_phys;
struct v3d_ra_node_info *nodes;
+ const struct v3d_device_info *devinfo;
};
/* Choosing accumulators improves chances of merging QPU instructions
@@ -785,6 +845,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
BITSET_WORD *regs,
int priority)
{
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
/* Favor accumulators if we have less that this number of physical
* registers. Accumulators have more restrictions (like being
* invalidated through thrsw), so running out of physical registers
@@ -794,7 +857,7 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
static const int available_rf_threshold = 5;
int available_rf = 0;
for (int i = 0; i < PHYS_COUNT; i++) {
- if (BITSET_TEST(regs, PHYS_INDEX + i))
+ if (BITSET_TEST(regs, v3d_ra->phys_index + i))
available_rf++;
if (available_rf >= available_rf_threshold)
break;
@@ -820,6 +883,9 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
BITSET_WORD *regs,
unsigned int *out)
{
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
/* Choose r5 for our ldunifs if possible (nobody else can load to that
* reg, and it keeps the QPU cond field free from being occupied by
* ldunifrf).
@@ -849,12 +915,47 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
static bool
v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
+ unsigned int node,
BITSET_WORD *regs,
unsigned int *out)
{
+ /* If this node is for an unused temp, ignore. */
+ if (v3d_ra->nodes->info[node].unused) {
+ *out = 0;
+ return true;
+ }
+
+ /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
+ * so we can avoid turning them into ldunifrf (which uses the
+ * cond field to encode the dst and would prevent merging with
+ * instructions that use cond flags).
+ */
+ if (v3d_ra->nodes->info[node].is_ldunif_dst &&
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
+ assert(v3d_ra->devinfo->ver >= 71);
+ *out = v3d_ra->phys_index;
+ return true;
+ }
+
+ /* The last 3 instructions in a shader can't use some specific
+ * registers (usually early rf registers, depending on the v3d
+ * version), so avoid allocating those registers to temps used
+ * by the last instructions in the shader.
+ */
+ const uint32_t safe_rf_start = v3d_ra->devinfo->ver <= 42 ? 3 : 4;
+ if (v3d_ra->nodes->info[node].is_program_end &&
+ v3d_ra->next_phys < safe_rf_start) {
+ v3d_ra->next_phys = safe_rf_start;
+ }
+
for (int i = 0; i < PHYS_COUNT; i++) {
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
- int phys = PHYS_INDEX + phys_off;
+
+ /* Try to keep rf0 available for ldunif in 7.x (see above). */
+ if (v3d_ra->devinfo->ver >= 71 && phys_off == 0)
+ continue;
+
+ int phys = v3d_ra->phys_index + phys_off;
if (BITSET_TEST(regs, phys)) {
v3d_ra->next_phys = phys_off + 1;
@@ -863,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
}
}
+ /* If we couldn't allocate, do try to assign rf0 if it is available. */
+ if (v3d_ra->devinfo->ver >= 71 &&
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
+ v3d_ra->next_phys = 1;
+ *out = v3d_ra->phys_index;
+ return true;
+ }
+
return false;
}
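Summing up the selection policy above: on 7.x the scan starts past rf0 (kept free for ldunif destinations and the implicit ldvary write), skips rf0 during the round robin, and only falls back to it when nothing else is free. A condensed sketch that ignores the unused-temp and program-end cases, with a hypothetical regs_free array standing in for the RA bitset:

#include <stdbool.h>
#include <stdint.h>

#define PHYS_COUNT 64

/* regs_free[i] says whether physical register i is available. */
static bool
select_rf_v71(const bool *regs_free, uint32_t *next_phys, uint32_t *out)
{
        for (uint32_t i = 0; i < PHYS_COUNT; i++) {
                uint32_t phys = (*next_phys + i) % PHYS_COUNT;

                if (phys == 0)
                        continue; /* keep rf0 free for ldunif/ldvary */

                if (regs_free[phys]) {
                        *next_phys = phys + 1;
                        *out = phys;
                        return true;
                }
        }

        /* Last resort: hand out rf0 itself. */
        if (regs_free[0]) {
                *next_phys = 1;
                *out = 0;
                return true;
        }

        return false;
}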
@@ -877,7 +986,7 @@ v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
return reg;
}
- if (v3d_ra_select_rf(v3d_ra, regs, &reg))
+ if (v3d_ra_select_rf(v3d_ra, n, regs, &reg))
return reg;
/* If we ran out of physical registers try to assign an accumulator
@@ -896,8 +1005,9 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
* register file can be divided up for fragment shader threading.
*/
int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);
+ uint8_t phys_index = get_phys_index(compiler->devinfo);
- compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
+ compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT,
false);
if (!compiler->regs)
return false;
@@ -905,31 +1015,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
for (int threads = 0; threads < max_thread_index; threads++) {
compiler->reg_class_any[threads] =
ra_alloc_contig_reg_class(compiler->regs, 1);
- compiler->reg_class_r5[threads] =
- ra_alloc_contig_reg_class(compiler->regs, 1);
- compiler->reg_class_phys_or_acc[threads] =
- ra_alloc_contig_reg_class(compiler->regs, 1);
+ if (compiler->devinfo->has_accumulators) {
+ compiler->reg_class_r5[threads] =
+ ra_alloc_contig_reg_class(compiler->regs, 1);
+ compiler->reg_class_phys_or_acc[threads] =
+ ra_alloc_contig_reg_class(compiler->regs, 1);
+ }
compiler->reg_class_phys[threads] =
ra_alloc_contig_reg_class(compiler->regs, 1);
- for (int i = PHYS_INDEX;
- i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+ /* Init physical regs */
+ for (int i = phys_index;
+ i < phys_index + (PHYS_COUNT >> threads); i++) {
+ if (compiler->devinfo->has_accumulators)
+ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
ra_class_add_reg(compiler->reg_class_phys[threads], i);
ra_class_add_reg(compiler->reg_class_any[threads], i);
}
- for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
- ra_class_add_reg(compiler->reg_class_any[threads], i);
+ /* Init accumulator regs */
+ if (compiler->devinfo->has_accumulators) {
+ for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
+ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+ ra_class_add_reg(compiler->reg_class_any[threads], i);
+ }
+ /* r5 can only store a single 32-bit value, so not much can
+ * use it.
+ */
+ ra_class_add_reg(compiler->reg_class_r5[threads],
+ ACC_INDEX + 5);
+ ra_class_add_reg(compiler->reg_class_any[threads],
+ ACC_INDEX + 5);
}
- /* r5 can only store a single 32-bit value, so not much can
- * use it.
- */
- ra_class_add_reg(compiler->reg_class_r5[threads],
- ACC_INDEX + 5);
- ra_class_add_reg(compiler->reg_class_any[threads],
- ACC_INDEX + 5);
}
ra_set_finalize(compiler->regs, NULL);
@@ -944,7 +1061,10 @@ tmu_spilling_allowed(struct v3d_compile *c)
}
static void
-update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
+ int *acc_nodes,
+ int *implicit_rf_nodes,
+ int last_ldvary_ip,
struct qinst *inst)
{
int32_t ip = inst->ip;
@@ -954,26 +1074,39 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* result to a temp), nothing else can be stored in r3/r4 across
* it.
*/
- if (vir_writes_r3(c->devinfo, inst)) {
+ if (vir_writes_r3_implicitly(c->devinfo, inst)) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
ra_add_node_interference(c->g,
- temp_to_node(i),
+ temp_to_node(c, i),
acc_nodes[3]);
}
}
}
- if (vir_writes_r4(c->devinfo, inst)) {
+ if (vir_writes_r4_implicitly(c->devinfo, inst)) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
ra_add_node_interference(c->g,
- temp_to_node(i),
+ temp_to_node(c, i),
acc_nodes[4]);
}
}
}
+ /* If any instruction writes to a physical register implicitly
+ * nothing else can write the same register across it.
+ */
+ if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ implicit_rf_nodes[0]);
+ }
+ }
+ }
+
if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
switch (inst->qpu.alu.add.op) {
case V3D_QPU_A_LDVPMV_IN:
@@ -987,7 +1120,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* decides whether the LDVPM is in or out)
*/
assert(inst->dst.file == QFILE_TEMP);
- set_temp_class_bits(&c->nodes, inst->dst.index,
+ set_temp_class_bits(c, inst->dst.index,
CLASS_BITS_PHYS);
break;
}
@@ -1002,7 +1135,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* phys regfile.
*/
assert(inst->dst.file == QFILE_TEMP);
- set_temp_class_bits(&c->nodes, inst->dst.index,
+ set_temp_class_bits(c, inst->dst.index,
CLASS_BITS_PHYS);
break;
}
@@ -1015,6 +1148,11 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
if (inst->src[0].file == QFILE_REG) {
switch (inst->src[0].index) {
case 0:
+ /* V3D 7.x doesn't use rf0 for thread payload */
+ if (c->devinfo->ver >= 71)
+ break;
+ else
+ FALLTHROUGH;
case 1:
case 2:
case 3: {
@@ -1024,14 +1162,34 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
*/
assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
assert(inst->dst.file == QFILE_TEMP);
- uint32_t node = temp_to_node(inst->dst.index);
+ uint32_t node = temp_to_node(c, inst->dst.index);
ra_set_node_reg(c->g, node,
- PHYS_INDEX + inst->src[0].index);
+ get_phys_index(c->devinfo) +
+ inst->src[0].index);
break;
}
}
}
+ /* Don't allocate rf0 to temps that cross ranges where we have
+ * live implicit rf0 writes from ldvary. We can identify these
+ * by tracking the last ldvary instruction and explicit reads
+ * of rf0.
+ */
+ if (c->devinfo->ver >= 71 &&
+ ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) ||
+ (vir_get_nsrc(inst) > 1 &&
+ inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip &&
+ c->temp_end[i] > last_ldvary_ip) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ implicit_rf_nodes[0]);
+ }
+ }
+ }
+
if (inst->dst.file == QFILE_TEMP) {
/* Only a ldunif gets to write to R5, which only has a
* single 32-bit channel of storage.
@@ -1041,36 +1199,95 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* because ldunif has usually a shorter lifespan, allowing for
* more accumulator reuse and QPU merges.
*/
- if (!inst->qpu.sig.ldunif) {
- uint8_t class_bits =
- get_temp_class_bits(&c->nodes, inst->dst.index) &
- ~CLASS_BITS_R5;
- set_temp_class_bits(&c->nodes, inst->dst.index,
- class_bits);
-
+ if (c->devinfo->has_accumulators) {
+ if (!inst->qpu.sig.ldunif) {
+ uint8_t class_bits =
+ get_temp_class_bits(c, inst->dst.index) &
+ ~CLASS_BITS_R5;
+ set_temp_class_bits(c, inst->dst.index,
+ class_bits);
+
+ } else {
+ /* Until V3D 4.x, we could only load a uniform
+ * to r5, so we'll need to spill if uniform
+ * loads interfere with each other.
+ */
+ if (c->devinfo->ver < 40) {
+ set_temp_class_bits(c, inst->dst.index,
+ CLASS_BITS_R5);
+ }
+ }
} else {
- /* Until V3D 4.x, we could only load a uniform
- * to r5, so we'll need to spill if uniform
- * loads interfere with each other.
+ /* Make sure we don't allocate the ldvary's
+ * destination to rf0, since it would clash
+ * with its implicit write to that register.
+ */
+ if (inst->qpu.sig.ldvary) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, inst->dst.index),
+ implicit_rf_nodes[0]);
+ }
+ /* Flag dst temps from ldunif(a) instructions
+ * so we can try to assign rf0 to them and avoid
+ * converting these to ldunif(a)rf.
*/
- if (c->devinfo->ver < 40) {
- set_temp_class_bits(&c->nodes, inst->dst.index,
- CLASS_BITS_R5);
+ if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) {
+ const uint32_t dst_n =
+ temp_to_node(c, inst->dst.index);
+ c->nodes.info[dst_n].is_ldunif_dst = true;
}
}
}
/* All accumulators are invalidated across a thread switch. */
- if (inst->qpu.sig.thrsw) {
+ if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
- set_temp_class_bits(&c->nodes, i,
+ set_temp_class_bits(c, i,
CLASS_BITS_PHYS);
}
}
}
}
+static void
+flag_program_end_nodes(struct v3d_compile *c)
+{
+ /* Only look for registers used in this many instructions */
+ uint32_t last_set_count = 6;
+
+ struct qblock *last_block = vir_exit_block(c);
+ list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) {
+ if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+ continue;
+
+ int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
+ for (int i = 0; i < num_src; i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->src[i].index);
+ c->nodes.info[node].is_program_end = true;
+ }
+ }
+
+ num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
+ for (int i = 0; i < num_src; i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->src[i].index);
+ c->nodes.info[node].is_program_end = true;
+ }
+ }
+
+ if (inst->dst.file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->dst.index);
+ c->nodes.info[node].is_program_end = true;
+ }
+
+ if (--last_set_count == 0)
+ break;
+ }
+}
+
/**
* Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
*
@@ -1080,19 +1297,32 @@ struct qpu_reg *
v3d_register_allocate(struct v3d_compile *c)
{
int acc_nodes[ACC_COUNT];
+ int implicit_rf_nodes[IMPLICIT_RF_COUNT];
+
+ unsigned num_ra_nodes = c->num_temps;
+ if (c->devinfo->has_accumulators)
+ num_ra_nodes += ARRAY_SIZE(acc_nodes);
+ else
+ num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes);
+
c->nodes = (struct v3d_ra_node_info) {
.alloc_count = c->num_temps,
.info = ralloc_array_size(c, sizeof(c->nodes.info[0]),
- c->num_temps + ACC_COUNT),
+ num_ra_nodes),
};
+ uint32_t phys_index = get_phys_index(c->devinfo);
+
struct v3d_ra_select_callback_data callback_data = {
+ .phys_index = phys_index,
.next_acc = 0,
/* Start at RF3, to try to keep the TLB writes from using
- * RF0-2.
+ * RF0-2. Start at RF4 in 7.x to prevent TLB writes from
+ * using RF2-3.
*/
- .next_phys = 3,
+ .next_phys = c->devinfo->ver <= 42 ? 3 : 4,
.nodes = &c->nodes,
+ .devinfo = c->devinfo,
};
vir_calculate_live_intervals(c);
@@ -1108,27 +1338,35 @@ v3d_register_allocate(struct v3d_compile *c)
c->thread_index--;
}
- c->g = ra_alloc_interference_graph(c->compiler->regs,
- c->num_temps + ARRAY_SIZE(acc_nodes));
+ c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
/* Make some fixed nodes for the accumulators, which we will need to
* interfere with when ops have implied r3/r4 writes or for the thread
* switches. We could represent these as classes for the nodes to
* live in, but the classes take up a lot of memory to set up, so we
- * don't want to make too many.
+ * don't want to make too many. We use the same mechanism on platforms
+ * without accumulators that can have implicit writes to phys regs.
*/
- for (uint32_t i = 0; i < ACC_COUNT + c->num_temps; i++) {
- if (i < ACC_COUNT) {
+ for (uint32_t i = 0; i < num_ra_nodes; i++) {
+ c->nodes.info[i].is_ldunif_dst = false;
+ c->nodes.info[i].is_program_end = false;
+ c->nodes.info[i].unused = false;
+ c->nodes.info[i].priority = 0;
+ c->nodes.info[i].class_bits = 0;
+ if (c->devinfo->has_accumulators && i < ACC_COUNT) {
acc_nodes[i] = i;
ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
- c->nodes.info[i].priority = 0;
- c->nodes.info[i].class_bits = 0;
+ } else if (!c->devinfo->has_accumulators &&
+ i < ARRAY_SIZE(implicit_rf_nodes)) {
+ implicit_rf_nodes[i] = i;
+ ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
} else {
- uint32_t t = node_to_temp(i);
+ uint32_t t = node_to_temp(c, i);
c->nodes.info[i].priority =
c->temp_end[t] - c->temp_start[t];
- c->nodes.info[i].class_bits = CLASS_BITS_ANY;
+ c->nodes.info[i].class_bits =
+ get_class_bit_any(c->devinfo);
}
}
@@ -1136,25 +1374,61 @@ v3d_register_allocate(struct v3d_compile *c)
* interferences.
*/
int ip = 0;
+ int last_ldvary_ip = -1;
vir_for_each_inst_inorder(inst, c) {
inst->ip = ip++;
- update_graph_and_reg_classes_for_inst(c, acc_nodes, inst);
+
+ /* ldunif(a) always writes to a temporary, so we have
+ * liveness info available to decide if rf0 is
+ * available for them, however, ldvary is different:
+ * it always writes to rf0 directly so we don't have
+ * liveness information for its implicit rf0 write.
+ *
+ * That means the allocator may assign rf0 to a temp
+ * that is defined while an implicit rf0 write from
+ * ldvary is still live. We fix that by manually
+ * tracking rf0 live ranges from ldvary instructions.
+ */
+ if (inst->qpu.sig.ldvary)
+ last_ldvary_ip = ip;
+
+ update_graph_and_reg_classes_for_inst(c, acc_nodes,
+ implicit_rf_nodes,
+ last_ldvary_ip, inst);
}
+ /* Flag the nodes that are used in the last instructions of the program
+ * (there are some registers that cannot be used in the last 3
+ * instructions). We only do this for fragment shaders, because the idea
+ * is that by avoiding this conflict we may be able to emit the last
+ * thread switch earlier in some cases. In non-fragment shaders this
+ * won't happen, because their last instructions are always VPM stores
+ * with a small immediate, which conflicts with other signals and
+ * prevents us from ever moving the thrsw earlier.
+ */
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT)
+ flag_program_end_nodes(c);
+
/* Set the register classes for all our temporaries in the graph */
for (uint32_t i = 0; i < c->num_temps; i++) {
- ra_set_node_class(c->g, temp_to_node(i),
+ ra_set_node_class(c->g, temp_to_node(c, i),
choose_reg_class_for_temp(c, i));
}
/* Add register interferences based on liveness data */
for (uint32_t i = 0; i < c->num_temps; i++) {
+ /* And while we are here, let's also flag nodes for
+ * unused temps.
+ */
+ if (c->temp_start[i] > c->temp_end[i])
+ c->nodes.info[temp_to_node(c, i)].unused = true;
+
for (uint32_t j = i + 1; j < c->num_temps; j++) {
if (interferes(c->temp_start[i], c->temp_end[i],
c->temp_start[j], c->temp_end[j])) {
ra_add_node_interference(c->g,
- temp_to_node(i),
- temp_to_node(j));
+ temp_to_node(c, i),
+ temp_to_node(c, j));
}
}
}
@@ -1171,9 +1445,9 @@ v3d_register_allocate(struct v3d_compile *c)
if (c->spill_size <
V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
int node = v3d_choose_spill_node(c);
- uint32_t temp = node_to_temp(node);
+ uint32_t temp = node_to_temp(c, node);
if (node != -1) {
- v3d_spill_reg(c, acc_nodes, temp);
+ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
continue;
}
}
@@ -1186,11 +1460,11 @@ v3d_register_allocate(struct v3d_compile *c)
if (node == -1)
goto spill_fail;
- uint32_t temp = node_to_temp(node);
+ uint32_t temp = node_to_temp(c, node);
enum temp_spill_type spill_type =
get_spill_type_for_temp(c, temp);
if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
- v3d_spill_reg(c, acc_nodes, temp);
+ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
if (c->spills + c->fills > c->max_tmu_spills)
goto spill_fail;
} else {
@@ -1201,14 +1475,14 @@ v3d_register_allocate(struct v3d_compile *c)
/* Allocation was successful, build the 'temp -> reg' map */
temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
for (uint32_t i = 0; i < c->num_temps; i++) {
- int ra_reg = ra_get_node_reg(c->g, temp_to_node(i));
- if (ra_reg < PHYS_INDEX) {
+ int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
+ if (ra_reg < phys_index) {
temp_registers[i].magic = true;
temp_registers[i].index = (V3D_QPU_WADDR_R0 +
ra_reg - ACC_INDEX);
} else {
temp_registers[i].magic = false;
- temp_registers[i].index = ra_reg - PHYS_INDEX;
+ temp_registers[i].index = ra_reg - phys_index;
}
}
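After allocation succeeds, a ra register decodes back to a QPU register by position alone: values below phys_index are accumulators (magic waddrs), values at or above it are rf(n - phys_index); on 7.x phys_index is 0, so everything decodes to the rf file. A sketch of the decode with a hypothetical stand-in for struct qpu_reg:

#include <stdbool.h>
#include <stdint.h>

struct qpu_reg_demo {
        bool magic;     /* true: accumulator r0..r5; false: rf file */
        uint32_t index;
};

static struct qpu_reg_demo
decode_ra_reg(uint32_t ra_reg, uint32_t phys_index)
{
        struct qpu_reg_demo reg;

        if (ra_reg < phys_index) {
                /* Only reachable when the device has accumulators. */
                reg.magic = true;
                reg.index = ra_reg; /* + V3D_QPU_WADDR_R0 in the driver */
        } else {
                reg.magic = false;
                reg.index = ra_reg - phys_index;
        }
        return reg;
}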
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
index 45e6bfa1470..4ed184cbbcb 100644
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -86,15 +86,32 @@ new_qpu_nop_before(struct qinst *inst)
return q;
}
+static void
+v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src)
+{
+ /* If we have a small immediate, move it from instr->raddr_b to the
+ * corresponding raddr.
+ */
+ if (src.smimm) {
+ assert(instr->sig.small_imm_a || instr->sig.small_imm_b ||
+ instr->sig.small_imm_c || instr->sig.small_imm_d);
+ *raddr = instr->raddr_b;
+ return;
+ }
+
+ assert(!src.magic);
+ *raddr = src.index;
+}
+
/**
* Allocates the src register (accumulator or register file) into the RADDR
* fields of the instruction.
*/
static void
-set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
+v3d33_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
{
if (src.smimm) {
- assert(instr->sig.small_imm);
+ assert(instr->sig.small_imm_b);
*mux = V3D_QPU_MUX_B;
return;
}
@@ -106,20 +123,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
return;
}
- if (instr->alu.add.a != V3D_QPU_MUX_A &&
- instr->alu.add.b != V3D_QPU_MUX_A &&
- instr->alu.mul.a != V3D_QPU_MUX_A &&
- instr->alu.mul.b != V3D_QPU_MUX_A) {
+ if (instr->alu.add.a.mux != V3D_QPU_MUX_A &&
+ instr->alu.add.b.mux != V3D_QPU_MUX_A &&
+ instr->alu.mul.a.mux != V3D_QPU_MUX_A &&
+ instr->alu.mul.b.mux != V3D_QPU_MUX_A) {
instr->raddr_a = src.index;
*mux = V3D_QPU_MUX_A;
} else {
if (instr->raddr_a == src.index) {
*mux = V3D_QPU_MUX_A;
} else {
- assert(!(instr->alu.add.a == V3D_QPU_MUX_B &&
- instr->alu.add.b == V3D_QPU_MUX_B &&
- instr->alu.mul.a == V3D_QPU_MUX_B &&
- instr->alu.mul.b == V3D_QPU_MUX_B) ||
+ assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B &&
+ instr->alu.add.b.mux == V3D_QPU_MUX_B &&
+ instr->alu.mul.a.mux == V3D_QPU_MUX_B &&
+ instr->alu.mul.b.mux == V3D_QPU_MUX_B) ||
src.index == instr->raddr_b);
instr->raddr_b = src.index;
@@ -128,33 +145,40 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
}
}
-static bool
-is_no_op_mov(struct qinst *qinst)
+/*
+ * The main purpose of the following wrapper is to make calling set_src
+ * cleaner. It receives both mux and raddr pointers; only the one
+ * relevant to the device version is filled in.
+ */
+static void
+set_src(struct v3d_qpu_instr *instr,
+ enum v3d_qpu_mux *mux,
+ uint8_t *raddr,
+ struct qpu_reg src,
+ const struct v3d_device_info *devinfo)
{
- static const struct v3d_qpu_sig no_sig = {0};
-
- /* Make sure it's just a lone MOV. */
- if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
- qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
- qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
- memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
- return false;
- }
+ if (devinfo->ver < 71)
+ return v3d33_set_src(instr, mux, src);
+ else
+ return v3d71_set_src(instr, raddr, src);
+}
- /* Check if it's a MOV from a register to itself. */
+static bool
+v3d33_mov_src_and_dst_equal(struct qinst *qinst)
+{
enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
if (qinst->qpu.alu.mul.magic_write) {
if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
return false;
- if (qinst->qpu.alu.mul.a !=
+ if (qinst->qpu.alu.mul.a.mux !=
V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) {
return false;
}
} else {
int raddr;
- switch (qinst->qpu.alu.mul.a) {
+ switch (qinst->qpu.alu.mul.a.mux) {
case V3D_QPU_MUX_A:
raddr = qinst->qpu.raddr_a;
break;
@@ -168,10 +192,61 @@ is_no_op_mov(struct qinst *qinst)
return false;
}
+ return true;
+}
+
+static bool
+v3d71_mov_src_and_dst_equal(struct qinst *qinst)
+{
+ if (qinst->qpu.alu.mul.magic_write)
+ return false;
+
+ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
+ int raddr;
+
+ raddr = qinst->qpu.alu.mul.a.raddr;
+ if (raddr != waddr)
+ return false;
+
+ return true;
+}
+
+static bool
+mov_src_and_dst_equal(struct qinst *qinst,
+ const struct v3d_device_info *devinfo)
+{
+ if (devinfo->ver < 71)
+ return v3d33_mov_src_and_dst_equal(qinst);
+ else
+ return v3d71_mov_src_and_dst_equal(qinst);
+}
+
+
+static bool
+is_no_op_mov(struct qinst *qinst,
+ const struct v3d_device_info *devinfo)
+{
+ static const struct v3d_qpu_sig no_sig = {0};
+
+ /* Make sure it's just a lone MOV. We only check for M_MOV: although
+ * V3D 7.x also has A_MOV, we don't need to check for it, as we always
+ * emit using M_MOV. We could start using A_MOV later in the schedule
+ * to improve performance.
+ */
+ if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
+ qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
+ qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
+ memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
+ return false;
+ }
+
+ if (!mov_src_and_dst_equal(qinst, devinfo))
+ return false;
+
/* No packing or flags updates, or we need to execute the
* instruction.
*/
- if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
+ if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE ||
qinst->qpu.flags.mc != V3D_QPU_COND_NONE ||
qinst->qpu.flags.mpf != V3D_QPU_PF_NONE ||
@@ -277,8 +352,15 @@ v3d_generate_code_block(struct v3d_compile *c,
assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
- if (!dst.magic ||
- dst.index != V3D_QPU_WADDR_R5) {
+ bool use_rf;
+ if (c->devinfo->has_accumulators) {
+ use_rf = !dst.magic ||
+ dst.index != V3D_QPU_WADDR_R5;
+ } else {
+ use_rf = dst.magic || dst.index != 0;
+ }
+
+ if (use_rf) {
assert(c->devinfo->ver >= 40);
if (qinst->qpu.sig.ldunif) {
@@ -300,13 +382,18 @@ v3d_generate_code_block(struct v3d_compile *c,
qinst->qpu.sig_magic = dst.magic;
} else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) {
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
+
if (nsrc >= 1) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.add.a, src[0]);
+ &qinst->qpu.alu.add.a.mux,
+ &qinst->qpu.alu.add.a.raddr,
+ src[0], c->devinfo);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.add.b, src[1]);
+ &qinst->qpu.alu.add.b.mux,
+ &qinst->qpu.alu.add.b.raddr,
+ src[1], c->devinfo);
}
qinst->qpu.alu.add.waddr = dst.index;
@@ -314,17 +401,21 @@ v3d_generate_code_block(struct v3d_compile *c,
} else {
if (nsrc >= 1) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.mul.a, src[0]);
+ &qinst->qpu.alu.mul.a.mux,
+ &qinst->qpu.alu.mul.a.raddr,
+ src[0], c->devinfo);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.mul.b, src[1]);
+ &qinst->qpu.alu.mul.b.mux,
+ &qinst->qpu.alu.mul.b.raddr,
+ src[1], c->devinfo);
}
qinst->qpu.alu.mul.waddr = dst.index;
qinst->qpu.alu.mul.magic_write = dst.magic;
- if (is_no_op_mov(qinst)) {
+ if (is_no_op_mov(qinst, c->devinfo)) {
vir_remove_instruction(c, qinst);
continue;
}
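The 7.x variant of the no-op-MOV test is simpler than the mux-based 3.3/4.x one: with plain raddrs, a mul MOV is removable when the write is not magic and it reads the register it writes. A one-line sketch of that core check (the packing, flags and condition checks from is_no_op_mov still apply on top of it):

#include <stdbool.h>
#include <stdint.h>

/* v71 core of the no-op-MOV test: a mul MOV copies rf[raddr] to
 * rf[waddr], so it is removable when the write is not magic and the
 * two indices match.
 */
static bool
v71_mov_src_dst_equal(bool magic_write, uint8_t waddr, uint8_t raddr)
{
        return !magic_write && raddr == waddr;
}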
diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build
index 2c10e46b188..73cb7aa0575 100644
--- a/src/broadcom/meson.build
+++ b/src/broadcom/meson.build
@@ -22,7 +22,7 @@ inc_broadcom = include_directories('.', 'cle')
subdir('cle')
-v3d_versions = ['33', '41', '42']
+v3d_versions = ['33', '41', '42', '71']
v3d_libs = []
if with_gallium_v3d or with_broadcom_vk
diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
index 28fb2357b97..c1590a760de 100644
--- a/src/broadcom/qpu/qpu_disasm.c
+++ b/src/broadcom/qpu/qpu_disasm.c
@@ -56,13 +56,14 @@ pad_to(struct disasm_state *disasm, int n)
static void
-v3d_qpu_disasm_raddr(struct disasm_state *disasm,
- const struct v3d_qpu_instr *instr, uint8_t mux)
+v3d33_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ enum v3d_qpu_mux mux)
{
if (mux == V3D_QPU_MUX_A) {
append(disasm, "rf%d", instr->raddr_a);
} else if (mux == V3D_QPU_MUX_B) {
- if (instr->sig.small_imm) {
+ if (instr->sig.small_imm_b) {
uint32_t val;
ASSERTED bool ok =
v3d_qpu_small_imm_unpack(disasm->devinfo,
@@ -82,6 +83,64 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm,
}
}
+enum v3d_qpu_input_class {
+ V3D_QPU_ADD_A,
+ V3D_QPU_ADD_B,
+ V3D_QPU_MUL_A,
+ V3D_QPU_MUL_B
+};
+
+static void
+v3d71_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ uint8_t raddr,
+ enum v3d_qpu_input_class input_class)
+{
+ bool is_small_imm = false;
+ switch (input_class) {
+ case V3D_QPU_ADD_A:
+ is_small_imm = instr->sig.small_imm_a;
+ break;
+ case V3D_QPU_ADD_B:
+ is_small_imm = instr->sig.small_imm_b;
+ break;
+ case V3D_QPU_MUL_A:
+ is_small_imm = instr->sig.small_imm_c;
+ break;
+ case V3D_QPU_MUL_B:
+ is_small_imm = instr->sig.small_imm_d;
+ break;
+ }
+
+ if (is_small_imm) {
+ uint32_t val;
+ ASSERTED bool ok =
+ v3d_qpu_small_imm_unpack(disasm->devinfo,
+ raddr,
+ &val);
+
+ if ((int)val >= -16 && (int)val <= 15)
+ append(disasm, "%d", val);
+ else
+ append(disasm, "0x%08x", val);
+ assert(ok);
+ } else {
+ append(disasm, "rf%d", raddr);
+ }
+}
+
+static void
+v3d_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ const struct v3d_qpu_input *input,
+ enum v3d_qpu_input_class input_class)
+{
+ if (disasm->devinfo->ver < 71)
+ v3d33_qpu_disasm_raddr(disasm, instr, input->mux);
+ else
+ v3d71_qpu_disasm_raddr(disasm, instr, input->raddr, input_class);
+}
+
static void
v3d_qpu_disasm_waddr(struct disasm_state *disasm, uint32_t waddr, bool magic)
{
@@ -121,16 +180,16 @@ v3d_qpu_disasm_add(struct disasm_state *disasm,
if (num_src >= 1) {
if (has_dst)
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.a, V3D_QPU_ADD_A);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.add.a_unpack));
+ v3d_qpu_unpack_name(instr->alu.add.a.unpack));
}
if (num_src >= 2) {
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.b, V3D_QPU_ADD_B);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.add.b_unpack));
+ v3d_qpu_unpack_name(instr->alu.add.b.unpack));
}
}
@@ -164,16 +223,16 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm,
if (num_src >= 1) {
if (has_dst)
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.a, V3D_QPU_MUL_A);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.mul.a_unpack));
+ v3d_qpu_unpack_name(instr->alu.mul.a.unpack));
}
if (num_src >= 2) {
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.b, V3D_QPU_MUL_B);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.mul.b_unpack));
+ v3d_qpu_unpack_name(instr->alu.mul.b.unpack));
}
}
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index 60dabf74e8e..44f20618a5a 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -35,6 +35,14 @@ v3d_qpu_magic_waddr_name(const struct v3d_device_info *devinfo,
if (devinfo->ver < 40 && waddr == V3D_QPU_WADDR_TMU)
return "tmu";
+ /* On V3D 7.x, QUAD and REP alias R5 and R5REP in the
+ * table below.
+ */
+ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_QUAD)
+ return "quad";
+
+ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_REP)
+ return "rep";
+
static const char *waddr_magic[] = {
[V3D_QPU_WADDR_R0] = "r0",
[V3D_QPU_WADDR_R1] = "r1",
@@ -169,6 +177,12 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op)
[V3D_QPU_A_ITOF] = "itof",
[V3D_QPU_A_CLZ] = "clz",
[V3D_QPU_A_UTOF] = "utof",
+ [V3D_QPU_A_MOV] = "mov",
+ [V3D_QPU_A_FMOV] = "fmov",
+ [V3D_QPU_A_VPACK] = "vpack",
+ [V3D_QPU_A_V8PACK] = "v8pack",
+ [V3D_QPU_A_V10PACK] = "v10pack",
+ [V3D_QPU_A_V11FPACK] = "v11fpack",
};
if (op >= ARRAY_SIZE(op_names))
@@ -191,6 +205,12 @@ v3d_qpu_mul_op_name(enum v3d_qpu_mul_op op)
[V3D_QPU_M_MOV] = "mov",
[V3D_QPU_M_NOP] = "nop",
[V3D_QPU_M_FMUL] = "fmul",
+ [V3D_QPU_M_FTOUNORM16] = "ftounorm16",
+ [V3D_QPU_M_FTOSNORM16] = "ftosnorm16",
+ [V3D_QPU_M_VFTOUNORM8] = "vftounorm8",
+ [V3D_QPU_M_VFTOSNORM8] = "vftosnorm8",
+ [V3D_QPU_M_VFTOUNORM10LO] = "vftounorm10lo",
+ [V3D_QPU_M_VFTOUNORM10HI] = "vftounorm10hi",
};
if (op >= ARRAY_SIZE(op_names))
@@ -450,6 +470,13 @@ static const uint8_t add_op_args[] = {
[V3D_QPU_A_ITOF] = D | A,
[V3D_QPU_A_CLZ] = D | A,
[V3D_QPU_A_UTOF] = D | A,
+
+ [V3D_QPU_A_MOV] = D | A,
+ [V3D_QPU_A_FMOV] = D | A,
+ [V3D_QPU_A_VPACK] = D | A | B,
+ [V3D_QPU_A_V8PACK] = D | A | B,
+ [V3D_QPU_A_V10PACK] = D | A | B,
+ [V3D_QPU_A_V11FPACK] = D | A | B,
};
static const uint8_t mul_op_args[] = {
@@ -463,6 +490,12 @@ static const uint8_t mul_op_args[] = {
[V3D_QPU_M_NOP] = 0,
[V3D_QPU_M_MOV] = D | A,
[V3D_QPU_M_FMUL] = D | A | B,
+ [V3D_QPU_M_FTOUNORM16] = D | A,
+ [V3D_QPU_M_FTOSNORM16] = D | A,
+ [V3D_QPU_M_VFTOUNORM8] = D | A,
+ [V3D_QPU_M_VFTOSNORM8] = D | A,
+ [V3D_QPU_M_VFTOUNORM10LO] = D | A,
+ [V3D_QPU_M_VFTOUNORM10HI] = D | A,
};
bool
@@ -636,12 +669,14 @@ v3d_qpu_add_op_writes_vpm(enum v3d_qpu_add_op op)
}
bool
-v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
+v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst)
{
- if (inst->sig.ldtlb ||
- inst->sig.ldtlbu)
- return true;
+ return inst->sig.ldtlb || inst->sig.ldtlbu;
+}
+bool
+v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst)
+{
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
inst->alu.add.magic_write &&
@@ -659,6 +694,12 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
return false;
}
+bool
+v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
+{
+ return v3d_qpu_writes_tlb(inst) || v3d_qpu_reads_tlb(inst);
+}
+
bool
v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
{
@@ -846,6 +887,9 @@ bool
v3d_qpu_writes_r3(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R3))
return true;
@@ -856,6 +900,9 @@ bool
v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
inst->alu.add.magic_write &&
@@ -886,6 +933,9 @@ bool
v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R5))
return true;
@@ -896,6 +946,9 @@ bool
v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (v3d_qpu_writes_r5(devinfo, inst))
return true;
if (v3d_qpu_writes_r4(devinfo, inst))
@@ -912,16 +965,68 @@ v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
return false;
}
+bool
+v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst)
+{
+ if (devinfo->ver >= 71 &&
+ (inst->sig.ldvary || inst->sig.ldunif || inst->sig.ldunifa)) {
+ return true;
+ }
+
+ return false;
+}
+
bool
v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
{
int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
- return ((add_nsrc > 0 && inst->alu.add.a == mux) ||
- (add_nsrc > 1 && inst->alu.add.b == mux) ||
- (mul_nsrc > 0 && inst->alu.mul.a == mux) ||
- (mul_nsrc > 1 && inst->alu.mul.b == mux));
+ return ((add_nsrc > 0 && inst->alu.add.a.mux == mux) ||
+ (add_nsrc > 1 && inst->alu.add.b.mux == mux) ||
+ (mul_nsrc > 0 && inst->alu.mul.a.mux == mux) ||
+ (mul_nsrc > 1 && inst->alu.mul.b.mux == mux));
+}
+
+bool
+v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr)
+{
+ int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
+ int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
+
+ return (add_nsrc > 0 && !inst->sig.small_imm_a && inst->alu.add.a.raddr == raddr) ||
+ (add_nsrc > 1 && !inst->sig.small_imm_b && inst->alu.add.b.raddr == raddr) ||
+ (mul_nsrc > 0 && !inst->sig.small_imm_c && inst->alu.mul.a.raddr == raddr) ||
+ (mul_nsrc > 1 && !inst->sig.small_imm_d && inst->alu.mul.b.raddr == raddr);
+}
+
+bool
+v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
+ uint8_t waddr)
+{
+ if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+
+ if (v3d_qpu_add_op_has_dst(inst->alu.add.op) &&
+ !inst->alu.add.magic_write &&
+ inst->alu.add.waddr == waddr) {
+ return true;
+ }
+
+ if (v3d_qpu_mul_op_has_dst(inst->alu.mul.op) &&
+ !inst->alu.mul.magic_write &&
+ inst->alu.mul.waddr == waddr) {
+ return true;
+ }
+
+ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
+ !inst->sig_magic && inst->sig_addr == waddr) {
+ return true;
+ }
+
+ return false;
}
bool
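[Editor's note] The hunks above split v3d_qpu_uses_tlb() into separate
read/write queries and add raddr-based helpers for V3D 7.x, where ALU
sources are plain register-file addresses instead of mux selectors. A
minimal sketch of how a caller might dispatch between the two models; the
wrapper name is hypothetical, only the two helpers come from the patch:

    /* Hypothetical wrapper, not part of the patch. */
    static bool
    qpu_instr_reads_source(const struct v3d_device_info *devinfo,
                           const struct v3d_qpu_instr *inst,
                           enum v3d_qpu_mux mux, uint8_t raddr)
    {
            if (devinfo->ver < 71)
                    return v3d_qpu_uses_mux(inst, mux);        /* 4.x: mux selectors */
            else
                    return v3d71_qpu_reads_raddr(inst, raddr); /* 7.x: register file */
    }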
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index 2e133472698..56eee9f9cac 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -50,10 +50,13 @@ struct v3d_qpu_sig {
bool ldvpm:1;
bool ldtlb:1;
bool ldtlbu:1;
- bool small_imm:1;
bool ucb:1;
bool rotate:1;
bool wrtmuc:1;
+ bool small_imm_a:1; /* raddr_a (add a), since V3D 7.x */
+ bool small_imm_b:1; /* raddr_b (add b) */
+ bool small_imm_c:1; /* raddr_c (mul a), since V3D 7.x */
+ bool small_imm_d:1; /* raddr_d (mul b), since V3D 7.x */
};
enum v3d_qpu_cond {
@@ -88,12 +91,13 @@ enum v3d_qpu_uf {
};
enum v3d_qpu_waddr {
- V3D_QPU_WADDR_R0 = 0,
- V3D_QPU_WADDR_R1 = 1,
- V3D_QPU_WADDR_R2 = 2,
- V3D_QPU_WADDR_R3 = 3,
- V3D_QPU_WADDR_R4 = 4,
- V3D_QPU_WADDR_R5 = 5,
+ V3D_QPU_WADDR_R0 = 0, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R1 = 1, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R5 = 5, /* V3D 4.x */
+ V3D_QPU_WADDR_QUAD = 5, /* V3D 7.x */
V3D_QPU_WADDR_NOP = 6,
V3D_QPU_WADDR_TLB = 7,
V3D_QPU_WADDR_TLBU = 8,
@@ -108,12 +112,12 @@ enum v3d_qpu_waddr {
V3D_QPU_WADDR_SYNC = 16,
V3D_QPU_WADDR_SYNCU = 17,
V3D_QPU_WADDR_SYNCB = 18,
- V3D_QPU_WADDR_RECIP = 19,
- V3D_QPU_WADDR_RSQRT = 20,
- V3D_QPU_WADDR_EXP = 21,
- V3D_QPU_WADDR_LOG = 22,
- V3D_QPU_WADDR_SIN = 23,
- V3D_QPU_WADDR_RSQRT2 = 24,
+ V3D_QPU_WADDR_RECIP = 19, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_RSQRT = 20, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_EXP = 21, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_LOG = 22, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_SIN = 23, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_RSQRT2 = 24, /* Reserved on V3D 7.x */
V3D_QPU_WADDR_TMUC = 32,
V3D_QPU_WADDR_TMUS = 33,
V3D_QPU_WADDR_TMUT = 34,
@@ -129,7 +133,8 @@ enum v3d_qpu_waddr {
V3D_QPU_WADDR_TMUHSCM = 44,
V3D_QPU_WADDR_TMUHSF = 45,
V3D_QPU_WADDR_TMUHSLOD = 46,
- V3D_QPU_WADDR_R5REP = 55,
+ V3D_QPU_WADDR_R5REP = 55, /* V3D 4.x */
+ V3D_QPU_WADDR_REP = 55, /* V3D 7.x */
};
struct v3d_qpu_flags {
@@ -222,6 +227,14 @@ enum v3d_qpu_add_op {
V3D_QPU_A_ITOF,
V3D_QPU_A_CLZ,
V3D_QPU_A_UTOF,
+
+ /* V3D 7.x */
+ V3D_QPU_A_FMOV,
+ V3D_QPU_A_MOV,
+ V3D_QPU_A_VPACK,
+ V3D_QPU_A_V8PACK,
+ V3D_QPU_A_V10PACK,
+ V3D_QPU_A_V11FPACK,
};
enum v3d_qpu_mul_op {
@@ -235,6 +248,14 @@ enum v3d_qpu_mul_op {
V3D_QPU_M_MOV,
V3D_QPU_M_NOP,
V3D_QPU_M_FMUL,
+
+ /* V3D 7.x */
+ V3D_QPU_M_FTOUNORM16,
+ V3D_QPU_M_FTOSNORM16,
+ V3D_QPU_M_VFTOUNORM8,
+ V3D_QPU_M_VFTOSNORM8,
+ V3D_QPU_M_VFTOUNORM10LO,
+ V3D_QPU_M_VFTOUNORM10HI,
};
enum v3d_qpu_output_pack {
@@ -276,6 +297,15 @@ enum v3d_qpu_input_unpack {
/** Swap high and low 16 bits */
V3D_QPU_UNPACK_SWAP_16,
+
+ /** Convert low 16 bits from 16-bit integer to unsigned 32-bit int */
+ V3D_QPU_UNPACK_UL,
+ /** Convert high 16 bits from 16-bit integer to unsigned 32-bit int */
+ V3D_QPU_UNPACK_UH,
+ /** Convert low 16 bits from 16-bit integer to signed 32-bit int */
+ V3D_QPU_UNPACK_IL,
+ /** Convert high 16 bits from 16-bit integer to signed 32-bit int */
+ V3D_QPU_UNPACK_IH,
};
enum v3d_qpu_mux {
@@ -289,25 +319,29 @@ enum v3d_qpu_mux {
V3D_QPU_MUX_B,
};
+struct v3d_qpu_input {
+ union {
+ enum v3d_qpu_mux mux; /* V3D 4.x */
+ uint8_t raddr; /* V3D 7.x */
+ };
+ enum v3d_qpu_input_unpack unpack;
+};
+
struct v3d_qpu_alu_instr {
struct {
enum v3d_qpu_add_op op;
- enum v3d_qpu_mux a, b;
+ struct v3d_qpu_input a, b;
uint8_t waddr;
bool magic_write;
enum v3d_qpu_output_pack output_pack;
- enum v3d_qpu_input_unpack a_unpack;
- enum v3d_qpu_input_unpack b_unpack;
} add;
struct {
enum v3d_qpu_mul_op op;
- enum v3d_qpu_mux a, b;
+ struct v3d_qpu_input a, b;
uint8_t waddr;
bool magic_write;
enum v3d_qpu_output_pack output_pack;
- enum v3d_qpu_input_unpack a_unpack;
- enum v3d_qpu_input_unpack b_unpack;
} mul;
};
@@ -379,8 +413,8 @@ struct v3d_qpu_instr {
struct v3d_qpu_sig sig;
uint8_t sig_addr;
bool sig_magic; /* If the signal writes to a magic address */
- uint8_t raddr_a;
- uint8_t raddr_b;
+ uint8_t raddr_a; /* V3D 4.x */
+ uint8_t raddr_b; /* V3D 4.x (holds packed small immediate in 7.x too) */
struct v3d_qpu_flags flags;
union {
@@ -450,6 +484,8 @@ bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
+bool v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
@@ -464,6 +500,8 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
+bool v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
@@ -483,4 +521,9 @@ bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+
+bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr);
+bool v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
+ uint8_t waddr);
#endif
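[Editor's note] With struct v3d_qpu_input, the 4.x mux selector and the
7.x register-file address overlay each other in a union, so consumers
must check devinfo->ver before reading either member. A minimal sketch,
assuming qpu_instr.h and its v3d_device_info dependency are available;
the helper itself is illustrative only:

    #include <stdio.h>
    #include "qpu_instr.h"

    static void
    print_add_src_a(const struct v3d_device_info *devinfo,
                    const struct v3d_qpu_instr *inst)
    {
            const struct v3d_qpu_input *a = &inst->alu.add.a;
            if (devinfo->ver < 71)
                    printf("add.a: mux %d\n", a->mux);   /* V3D 4.x encoding */
            else
                    printf("add.a: rf%d\n", a->raddr);   /* V3D 7.x encoding */
    }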
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index 94629aff4fc..4e3c3da8866 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -84,6 +84,9 @@
#define V3D_QPU_MUL_A_SHIFT 18
#define V3D_QPU_MUL_A_MASK QPU_MASK(20, 18)
+#define V3D_QPU_RADDR_C_SHIFT 18
+#define V3D_QPU_RADDR_C_MASK QPU_MASK(23, 18)
+
#define V3D_QPU_ADD_B_SHIFT 15
#define V3D_QPU_ADD_B_MASK QPU_MASK(17, 15)
@@ -98,6 +101,9 @@
#define V3D_QPU_BRANCH_BDI_SHIFT 12
#define V3D_QPU_BRANCH_BDI_MASK QPU_MASK(13, 12)
+#define V3D_QPU_RADDR_D_SHIFT 12
+#define V3D_QPU_RADDR_D_MASK QPU_MASK(17, 12)
+
#define V3D_QPU_RADDR_A_SHIFT 6
#define V3D_QPU_RADDR_A_MASK QPU_MASK(11, 6)
@@ -112,12 +118,15 @@
#define LDTMU .ldtmu = true
#define LDVARY .ldvary = true
#define LDVPM .ldvpm = true
-#define SMIMM .small_imm = true
#define LDTLB .ldtlb = true
#define LDTLBU .ldtlbu = true
#define UCB .ucb = true
#define ROT .rotate = true
#define WRTMUC .wrtmuc = true
+#define SMIMM_A .small_imm_a = true
+#define SMIMM_B .small_imm_b = true
+#define SMIMM_C .small_imm_c = true
+#define SMIMM_D .small_imm_d = true
static const struct v3d_qpu_sig v33_sig_map[] = {
/* MISC R3 R4 R5 */
@@ -135,8 +144,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = {
[11] = { THRSW, LDVARY, LDUNIF },
[12] = { LDVARY, LDTMU, },
[13] = { THRSW, LDVARY, LDTMU, },
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY, },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
/* 18-21 reserved */
@@ -148,8 +157,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = {
[27] = { THRSW, LDVPM, LDUNIF },
[28] = { LDVPM, LDTMU, },
[29] = { THRSW, LDVPM, LDTMU, },
- [30] = { SMIMM, LDVPM, },
- [31] = { SMIMM, },
+ [30] = { SMIMM_B, LDVPM, },
+ [31] = { SMIMM_B, },
};
static const struct v3d_qpu_sig v40_sig_map[] = {
@@ -167,8 +176,8 @@ static const struct v3d_qpu_sig v40_sig_map[] = {
[10] = { LDVARY, LDUNIF },
[11] = { THRSW, LDVARY, LDUNIF },
/* 12-13 reserved */
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY, },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
[18] = { WRTMUC },
@@ -178,7 +187,7 @@ static const struct v3d_qpu_sig v40_sig_map[] = {
[22] = { UCB, },
[23] = { ROT, },
/* 24-30 reserved */
- [31] = { SMIMM, LDTMU, },
+ [31] = { SMIMM_B, LDTMU, },
};
static const struct v3d_qpu_sig v41_sig_map[] = {
@@ -197,8 +206,8 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
[11] = { THRSW, LDVARY, LDUNIF },
[12] = { LDUNIFRF },
[13] = { THRSW, LDUNIFRF },
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
[18] = { WRTMUC },
@@ -210,7 +219,41 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
[24] = { LDUNIFA},
[25] = { LDUNIFARF },
/* 26-30 reserved */
- [31] = { SMIMM, LDTMU, },
+ [31] = { SMIMM_B, LDTMU, },
+};
+
+static const struct v3d_qpu_sig v71_sig_map[] = {
+ /* MISC phys RF0 */
+ [0] = { },
+ [1] = { THRSW, },
+ [2] = { LDUNIF },
+ [3] = { THRSW, LDUNIF },
+ [4] = { LDTMU, },
+ [5] = { THRSW, LDTMU, },
+ [6] = { LDTMU, LDUNIF },
+ [7] = { THRSW, LDTMU, LDUNIF },
+ [8] = { LDVARY, },
+ [9] = { THRSW, LDVARY, },
+ [10] = { LDVARY, LDUNIF },
+ [11] = { THRSW, LDVARY, LDUNIF },
+ [12] = { LDUNIFRF },
+ [13] = { THRSW, LDUNIFRF },
+ [14] = { SMIMM_A, },
+ [15] = { SMIMM_B, },
+ [16] = { LDTLB, },
+ [17] = { LDTLBU, },
+ [18] = { WRTMUC },
+ [19] = { THRSW, WRTMUC },
+ [20] = { LDVARY, WRTMUC },
+ [21] = { THRSW, LDVARY, WRTMUC },
+ [22] = { UCB, },
+ /* 23 reserved */
+ [24] = { LDUNIFA},
+ [25] = { LDUNIFARF },
+ /* 26-29 reserved */
+ [30] = { SMIMM_C, },
+ [31] = { SMIMM_D, },
};
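[Editor's note] Worked example of the new map: on 7.x, packed signal 14
selects a small immediate in raddr_a and 30 one in raddr_c; on 4.x the
same slots meant SMIMM_B+LDVARY and "reserved". A sketch using the unpack
helper from this file, assuming a devinfo with ver = 71:

    #include <assert.h>

    static void
    check_v71_smimm_signals(void)
    {
            struct v3d_device_info devinfo = { .ver = 71 };
            struct v3d_qpu_sig sig;

            if (v3d_qpu_sig_unpack(&devinfo, 14, &sig))
                    assert(sig.small_imm_a);    /* [14] = { SMIMM_A, } */
            if (v3d_qpu_sig_unpack(&devinfo, 30, &sig))
                    assert(sig.small_imm_c);    /* [30] = { SMIMM_C, } */
    }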
bool
@@ -221,7 +264,9 @@ v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo,
if (packed_sig >= ARRAY_SIZE(v33_sig_map))
return false;
- if (devinfo->ver >= 41)
+ if (devinfo->ver >= 71)
+ *sig = v71_sig_map[packed_sig];
+ else if (devinfo->ver >= 41)
*sig = v41_sig_map[packed_sig];
else if (devinfo->ver == 40)
*sig = v40_sig_map[packed_sig];
@@ -240,7 +285,9 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo,
{
static const struct v3d_qpu_sig *map;
- if (devinfo->ver >= 41)
+ if (devinfo->ver >= 71)
+ map = v71_sig_map;
+ else if (devinfo->ver >= 41)
map = v41_sig_map;
else if (devinfo->ver == 40)
map = v40_sig_map;
@@ -443,16 +490,26 @@ v3d_qpu_flags_pack(const struct v3d_device_info *devinfo,
/* Make a mapping of the table of opcodes in the spec. The opcode is
* determined by a combination of the opcode field, and in the case of 0 or
- * 1-arg opcodes, the mux_b field as well.
+ * 1-arg opcodes, the mux (version <= 42) or raddr (version >= 71) field as
+ * well.
*/
-#define MUX_MASK(bot, top) (((1 << (top + 1)) - 1) - ((1 << (bot)) - 1))
-#define ANYMUX MUX_MASK(0, 7)
+#define OP_MASK(val) BITFIELD64_BIT(val)
+#define OP_RANGE(bot, top) BITFIELD64_RANGE(bot, top - bot + 1)
+#define ANYMUX OP_RANGE(0, 7)
+#define ANYOPMASK OP_RANGE(0, 63)
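[Editor's note] The 64-bit opcode masks generalise the old 8-bit mux
masks. Restated standalone (an editorial expansion, assuming the
BITFIELD64_* helpers behave as in Mesa's util/macros.h):

    #include <assert.h>
    #include <stdint.h>

    #define BITFIELD64_MASK(b)  ((b) == 64 ? ~UINT64_C(0) : (UINT64_C(1) << (b)) - 1)
    #define OP_MASK(val)        (UINT64_C(1) << (val))
    #define OP_RANGE(bot, top)  (BITFIELD64_MASK((top) + 1) & ~BITFIELD64_MASK(bot))

    static void
    check_op_masks(void)
    {
            assert(OP_MASK(3) == 0x8);               /* bit 3 */
            assert(OP_RANGE(4, 6) == 0x70);          /* bits 4..6 */
            assert(OP_RANGE(0, 7) == 0xff);          /* ANYMUX */
            assert(OP_RANGE(0, 63) == ~UINT64_C(0)); /* ANYOPMASK */
    }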
struct opcode_desc {
uint8_t opcode_first;
uint8_t opcode_last;
- uint8_t mux_b_mask;
- uint8_t mux_a_mask;
+
+ union {
+ struct {
+ uint8_t b_mask;
+ uint8_t a_mask;
+ } mux;
+ uint64_t raddr_mask;
+ };
+
uint8_t op;
/* first_ver == 0 if it's the same across all V3D versions.
@@ -465,122 +522,321 @@ struct opcode_desc {
uint8_t last_ver;
};
-static const struct opcode_desc add_ops[] = {
+static const struct opcode_desc add_ops_v33[] = {
/* FADD is FADDNF depending on the order of the mux_a/mux_b. */
- { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADD },
- { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADDNF },
- { 53, 55, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
- { 56, 56, ANYMUX, ANYMUX, V3D_QPU_A_ADD },
- { 57, 59, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
- { 60, 60, ANYMUX, ANYMUX, V3D_QPU_A_SUB },
- { 61, 63, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
- { 64, 111, ANYMUX, ANYMUX, V3D_QPU_A_FSUB },
- { 120, 120, ANYMUX, ANYMUX, V3D_QPU_A_MIN },
- { 121, 121, ANYMUX, ANYMUX, V3D_QPU_A_MAX },
- { 122, 122, ANYMUX, ANYMUX, V3D_QPU_A_UMIN },
- { 123, 123, ANYMUX, ANYMUX, V3D_QPU_A_UMAX },
- { 124, 124, ANYMUX, ANYMUX, V3D_QPU_A_SHL },
- { 125, 125, ANYMUX, ANYMUX, V3D_QPU_A_SHR },
- { 126, 126, ANYMUX, ANYMUX, V3D_QPU_A_ASR },
- { 127, 127, ANYMUX, ANYMUX, V3D_QPU_A_ROR },
+ { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADD },
+ { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADDNF },
+ { 53, 55, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
+ { 56, 56, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ADD },
+ { 57, 59, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
+ { 60, 60, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SUB },
+ { 61, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
+ { 64, 111, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FSUB },
+ { 120, 120, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MIN },
+ { 121, 121, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MAX },
+ { 122, 122, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMIN },
+ { 123, 123, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMAX },
+ { 124, 124, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHL },
+ { 125, 125, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHR },
+ { 126, 126, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ASR },
+ { 127, 127, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ROR },
/* FMIN is instead FMAX depending on the order of the mux_a/mux_b. */
- { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMIN },
- { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMAX },
- { 176, 180, ANYMUX, ANYMUX, V3D_QPU_A_VFMIN },
-
- { 181, 181, ANYMUX, ANYMUX, V3D_QPU_A_AND },
- { 182, 182, ANYMUX, ANYMUX, V3D_QPU_A_OR },
- { 183, 183, ANYMUX, ANYMUX, V3D_QPU_A_XOR },
-
- { 184, 184, ANYMUX, ANYMUX, V3D_QPU_A_VADD },
- { 185, 185, ANYMUX, ANYMUX, V3D_QPU_A_VSUB },
- { 186, 186, 1 << 0, ANYMUX, V3D_QPU_A_NOT },
- { 186, 186, 1 << 1, ANYMUX, V3D_QPU_A_NEG },
- { 186, 186, 1 << 2, ANYMUX, V3D_QPU_A_FLAPUSH },
- { 186, 186, 1 << 3, ANYMUX, V3D_QPU_A_FLBPUSH },
- { 186, 186, 1 << 4, ANYMUX, V3D_QPU_A_FLPOP },
- { 186, 186, 1 << 5, ANYMUX, V3D_QPU_A_RECIP },
- { 186, 186, 1 << 6, ANYMUX, V3D_QPU_A_SETMSF },
- { 186, 186, 1 << 7, ANYMUX, V3D_QPU_A_SETREVF },
- { 187, 187, 1 << 0, 1 << 0, V3D_QPU_A_NOP, 0 },
- { 187, 187, 1 << 0, 1 << 1, V3D_QPU_A_TIDX },
- { 187, 187, 1 << 0, 1 << 2, V3D_QPU_A_EIDX },
- { 187, 187, 1 << 0, 1 << 3, V3D_QPU_A_LR },
- { 187, 187, 1 << 0, 1 << 4, V3D_QPU_A_VFLA },
- { 187, 187, 1 << 0, 1 << 5, V3D_QPU_A_VFLNA },
- { 187, 187, 1 << 0, 1 << 6, V3D_QPU_A_VFLB },
- { 187, 187, 1 << 0, 1 << 7, V3D_QPU_A_VFLNB },
-
- { 187, 187, 1 << 1, MUX_MASK(0, 2), V3D_QPU_A_FXCD },
- { 187, 187, 1 << 1, 1 << 3, V3D_QPU_A_XCD },
- { 187, 187, 1 << 1, MUX_MASK(4, 6), V3D_QPU_A_FYCD },
- { 187, 187, 1 << 1, 1 << 7, V3D_QPU_A_YCD },
-
- { 187, 187, 1 << 2, 1 << 0, V3D_QPU_A_MSF },
- { 187, 187, 1 << 2, 1 << 1, V3D_QPU_A_REVF },
- { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_VDWWT, 33 },
- { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_IID, 40 },
- { 187, 187, 1 << 2, 1 << 3, V3D_QPU_A_SAMPID, 40 },
- { 187, 187, 1 << 2, 1 << 4, V3D_QPU_A_BARRIERID, 40 },
- { 187, 187, 1 << 2, 1 << 5, V3D_QPU_A_TMUWT },
- { 187, 187, 1 << 2, 1 << 6, V3D_QPU_A_VPMWT },
- { 187, 187, 1 << 2, 1 << 7, V3D_QPU_A_FLAFIRST, 41 },
- { 187, 187, 1 << 3, 1 << 0, V3D_QPU_A_FLNAFIRST, 41 },
- { 187, 187, 1 << 3, ANYMUX, V3D_QPU_A_VPMSETUP, 33 },
-
- { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 },
- { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 },
- { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 },
- { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 },
- { 188, 188, 1 << 2, ANYMUX, V3D_QPU_A_LDVPMP, 40 },
- { 188, 188, 1 << 3, ANYMUX, V3D_QPU_A_RSQRT, 41 },
- { 188, 188, 1 << 4, ANYMUX, V3D_QPU_A_EXP, 41 },
- { 188, 188, 1 << 5, ANYMUX, V3D_QPU_A_LOG, 41 },
- { 188, 188, 1 << 6, ANYMUX, V3D_QPU_A_SIN, 41 },
- { 188, 188, 1 << 7, ANYMUX, V3D_QPU_A_RSQRT2, 41 },
- { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 },
- { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 },
+ { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMIN },
+ { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMAX },
+ { 176, 180, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMIN },
+
+ { 181, 181, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_AND },
+ { 182, 182, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_OR },
+ { 183, 183, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_XOR },
+
+ { 184, 184, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VADD },
+ { 185, 185, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VSUB },
+ { 186, 186, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_NOT },
+ { 186, 186, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_NEG },
+ { 186, 186, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_FLAPUSH },
+ { 186, 186, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FLBPUSH },
+ { 186, 186, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_FLPOP },
+ { 186, 186, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_RECIP },
+ { 186, 186, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SETMSF },
+ { 186, 186, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_SETREVF },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(1), V3D_QPU_A_TIDX },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(2), V3D_QPU_A_EIDX },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(3), V3D_QPU_A_LR },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(4), V3D_QPU_A_VFLA },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(5), V3D_QPU_A_VFLNA },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VFLB },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(7), V3D_QPU_A_VFLNB },
+
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(0, 2), V3D_QPU_A_FXCD },
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(3), V3D_QPU_A_XCD },
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(4, 6), V3D_QPU_A_FYCD },
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(7), V3D_QPU_A_YCD },
+
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(0), V3D_QPU_A_MSF },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(1), V3D_QPU_A_REVF },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_VDWWT, 33 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_IID, 40 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(3), V3D_QPU_A_SAMPID, 40 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(4), V3D_QPU_A_BARRIERID, 40 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(5), V3D_QPU_A_TMUWT },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VPMWT },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(7), V3D_QPU_A_FLAFIRST, 41 },
+ { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = OP_MASK(0), V3D_QPU_A_FLNAFIRST, 41 },
+ { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_VPMSETUP, 33 },
+
+ { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMP, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_EXP, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_LOG, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SIN, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT2, 41 },
+ { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 },
+ { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 },
/* FIXME: MORE COMPLICATED */
- /* { 190, 191, ANYMUX, ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */
+ /* { 190, 191, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */
- { 192, 239, ANYMUX, ANYMUX, V3D_QPU_A_FCMP },
- { 240, 244, ANYMUX, ANYMUX, V3D_QPU_A_VFMAX },
+ { 192, 239, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FCMP },
+ { 240, 244, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMAX },
- { 245, 245, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FROUND },
- { 245, 245, 1 << 3, ANYMUX, V3D_QPU_A_FTOIN },
- { 245, 245, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FTRUNC },
- { 245, 245, 1 << 7, ANYMUX, V3D_QPU_A_FTOIZ },
- { 246, 246, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FFLOOR },
- { 246, 246, 1 << 3, ANYMUX, V3D_QPU_A_FTOUZ },
- { 246, 246, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FCEIL },
- { 246, 246, 1 << 7, ANYMUX, V3D_QPU_A_FTOC },
+ { 245, 245, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FROUND },
+ { 245, 245, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIN },
+ { 245, 245, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FTRUNC },
+ { 245, 245, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIZ },
+ { 246, 246, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FFLOOR },
+ { 246, 246, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOUZ },
+ { 246, 246, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FCEIL },
+ { 246, 246, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOC },
- { 247, 247, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FDX },
- { 247, 247, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FDY },
+ { 247, 247, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FDX },
+ { 247, 247, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FDY },
/* The stvpms are distinguished by the waddr field. */
- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMV },
- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMD },
- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMP },
+ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMV },
+ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMD },
+ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMP },
+
+ { 252, 252, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_ITOF },
+ { 252, 252, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_CLZ },
+ { 252, 252, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_UTOF },
+};
+
+static const struct opcode_desc mul_ops_v33[] = {
+ { 1, 1, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_ADD },
+ { 2, 2, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SUB },
+ { 3, 3, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_UMUL24 },
+ { 4, 8, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_VFMUL },
+ { 9, 9, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SMUL24 },
+ { 10, 10, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_MULTOP },
+ { 14, 14, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 },
+ { 15, 15, .mux.b_mask = OP_RANGE(0, 3), .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 },
+ { 15, 15, .mux.b_mask = OP_MASK(4), .mux.a_mask = OP_MASK(0), V3D_QPU_M_NOP, 33, 42 },
+ { 15, 15, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_M_MOV, 33, 42 },
+
+ { 16, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMUL },
+};
+
+/* Note that it would have been possible to define all the add/mul opcodes
+ * in a single table, using first_ver/last_ver. But given how much changed
+ * for v71, separate tables are tidier. They are also smaller, which
+ * matters because right now we do a linear search over them.
+ *
+ * In case the tables are ever merged, first_ver is set to 71 for the
+ * opcodes that changed in v71.
+ */
+static const struct opcode_desc add_ops_v71[] = {
+ /* FADD is FADDNF depending on the order of the raddr_a/raddr_b. */
+ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD },
+ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADDNF },
+ { 53, 55, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
+ { 56, 56, .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD },
+ { 57, 59, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
+ { 60, 60, .raddr_mask = ANYOPMASK, V3D_QPU_A_SUB },
+ { 61, 63, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
+ { 64, 111, .raddr_mask = ANYOPMASK, V3D_QPU_A_FSUB },
+ { 120, 120, .raddr_mask = ANYOPMASK, V3D_QPU_A_MIN },
+ { 121, 121, .raddr_mask = ANYOPMASK, V3D_QPU_A_MAX },
+ { 122, 122, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMIN },
+ { 123, 123, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMAX },
+ { 124, 124, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHL },
+ { 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR },
+ { 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR },
+ { 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR },
+ /* FMIN is instead FMAX depending on the raddr_a/b order. */
+ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMIN },
+ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMAX },
+ { 176, 180, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFMIN },
+
+ { 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND },
+ { 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR },
+ { 183, 183, .raddr_mask = ANYOPMASK, V3D_QPU_A_XOR },
+ { 184, 184, .raddr_mask = ANYOPMASK, V3D_QPU_A_VADD },
+ { 185, 185, .raddr_mask = ANYOPMASK, V3D_QPU_A_VSUB },
+
+ { 186, 186, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOT },
+ { 186, 186, .raddr_mask = OP_MASK(1), V3D_QPU_A_NEG },
+ { 186, 186, .raddr_mask = OP_MASK(2), V3D_QPU_A_FLAPUSH },
+ { 186, 186, .raddr_mask = OP_MASK(3), V3D_QPU_A_FLBPUSH },
+ { 186, 186, .raddr_mask = OP_MASK(4), V3D_QPU_A_FLPOP },
+ { 186, 186, .raddr_mask = OP_MASK(5), V3D_QPU_A_CLZ },
+ { 186, 186, .raddr_mask = OP_MASK(6), V3D_QPU_A_SETMSF },
+ { 186, 186, .raddr_mask = OP_MASK(7), V3D_QPU_A_SETREVF },
+
+ { 187, 187, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 },
+ { 187, 187, .raddr_mask = OP_MASK(1), V3D_QPU_A_TIDX },
+ { 187, 187, .raddr_mask = OP_MASK(2), V3D_QPU_A_EIDX },
+ { 187, 187, .raddr_mask = OP_MASK(3), V3D_QPU_A_LR },
+ { 187, 187, .raddr_mask = OP_MASK(4), V3D_QPU_A_VFLA },
+ { 187, 187, .raddr_mask = OP_MASK(5), V3D_QPU_A_VFLNA },
+ { 187, 187, .raddr_mask = OP_MASK(6), V3D_QPU_A_VFLB },
+ { 187, 187, .raddr_mask = OP_MASK(7), V3D_QPU_A_VFLNB },
+ { 187, 187, .raddr_mask = OP_MASK(8), V3D_QPU_A_XCD },
+ { 187, 187, .raddr_mask = OP_MASK(9), V3D_QPU_A_YCD },
+ { 187, 187, .raddr_mask = OP_MASK(10), V3D_QPU_A_MSF },
+ { 187, 187, .raddr_mask = OP_MASK(11), V3D_QPU_A_REVF },
+ { 187, 187, .raddr_mask = OP_MASK(12), V3D_QPU_A_IID },
+ { 187, 187, .raddr_mask = OP_MASK(13), V3D_QPU_A_SAMPID },
+ { 187, 187, .raddr_mask = OP_MASK(14), V3D_QPU_A_BARRIERID },
+ { 187, 187, .raddr_mask = OP_MASK(15), V3D_QPU_A_TMUWT },
+ { 187, 187, .raddr_mask = OP_MASK(16), V3D_QPU_A_VPMWT },
+ { 187, 187, .raddr_mask = OP_MASK(17), V3D_QPU_A_FLAFIRST },
+ { 187, 187, .raddr_mask = OP_MASK(18), V3D_QPU_A_FLNAFIRST },
+
+ { 187, 187, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FXCD },
+ { 187, 187, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FYCD },
+
+ { 188, 188, .raddr_mask = OP_MASK(0), V3D_QPU_A_LDVPMV_IN, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(1), V3D_QPU_A_LDVPMD_IN, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(2), V3D_QPU_A_LDVPMP, 71 },
+
+ { 188, 188, .raddr_mask = OP_MASK(32), V3D_QPU_A_RECIP, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(33), V3D_QPU_A_RSQRT, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(34), V3D_QPU_A_EXP, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(35), V3D_QPU_A_LOG, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(36), V3D_QPU_A_SIN, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(37), V3D_QPU_A_RSQRT2, 71 },
+
+ { 189, 189, .raddr_mask = ANYOPMASK, V3D_QPU_A_LDVPMG_IN, 71 },
- { 252, 252, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_ITOF },
- { 252, 252, 1 << 3, ANYMUX, V3D_QPU_A_CLZ },
- { 252, 252, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_UTOF },
+ /* The stvpms are distinguished by the waddr field. */
+ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMV, 71},
+ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMD, 71},
+ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMP, 71},
+
+ { 192, 207, .raddr_mask = ANYOPMASK, V3D_QPU_A_FCMP, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FROUND, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FROUND, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FROUND, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FROUND, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(3), V3D_QPU_A_FTOIN, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(7), V3D_QPU_A_FTOIN, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(11), V3D_QPU_A_FTOIN, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(15), V3D_QPU_A_FTOIN, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FTRUNC, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FTRUNC, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FTRUNC, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FTRUNC, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(19), V3D_QPU_A_FTOIZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(23), V3D_QPU_A_FTOIZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(27), V3D_QPU_A_FTOIZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(31), V3D_QPU_A_FTOIZ, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FFLOOR, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FFLOOR, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(40, 42), V3D_QPU_A_FFLOOR, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(44, 46), V3D_QPU_A_FFLOOR, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(35), V3D_QPU_A_FTOUZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(39), V3D_QPU_A_FTOUZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(43), V3D_QPU_A_FTOUZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(47), V3D_QPU_A_FTOUZ, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(48, 50), V3D_QPU_A_FCEIL, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(52, 54), V3D_QPU_A_FCEIL, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(56, 58), V3D_QPU_A_FCEIL, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(60, 62), V3D_QPU_A_FCEIL, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(51), V3D_QPU_A_FTOC },
+ { 245, 245, .raddr_mask = OP_MASK(55), V3D_QPU_A_FTOC },
+ { 245, 245, .raddr_mask = OP_MASK(59), V3D_QPU_A_FTOC },
+ { 245, 245, .raddr_mask = OP_MASK(63), V3D_QPU_A_FTOC },
+
+ { 246, 246, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FDY, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FDY, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FDY, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FDY, 71 },
+
+ { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 },
+
+ { 247, 247, .raddr_mask = ANYOPMASK, V3D_QPU_A_VPACK, 71 },
+ { 248, 248, .raddr_mask = ANYOPMASK, V3D_QPU_A_V8PACK, 71 },
+
+ { 249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FMOV, 71 },
+
+ { 249, 249, .raddr_mask = OP_MASK(3), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(7), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(11), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 },
+
+ { 250, 250, .raddr_mask = ANYOPMASK, V3D_QPU_A_V10PACK, 71 },
+ { 251, 251, .raddr_mask = ANYOPMASK, V3D_QPU_A_V11FPACK, 71 },
};
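[Editor's note] In this table, one-source ops share an opcode byte and
are disambiguated purely by raddr_b: opcode 186 with raddr 5 now selects
CLZ, where the same slot held RECIP on 3.3/4.x (the legacy SFU ops moved
to opcode 188, raddrs 32-37). A simplified matcher with the same shape as
lookup_opcode_from_packed() below (editorial sketch, not part of the
patch):

    static const struct opcode_desc *
    find_add_op_v71(uint32_t opcode, uint32_t raddr)
    {
            /* e.g. find_add_op_v71(186, 5)->op == V3D_QPU_A_CLZ */
            for (unsigned i = 0; i < ARRAY_SIZE(add_ops_v71); i++) {
                    const struct opcode_desc *d = &add_ops_v71[i];
                    if (opcode >= d->opcode_first && opcode <= d->opcode_last &&
                        (d->raddr_mask & ((uint64_t)1 << raddr)))
                            return d;
            }
            return NULL;
    }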
-static const struct opcode_desc mul_ops[] = {
- { 1, 1, ANYMUX, ANYMUX, V3D_QPU_M_ADD },
- { 2, 2, ANYMUX, ANYMUX, V3D_QPU_M_SUB },
- { 3, 3, ANYMUX, ANYMUX, V3D_QPU_M_UMUL24 },
- { 4, 8, ANYMUX, ANYMUX, V3D_QPU_M_VFMUL },
- { 9, 9, ANYMUX, ANYMUX, V3D_QPU_M_SMUL24 },
- { 10, 10, ANYMUX, ANYMUX, V3D_QPU_M_MULTOP },
- { 14, 14, ANYMUX, ANYMUX, V3D_QPU_M_FMOV },
- { 15, 15, MUX_MASK(0, 3), ANYMUX, V3D_QPU_M_FMOV },
- { 15, 15, 1 << 4, 1 << 0, V3D_QPU_M_NOP, 0 },
- { 15, 15, 1 << 7, ANYMUX, V3D_QPU_M_MOV },
- { 16, 63, ANYMUX, ANYMUX, V3D_QPU_M_FMUL },
+static const struct opcode_desc mul_ops_v71[] = {
+ /* For V3D 7.1, the second mask field is ignored */
+ { 1, 1, .raddr_mask = ANYOPMASK, V3D_QPU_M_ADD, 71 },
+ { 2, 2, .raddr_mask = ANYOPMASK, V3D_QPU_M_SUB, 71 },
+ { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 },
+ { 4, 8, .raddr_mask = ANYOPMASK, V3D_QPU_M_VFMUL, 71 },
+ { 9, 9, .raddr_mask = ANYOPMASK, V3D_QPU_M_SMUL24, 71 },
+ { 10, 10, .raddr_mask = ANYOPMASK, V3D_QPU_M_MULTOP, 71 },
+
+ { 14, 14, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_M_FMOV, 71 },
+
+ { 14, 14, .raddr_mask = OP_MASK(3), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(7), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(11), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 },
+
+ { 14, 14, .raddr_mask = OP_MASK(32), V3D_QPU_M_FTOUNORM16, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(33), V3D_QPU_M_FTOSNORM16, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(34), V3D_QPU_M_VFTOUNORM8, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(35), V3D_QPU_M_VFTOSNORM8, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(48), V3D_QPU_M_VFTOUNORM10LO, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(49), V3D_QPU_M_VFTOUNORM10HI, 71 },
+
+ { 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 },
+
+ { 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL },
};
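[Editor's note] The v71 mul table multiplexes nearly all unary ops under
opcode 14 via raddr_d. Read straight from the table above:

    /* opcode 14, raddr_d 0-22 (low 2 bits != 3) -> V3D_QPU_M_FMOV (pack/unpack in raddr_d)
     * opcode 14, raddr_d 3/7/11/15/19           -> V3D_QPU_M_MOV
     * opcode 14, raddr_d 32-35, 48-49           -> the new norm conversions
     * opcode 14, raddr_d 63                     -> V3D_QPU_M_NOP
     */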
/* Returns true if op_desc should be filtered out based on devinfo->ver
@@ -589,17 +845,23 @@ static const struct opcode_desc mul_ops[] = {
*/
static bool
opcode_invalid_in_version(const struct v3d_device_info *devinfo,
- const struct opcode_desc *op_desc)
+ const uint8_t first_ver,
+ const uint8_t last_ver)
{
- return (op_desc->first_ver != 0 && devinfo->ver < op_desc->first_ver) ||
- (op_desc->last_ver != 0 && devinfo->ver > op_desc->last_ver);
+ return (first_ver != 0 && devinfo->ver < first_ver) ||
+ (last_ver != 0 && devinfo->ver > last_ver);
}
+/* Note that we pass mux_a, mux_b and raddr as parameters even though,
+ * depending on devinfo->ver, some of them are ignored. We do it this way
+ * to avoid having two nearly identical lookup_opcode methods.
+ */
static const struct opcode_desc *
lookup_opcode_from_packed(const struct v3d_device_info *devinfo,
const struct opcode_desc *opcodes,
size_t num_opcodes, uint32_t opcode,
- uint32_t mux_a, uint32_t mux_b)
+ uint32_t mux_a, uint32_t mux_b,
+ uint32_t raddr)
{
for (int i = 0; i < num_opcodes; i++) {
const struct opcode_desc *op_desc = &opcodes[i];
@@ -608,14 +870,19 @@ lookup_opcode_from_packed(const struct v3d_device_info *devinfo,
opcode > op_desc->opcode_last)
continue;
- if (opcode_invalid_in_version(devinfo, op_desc))
+ if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver))
continue;
- if (!(op_desc->mux_b_mask & (1 << mux_b)))
- continue;
+ if (devinfo->ver < 71) {
+ if (!(op_desc->mux.b_mask & (1 << mux_b)))
+ continue;
- if (!(op_desc->mux_a_mask & (1 << mux_a)))
- continue;
+ if (!(op_desc->mux.a_mask & (1 << mux_a)))
+ continue;
+ } else {
+ if (!(op_desc->raddr_mask & ((uint64_t) 1 << raddr)))
+ continue;
+ }
return op_desc;
}
@@ -667,6 +934,56 @@ v3d_qpu_float32_unpack_pack(enum v3d_qpu_input_unpack unpacked,
}
}
+static bool
+v3d_qpu_int32_unpack_unpack(uint32_t packed,
+ enum v3d_qpu_input_unpack *unpacked)
+{
+ switch (packed) {
+ case 0:
+ *unpacked = V3D_QPU_UNPACK_NONE;
+ return true;
+ case 1:
+ *unpacked = V3D_QPU_UNPACK_UL;
+ return true;
+ case 2:
+ *unpacked = V3D_QPU_UNPACK_UH;
+ return true;
+ case 3:
+ *unpacked = V3D_QPU_UNPACK_IL;
+ return true;
+ case 4:
+ *unpacked = V3D_QPU_UNPACK_IH;
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool
+v3d_qpu_int32_unpack_pack(enum v3d_qpu_input_unpack unpacked,
+ uint32_t *packed)
+{
+ switch (unpacked) {
+ case V3D_QPU_UNPACK_NONE:
+ *packed = 0;
+ return true;
+ case V3D_QPU_UNPACK_UL:
+ *packed = 1;
+ return true;
+ case V3D_QPU_UNPACK_UH:
+ *packed = 2;
+ return true;
+ case V3D_QPU_UNPACK_IL:
+ *packed = 3;
+ return true;
+ case V3D_QPU_UNPACK_IH:
+ *packed = 4;
+ return true;
+ default:
+ return false;
+ }
+}
+
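[Editor's note] The two new helpers are exact inverses; packed values
0..4 select NONE/UL/UH/IL/IH. A round-trip sketch (editorial, not part of
the patch):

    #include <assert.h>
    #include <stdint.h>

    static void
    check_int32_unpack_roundtrip(void)
    {
            for (uint32_t packed = 0; packed <= 4; packed++) {
                    enum v3d_qpu_input_unpack unpack;
                    uint32_t repacked;

                    assert(v3d_qpu_int32_unpack_unpack(packed, &unpack));
                    assert(v3d_qpu_int32_unpack_pack(unpack, &repacked));
                    assert(repacked == packed);
            }
    }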
static bool
v3d_qpu_float16_unpack_unpack(uint32_t packed,
enum v3d_qpu_input_unpack *unpacked)
@@ -737,8 +1054,8 @@ v3d_qpu_float32_pack_pack(enum v3d_qpu_input_unpack unpacked,
}
static bool
-v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
- struct v3d_qpu_instr *instr)
+v3d33_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
{
uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD);
uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_ADD_A);
@@ -755,8 +1072,9 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
map_op = (map_op - 253 + 245);
const struct opcode_desc *desc =
- lookup_opcode_from_packed(devinfo, add_ops, ARRAY_SIZE(add_ops),
- map_op, mux_a, mux_b);
+ lookup_opcode_from_packed(devinfo, add_ops_v33,
+ ARRAY_SIZE(add_ops_v33),
+ map_op, mux_a, mux_b, 0);
if (!desc)
return false;
@@ -812,12 +1130,12 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
- &instr->alu.add.b_unpack)) {
+ &instr->alu.add.b.unpack)) {
return false;
}
break;
@@ -831,7 +1149,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = mux_b & 0x3;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
break;
@@ -843,7 +1161,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
break;
@@ -851,23 +1169,23 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
case V3D_QPU_A_VFMIN:
case V3D_QPU_A_VFMAX:
if (!v3d_qpu_float16_unpack_unpack(op & 0x7,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
break;
default:
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
break;
}
- instr->alu.add.a = mux_a;
- instr->alu.add.b = mux_b;
+ instr->alu.add.a.mux = mux_a;
+ instr->alu.add.b.mux = mux_b;
instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
instr->alu.add.magic_write = false;
@@ -892,8 +1210,194 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
}
static bool
-v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
+{
+ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD);
+ uint32_t raddr_a = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_A);
+ uint32_t raddr_b = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_B);
+ uint32_t waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
+ uint32_t map_op = op;
+
+ const struct opcode_desc *desc =
+ lookup_opcode_from_packed(devinfo,
+ add_ops_v71,
+ ARRAY_SIZE(add_ops_v71),
+ map_op, 0, 0,
+ raddr_b);
+ if (!desc)
+ return false;
+
+ instr->alu.add.op = desc->op;
+
+ /* FADD/FADDNF and FMIN/FMAX are determined by the order of the
+ * operands.
+ */
+ if (instr->sig.small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a >
+ instr->sig.small_imm_b * 256 + (op & 3) * 64 + raddr_b) {
+ if (instr->alu.add.op == V3D_QPU_A_FMIN)
+ instr->alu.add.op = V3D_QPU_A_FMAX;
+ if (instr->alu.add.op == V3D_QPU_A_FADD)
+ instr->alu.add.op = V3D_QPU_A_FADDNF;
+ }
+
+ /* Some QPU ops require a bit more than just basic opcode and mux a/b
+ * comparisons to distinguish them.
+ */
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_STVPMV:
+ case V3D_QPU_A_STVPMD:
+ case V3D_QPU_A_STVPMP:
+ switch (waddr) {
+ case 0:
+ instr->alu.add.op = V3D_QPU_A_STVPMV;
+ break;
+ case 1:
+ instr->alu.add.op = V3D_QPU_A_STVPMD;
+ break;
+ case 2:
+ instr->alu.add.op = V3D_QPU_A_STVPMP;
+ break;
+ default:
+ return false;
+ }
+ break;
+ default:
+ break;
+ }
+
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_FADD:
+ case V3D_QPU_A_FADDNF:
+ case V3D_QPU_A_FSUB:
+ case V3D_QPU_A_FMIN:
+ case V3D_QPU_A_FMAX:
+ case V3D_QPU_A_FCMP:
+ case V3D_QPU_A_VFPACK:
+ if (instr->alu.add.op != V3D_QPU_A_VFPACK &&
+ instr->alu.add.op != V3D_QPU_A_FCMP) {
+ instr->alu.add.output_pack = (op >> 4) & 0x3;
+ } else {
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ }
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
+ &instr->alu.add.b.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_FFLOOR:
+ case V3D_QPU_A_FROUND:
+ case V3D_QPU_A_FTRUNC:
+ case V3D_QPU_A_FCEIL:
+ case V3D_QPU_A_FDX:
+ case V3D_QPU_A_FDY:
+ instr->alu.add.output_pack = raddr_b & 0x3;
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_FTOIN:
+ case V3D_QPU_A_FTOIZ:
+ case V3D_QPU_A_FTOUZ:
+ case V3D_QPU_A_FTOC:
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_float32_unpack_unpack((raddr_b >> 2) & 0x3,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_VFMIN:
+ case V3D_QPU_A_VFMAX:
+ unreachable("pending v71 update");
+ if (!v3d_qpu_float16_unpack_unpack(op & 0x7,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+ break;
+
+ case V3D_QPU_A_MOV:
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_int32_unpack_unpack((raddr_b >> 2) & 0x7,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_FMOV:
+ instr->alu.add.output_pack = raddr_b & 0x3;
+
+ /* The mul ALU FMOV has one additional unpack variant */
+ int32_t unpack = (raddr_b >> 2) & 0x7;
+ if (unpack == 7)
+ return false;
+
+ if (!v3d_qpu_float32_unpack_unpack(unpack,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ default:
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+ break;
+ }
+
+ instr->alu.add.a.raddr = raddr_a;
+ instr->alu.add.b.raddr = raddr_b;
+ instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
+
+ instr->alu.add.magic_write = false;
+ if (packed_inst & V3D_QPU_MA) {
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_LDVPMV_IN:
+ instr->alu.add.op = V3D_QPU_A_LDVPMV_OUT;
+ break;
+ case V3D_QPU_A_LDVPMD_IN:
+ instr->alu.add.op = V3D_QPU_A_LDVPMD_OUT;
+ break;
+ case V3D_QPU_A_LDVPMG_IN:
+ instr->alu.add.op = V3D_QPU_A_LDVPMG_OUT;
+ break;
+ default:
+ instr->alu.add.magic_write = true;
+ break;
+ }
+ }
+
+ return true;
+}
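[Editor's note] The operand-order trick above merits a worked example.
Both unpacker and packer rank each source with the same key, and if the A
key is larger, an FMIN-range encoding means FMAX and an FADD-range
encoding means FADDNF. Illustrative helper (hypothetical name):

    static inline unsigned
    v71_src_order_key(bool small_imm, uint32_t unpack_bits, uint32_t raddr)
    {
            return small_imm * 256 + unpack_bits * 64 + raddr;
    }

    /* E.g. raddr_a = 7, raddr_b = 3, no small immediates, no unpack bits:
     * key_a = 7 > key_b = 3, so the FMIN slot decodes as FMAX and the FADD
     * slot as FADDNF. The packer applies the inverse swap (including
     * small_imm_a/b), keeping pack(unpack(x)) stable. */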
+
+static bool
+v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
struct v3d_qpu_instr *instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_add_unpack(devinfo, packed_inst, instr);
+ else
+ return v3d71_qpu_add_unpack(devinfo, packed_inst, instr);
+}
+
+static bool
+v3d33_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
{
uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL);
uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_A);
@@ -901,9 +1405,10 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
{
const struct opcode_desc *desc =
- lookup_opcode_from_packed(devinfo, mul_ops,
- ARRAY_SIZE(mul_ops),
- op, mux_a, mux_b);
+ lookup_opcode_from_packed(devinfo,
+ mul_ops_v33,
+ ARRAY_SIZE(mul_ops_v33),
+ op, mux_a, mux_b, 0);
if (!desc)
return false;
@@ -915,12 +1420,12 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
- &instr->alu.mul.b_unpack)) {
+ &instr->alu.mul.b.unpack)) {
return false;
}
@@ -931,7 +1436,7 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
((mux_b >> 2) & 1));
if (!v3d_qpu_float32_unpack_unpack(mux_b & 0x3,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
@@ -941,74 +1446,169 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
break;
default:
instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.mul.a_unpack = V3D_QPU_UNPACK_NONE;
- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
break;
}
- instr->alu.mul.a = mux_a;
- instr->alu.mul.b = mux_b;
+ instr->alu.mul.a.mux = mux_a;
+ instr->alu.mul.b.mux = mux_b;
instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M);
instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM;
return true;
}
-static const struct opcode_desc *
-lookup_opcode_from_instr(const struct v3d_device_info *devinfo,
- const struct opcode_desc *opcodes, size_t num_opcodes,
- uint8_t op)
+static bool
+v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
{
- for (int i = 0; i < num_opcodes; i++) {
- const struct opcode_desc *op_desc = &opcodes[i];
-
- if (op_desc->op != op)
- continue;
+ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL);
+ uint32_t raddr_c = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_C);
+ uint32_t raddr_d = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_D);
- if (opcode_invalid_in_version(devinfo, op_desc))
- continue;
+ {
+ const struct opcode_desc *desc =
+ lookup_opcode_from_packed(devinfo,
+ mul_ops_v71,
+ ARRAY_SIZE(mul_ops_v71),
+ op, 0, 0,
+ raddr_d);
+ if (!desc)
+ return false;
- return op_desc;
+ instr->alu.mul.op = desc->op;
}
- return NULL;
-}
-
+ switch (instr->alu.mul.op) {
+ case V3D_QPU_M_FMUL:
+ instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1;
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
+ &instr->alu.mul.b.unpack)) {
+ return false;
+ }
+
+ break;
+
+ case V3D_QPU_M_FMOV:
+ instr->alu.mul.output_pack = raddr_d & 0x3;
+
+ if (!v3d_qpu_float32_unpack_unpack((raddr_d >> 2) & 0x7,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+
+ break;
+
+ case V3D_QPU_M_VFMUL:
+ unreachable("pending v71 update");
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ break;
+
+ case V3D_QPU_M_MOV:
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_int32_unpack_unpack((raddr_d >> 2) & 0x7,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+ break;
+
+ default:
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+ break;
+ }
+
+ instr->alu.mul.a.raddr = raddr_c;
+ instr->alu.mul.b.raddr = raddr_d;
+ instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M);
+ instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM;
+
+ return true;
+}
+
static bool
-v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
- const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_mul_unpack(devinfo, packed_inst, instr);
+ else
+ return v3d71_qpu_mul_unpack(devinfo, packed_inst, instr);
+}
+
+static const struct opcode_desc *
+lookup_opcode_from_instr(const struct v3d_device_info *devinfo,
+ const struct opcode_desc *opcodes, size_t num_opcodes,
+ uint8_t op)
+{
+ for (int i = 0; i < num_opcodes; i++) {
+ const struct opcode_desc *op_desc = &opcodes[i];
+
+ if (op_desc->op != op)
+ continue;
+
+ if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver))
+ continue;
+
+ return op_desc;
+ }
+
+ return NULL;
+}
+
+static bool
+v3d33_qpu_add_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
{
uint32_t waddr = instr->alu.add.waddr;
- uint32_t mux_a = instr->alu.add.a;
- uint32_t mux_b = instr->alu.add.b;
+ uint32_t mux_a = instr->alu.add.a.mux;
+ uint32_t mux_b = instr->alu.add.b.mux;
int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op);
const struct opcode_desc *desc =
- lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops),
+ lookup_opcode_from_instr(devinfo, add_ops_v33,
+ ARRAY_SIZE(add_ops_v33),
instr->alu.add.op);
if (!desc)
return false;
- uint32_t opcode = desc->opcode_first;
+ uint32_t opcode = desc->opcode_first;
/* If an operation doesn't use an arg, its mux values may be used to
* identify the operation type.
*/
if (nsrc < 2)
- mux_b = ffs(desc->mux_b_mask) - 1;
+ mux_b = ffs(desc->mux.b_mask) - 1;
if (nsrc < 1)
- mux_a = ffs(desc->mux_a_mask) - 1;
+ mux_a = ffs(desc->mux.a_mask) - 1;
bool no_magic_write = false;
@@ -1061,12 +1661,12 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
opcode |= output_pack << 4;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&a_unpack)) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
&b_unpack)) {
return false;
}
@@ -1100,23 +1700,23 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
uint32_t a_unpack;
uint32_t b_unpack;
- if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS ||
- instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) {
+ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS ||
+ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&a_unpack)) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
&b_unpack)) {
return false;
}
- opcode = (opcode & ~(1 << 2)) | (a_unpack << 2);
- opcode = (opcode & ~(1 << 0)) | (b_unpack << 0);
+ opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2);
+ opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0);
break;
}
@@ -1135,13 +1735,13 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
mux_b |= packed;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
if (packed == 0)
return false;
- opcode = (opcode & ~(1 << 2)) | packed << 2;
+ opcode = (opcode & ~(0x3 << 2)) | packed << 2;
break;
}
@@ -1153,7 +1753,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
return false;
uint32_t packed;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
@@ -1166,11 +1766,11 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
case V3D_QPU_A_VFMIN:
case V3D_QPU_A_VFMAX:
if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE) {
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) {
return false;
}
- if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
@@ -1180,8 +1780,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
default:
if (instr->alu.add.op != V3D_QPU_A_NOP &&
(instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
- instr->alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE)) {
+ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) {
return false;
}
break;
@@ -1198,15 +1798,280 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
static bool
-v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
- const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
{
- uint32_t mux_a = instr->alu.mul.a;
- uint32_t mux_b = instr->alu.mul.b;
+ uint32_t waddr = instr->alu.add.waddr;
+ uint32_t raddr_a = instr->alu.add.a.raddr;
+ uint32_t raddr_b = instr->alu.add.b.raddr;
+
+ int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op);
+ const struct opcode_desc *desc =
+ lookup_opcode_from_instr(devinfo, add_ops_v71,
+ ARRAY_SIZE(add_ops_v71),
+ instr->alu.add.op);
+ if (!desc)
+ return false;
+
+ uint32_t opcode = desc->opcode_first;
+
+ /* If an operation doesn't use an arg, its raddr values may be used to
+ * identify the operation type.
+ */
+ if (nsrc < 2)
+ raddr_b = ffsll(desc->raddr_mask) - 1;
+
+ bool no_magic_write = false;
+
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_STVPMV:
+ waddr = 0;
+ no_magic_write = true;
+ break;
+ case V3D_QPU_A_STVPMD:
+ waddr = 1;
+ no_magic_write = true;
+ break;
+ case V3D_QPU_A_STVPMP:
+ waddr = 2;
+ no_magic_write = true;
+ break;
+
+ case V3D_QPU_A_LDVPMV_IN:
+ case V3D_QPU_A_LDVPMD_IN:
+ case V3D_QPU_A_LDVPMP:
+ case V3D_QPU_A_LDVPMG_IN:
+ assert(!instr->alu.add.magic_write);
+ break;
+
+ case V3D_QPU_A_LDVPMV_OUT:
+ case V3D_QPU_A_LDVPMD_OUT:
+ case V3D_QPU_A_LDVPMG_OUT:
+ assert(!instr->alu.add.magic_write);
+ *packed_instr |= V3D_QPU_MA;
+ break;
+
+ default:
+ break;
+ }
+
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_FADD:
+ case V3D_QPU_A_FADDNF:
+ case V3D_QPU_A_FSUB:
+ case V3D_QPU_A_FMIN:
+ case V3D_QPU_A_FMAX:
+ case V3D_QPU_A_FCMP: {
+ uint32_t output_pack;
+ uint32_t a_unpack;
+ uint32_t b_unpack;
+
+ if (instr->alu.add.op != V3D_QPU_A_FCMP) {
+ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+ &output_pack)) {
+ return false;
+ }
+ opcode |= output_pack << 4;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &a_unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
+ &b_unpack)) {
+ return false;
+ }
+
+ /* These operations with commutative operands are
+ * distinguished by which order their operands come in.
+ */
+ bool ordering =
+ instr->sig.small_imm_a * 256 + a_unpack * 64 + raddr_a >
+ instr->sig.small_imm_b * 256 + b_unpack * 64 + raddr_b;
+ if (((instr->alu.add.op == V3D_QPU_A_FMIN ||
+ instr->alu.add.op == V3D_QPU_A_FADD) && ordering) ||
+ ((instr->alu.add.op == V3D_QPU_A_FMAX ||
+ instr->alu.add.op == V3D_QPU_A_FADDNF) && !ordering)) {
+ uint32_t temp;
+
+ temp = a_unpack;
+ a_unpack = b_unpack;
+ b_unpack = temp;
+
+ temp = raddr_a;
+ raddr_a = raddr_b;
+ raddr_b = temp;
+
+ /* If we are swapping raddr_a/b we also need to swap
+ * small_imm_a/b.
+ */
+ if (instr->sig.small_imm_a || instr->sig.small_imm_b) {
+ assert(instr->sig.small_imm_a !=
+ instr->sig.small_imm_b);
+ struct v3d_qpu_sig new_sig = instr->sig;
+ new_sig.small_imm_a = !instr->sig.small_imm_a;
+ new_sig.small_imm_b = !instr->sig.small_imm_b;
+ uint32_t sig;
+ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
+ return false;
+ *packed_instr &= ~V3D_QPU_SIG_MASK;
+ *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG);
+ }
+ }
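+ /* e.g. with no small immediates and no unpack modifiers, FMIN with
+ * raddr_a = 5 and raddr_b = 3 has ordering keys 5 > 3, so its operands
+ * are stored swapped: FMIN ends up with ascending keys and FMAX with
+ * descending ones, which is how the decoder tells the two apart.
+ */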
+
+ opcode |= a_unpack << 2;
+ opcode |= b_unpack << 0;
+
+ break;
+ }
+
+ case V3D_QPU_A_VFPACK: {
+ uint32_t a_unpack;
+ uint32_t b_unpack;
+
+ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS ||
+ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &a_unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
+ &b_unpack)) {
+ return false;
+ }
+
+ opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2);
+ opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0);
+
+ break;
+ }
+
+ case V3D_QPU_A_FFLOOR:
+ case V3D_QPU_A_FROUND:
+ case V3D_QPU_A_FTRUNC:
+ case V3D_QPU_A_FCEIL:
+ case V3D_QPU_A_FDX:
+ case V3D_QPU_A_FDY: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+ &packed)) {
+ return false;
+ }
+ raddr_b |= packed;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ if (packed == 0)
+ return false;
+ raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2;
+ break;
+ }
+
+ case V3D_QPU_A_FTOIN:
+ case V3D_QPU_A_FTOIZ:
+ case V3D_QPU_A_FTOUZ:
+ case V3D_QPU_A_FTOC:
+ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ uint32_t packed;
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ if (packed == 0)
+ return false;
+
+ raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2;
+
+ break;
+
+ case V3D_QPU_A_VFMIN:
+ case V3D_QPU_A_VFMAX:
+ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) {
+ return false;
+ }
+
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ opcode |= packed;
+ break;
+
+ case V3D_QPU_A_MOV: {
+ uint32_t packed;
+
+ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ if (!v3d_qpu_int32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+
+ raddr_b |= packed << 2;
+ break;
+ }
+
+ case V3D_QPU_A_FMOV: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+ &packed)) {
+ return false;
+ }
+ raddr_b = packed;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ raddr_b |= packed << 2;
+ break;
+ }
+
+ default:
+ if (instr->alu.add.op != V3D_QPU_A_NOP &&
+ (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) {
+ return false;
+ }
+ break;
+ }
+
+ *packed_instr |= QPU_SET_FIELD(raddr_a, V3D_QPU_RADDR_A);
+ *packed_instr |= QPU_SET_FIELD(raddr_b, V3D_QPU_RADDR_B);
+ *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_ADD);
+ *packed_instr |= QPU_SET_FIELD(waddr, V3D_QPU_WADDR_A);
+ if (instr->alu.add.magic_write && !no_magic_write)
+ *packed_instr |= V3D_QPU_MA;
+
+ return true;
+}
+
+static bool
+v3d33_qpu_mul_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ uint32_t mux_a = instr->alu.mul.a.mux;
+ uint32_t mux_b = instr->alu.mul.b.mux;
int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op);
const struct opcode_desc *desc =
- lookup_opcode_from_instr(devinfo, mul_ops, ARRAY_SIZE(mul_ops),
+ lookup_opcode_from_instr(devinfo, mul_ops_v33,
+ ARRAY_SIZE(mul_ops_v33),
instr->alu.mul.op);
if (!desc)
@@ -1218,10 +2083,10 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
* that here. If mux a/b determine packing, it will be set below.
*/
if (nsrc < 2)
- mux_b = ffs(desc->mux_b_mask) - 1;
+ mux_b = ffs(desc->mux.b_mask) - 1;
if (nsrc < 1)
- mux_a = ffs(desc->mux_a_mask) - 1;
+ mux_a = ffs(desc->mux.a_mask) - 1;
switch (instr->alu.mul.op) {
case V3D_QPU_M_FMUL: {
@@ -1236,13 +2101,13 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
*/
opcode += packed << 4;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
opcode |= packed << 2;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack,
&packed)) {
return false;
}
@@ -1260,7 +2125,7 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
opcode |= (packed >> 1) & 1;
mux_b = (packed & 1) << 2;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
@@ -1274,22 +2139,28 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
return false;
- if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
- if (instr->alu.mul.a_unpack == V3D_QPU_UNPACK_SWAP_16)
+ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16)
opcode = 8;
else
opcode |= (packed + 4) & 7;
- if (instr->alu.mul.b_unpack != V3D_QPU_UNPACK_NONE)
+ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)
return false;
break;
}
default:
+ if (instr->alu.mul.op != V3D_QPU_M_NOP &&
+ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) {
+ return false;
+ }
break;
}
@@ -1304,6 +2175,150 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
return true;
}
+static bool
+v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ uint32_t raddr_c = instr->alu.mul.a.raddr;
+ uint32_t raddr_d = instr->alu.mul.b.raddr;
+ int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op);
+
+ const struct opcode_desc *desc =
+ lookup_opcode_from_instr(devinfo, mul_ops_v71,
+ ARRAY_SIZE(mul_ops_v71),
+ instr->alu.mul.op);
+ if (!desc)
+ return false;
+
+ uint32_t opcode = desc->opcode_first;
+
+ /* Some opcodes have a single valid value for their raddr_d, so set
+ * that here. If raddr_d determines packing, it will be set below.
+ */
+ if (nsrc < 2)
+ raddr_d = ffsll(desc->raddr_mask) - 1;
+
+ switch (instr->alu.mul.op) {
+ case V3D_QPU_M_FMUL: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack,
+ &packed)) {
+ return false;
+ }
+ /* No need for a +1 because desc->opcode_first has a 1 in this
+ * field.
+ */
+ opcode += packed << 4;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+ opcode |= packed << 2;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack,
+ &packed)) {
+ return false;
+ }
+ opcode |= packed << 0;
+ break;
+ }
+
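+ /* For FMOV there is no real second source, so the raddr_d field is
+ * repurposed: bits 1:0 carry the output pack mode and bits 3:2 the
+ * input unpack mode.
+ */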
+ case V3D_QPU_M_FMOV: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack,
+ &packed)) {
+ return false;
+ }
+ raddr_d |= packed;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+ raddr_d |= packed << 2;
+ break;
+ }
+
+ case V3D_QPU_M_VFMUL: {
+ unreachable("pending v71 update");
+ uint32_t packed;
+
+ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16)
+ opcode = 8;
+ else
+ opcode |= (packed + 4) & 7;
+
+ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)
+ return false;
+
+ break;
+ }
+
+ case V3D_QPU_M_MOV: {
+ uint32_t packed;
+
+ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ if (!v3d_qpu_int32_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+
+ raddr_d |= packed << 2;
+ break;
+ }
+
+ default:
+ if (instr->alu.mul.op != V3D_QPU_M_NOP &&
+ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) {
+ return false;
+ }
+ break;
+ }
+
+ *packed_instr |= QPU_SET_FIELD(raddr_c, V3D_QPU_RADDR_C);
+ *packed_instr |= QPU_SET_FIELD(raddr_d, V3D_QPU_RADDR_D);
+ *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_MUL);
+ *packed_instr |= QPU_SET_FIELD(instr->alu.mul.waddr, V3D_QPU_WADDR_M);
+ if (instr->alu.mul.magic_write)
+ *packed_instr |= V3D_QPU_MM;
+
+ return true;
+}
+
+static bool
+v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_add_pack(devinfo, instr, packed_instr);
+ else
+ return v3d71_qpu_add_pack(devinfo, instr, packed_instr);
+}
+
+static bool
+v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_mul_pack(devinfo, instr, packed_instr);
+ else
+ return v3d71_qpu_mul_pack(devinfo, instr, packed_instr);
+}
+
static bool
v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo,
uint64_t packed_instr,
@@ -1332,8 +2347,14 @@ v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo,
return false;
}
- instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A);
- instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B);
+ if (devinfo->ver < 71) {
+ /*
+ * For v71 this will be set on add/mul unpack instead, as the raddrs
+ * are now part of v3d_qpu_input
+ */
+ instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A);
+ instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B);
+ }
if (!v3d_qpu_add_unpack(devinfo, packed_instr, instr))
return false;
@@ -1419,8 +2440,14 @@ v3d_qpu_instr_pack_alu(const struct v3d_device_info *devinfo,
*packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG);
if (instr->type == V3D_QPU_INSTR_TYPE_ALU) {
- *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A);
- *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B);
+ if (devinfo->ver < 71) {
+ /*
+ * For v71 this is set on add/mul pack instead, as the raddrs are
+ * now part of v3d_qpu_input
+ */
+ *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A);
+ *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B);
+ }
if (!v3d_qpu_add_pack(devinfo, instr, packed_instr))
return false;
diff --git a/src/broadcom/qpu/tests/qpu_disasm.c b/src/broadcom/qpu/tests/qpu_disasm.c
index 2f8e19c73fe..be7b78d5ef0 100644
--- a/src/broadcom/qpu/tests/qpu_disasm.c
+++ b/src/broadcom/qpu/tests/qpu_disasm.c
@@ -160,10 +160,10 @@ main(int argc, char **argv)
/* Swap the operands to be sure that we test
* how the QPUs distinguish between these ops.
*/
- swap_mux(&instr.alu.add.a,
- &instr.alu.add.b);
- swap_pack(&instr.alu.add.a_unpack,
- &instr.alu.add.b_unpack);
+ swap_mux(&instr.alu.add.a.mux,
+ &instr.alu.add.b.mux);
+ swap_pack(&instr.alu.add.a.unpack,
+ &instr.alu.add.b.unpack);
break;
default:
break;
diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c
index eea5d3f050e..c4bbd61abc2 100644
--- a/src/broadcom/simulator/v3d_simulator.c
+++ b/src/broadcom/simulator/v3d_simulator.c
@@ -92,6 +92,9 @@ static struct v3d_simulator_state {
/** Last performance monitor ID. */
uint32_t last_perfid;
+ /** Total performance counters */
+ uint32_t perfcnt_total;
+
struct util_dynarray bin_oom;
int refcount;
} sim_state = {
@@ -436,15 +439,15 @@ v3d_simulator_perfmon_switch(int fd, uint32_t perfid)
perfmon = v3d_get_simulator_perfmon(fd, file->active_perfid);
if (perfmon)
- v3d41_simulator_perfmon_stop(sim_state.v3d,
- perfmon->ncounters,
- perfmon->values);
+ v3d_X_simulator(perfmon_stop)(sim_state.v3d,
+ perfmon->ncounters,
+ perfmon->values);
perfmon = v3d_get_simulator_perfmon(fd, perfid);
if (perfmon)
- v3d41_simulator_perfmon_start(sim_state.v3d,
- perfmon->ncounters,
- perfmon->counters);
+ v3d_X_simulator(perfmon_start)(sim_state.v3d,
+ perfmon->ncounters,
+ perfmon->counters);
file->active_perfid = perfid;
}
@@ -489,11 +492,7 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
bin_fd = fd;
v3d_simulator_perfmon_switch(fd, submit->perfmon_id);
-
- if (sim_state.ver >= 41)
- v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
- else
- v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
+ v3d_X_simulator(submit_cl_ioctl)(sim_state.v3d, submit, file->gmp->ofs);
util_dynarray_foreach(&sim_state.bin_oom, struct v3d_simulator_bo *,
sim_bo) {
@@ -632,15 +631,6 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args)
return drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, args);
}
-static int
-v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args)
-{
- if (sim_state.ver >= 41)
- return v3d41_simulator_get_param_ioctl(sim_state.v3d, args);
- else
- return v3d33_simulator_get_param_ioctl(sim_state.v3d, args);
-}
-
static int
v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args)
{
@@ -652,10 +642,7 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args)
v3d_simulator_copy_in_handle(file, args->bo_handles[2]);
v3d_simulator_copy_in_handle(file, args->bo_handles[3]);
- if (sim_state.ver >= 41)
- ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args);
- else
- ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args);
+ ret = v3d_X_simulator(submit_tfu_ioctl)(sim_state.v3d, args);
v3d_simulator_copy_out_handle(file, args->bo_handles[0]);
@@ -682,11 +669,8 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args)
v3d_simulator_perfmon_switch(fd, args->perfmon_id);
- if (sim_state.ver >= 41)
- ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args,
- file->gmp->ofs);
- else
- ret = -1;
+ ret = v3d_X_simulator(submit_csd_ioctl)(sim_state.v3d, args,
+ file->gmp->ofs);
for (int i = 0; i < args->bo_handle_count; i++)
v3d_simulator_copy_out_handle(file, bo_handles[i]);
@@ -716,7 +700,7 @@ v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args)
perfmon->ncounters = args->ncounters;
for (int i = 0; i < args->ncounters; i++) {
- if (args->counters[i] >= V3D_PERFCNT_NUM) {
+ if (args->counters[i] >= sim_state.perfcnt_total) {
ralloc_free(perfmon);
return -EINVAL;
} else {
@@ -797,7 +781,7 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args)
return 0;
case DRM_IOCTL_V3D_GET_PARAM:
- return v3d_simulator_get_param_ioctl(fd, args);
+ return v3d_X_simulator(get_param_ioctl)(sim_state.v3d, args);
case DRM_IOCTL_GEM_CLOSE:
return v3d_simulator_gem_close_ioctl(fd, args);
@@ -880,10 +864,19 @@ v3d_simulator_init_global()
util_dynarray_init(&sim_state.bin_oom, NULL);
- if (sim_state.ver >= 41)
- v3d41_simulator_init_regs(sim_state.v3d);
- else
- v3d33_simulator_init_regs(sim_state.v3d);
+ v3d_X_simulator(init_regs)(sim_state.v3d);
+
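+ /* Number of performance counters exposed by each supported hw
+ * generation; 93 is presumably what the V3D_MAX_PERFCNT cap in v3dv
+ * is sized for.
+ */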
+ switch(sim_state.ver) {
+ case 41:
+ case 42:
+ sim_state.perfcnt_total = 87;
+ break;
+ case 71:
+ sim_state.perfcnt_total = 93;
+ break;
+ default:
+ sim_state.perfcnt_total = 0;
+ }
}
struct v3d_simulator_file *
diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h
index ddb079c1455..92305634468 100644
--- a/src/broadcom/simulator/v3d_simulator.h
+++ b/src/broadcom/simulator/v3d_simulator.h
@@ -52,6 +52,32 @@ uint32_t v3d_simulator_get_mem_free(void);
# define v3dX(x) v3d41_##x
# include "v3dx_simulator.h"
# undef v3dX
+
+# define v3dX(x) v3d71_##x
+# include "v3dx_simulator.h"
+# undef v3dX
+
#endif
+/* Helper to dispatch to the version-specific simulator implementation of a
+ * function */
+#define v3d_X_simulator(thing) ({ \
+ __typeof(&v3d33_simulator_##thing) v3d_X_sim_thing;\
+ switch (sim_state.ver) { \
+ case 33: \
+ case 40: \
+ v3d_X_sim_thing = &v3d33_simulator_##thing; \
+ break; \
+ case 41: \
+ case 42: \
+ v3d_X_sim_thing = &v3d41_simulator_##thing; \
+ break; \
+ case 71: \
+ v3d_X_sim_thing = &v3d71_simulator_##thing; \
+ break; \
+ default: \
+ unreachable("Unsupported hardware generation"); \
+ } \
+ v3d_X_sim_thing; \
+})
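+/* e.g. on a v4.2 simulator
+ *
+ * v3d_X_simulator(init_regs)(sim_state.v3d);
+ *
+ * resolves to v3d41_simulator_init_regs(sim_state.v3d).
+ */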
+
#endif
diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c
index c9322f0397b..01cf6b22663 100644
--- a/src/broadcom/simulator/v3dx_simulator.c
+++ b/src/broadcom/simulator/v3dx_simulator.c
@@ -46,11 +46,15 @@
#define HW_REGISTER_RO(x) (x)
#define HW_REGISTER_RW(x) (x)
-#if V3D_VERSION >= 41
-#include "libs/core/v3d/registers/4.1.35.0/v3d.h"
+#if V3D_VERSION == 71
+#include "libs/core/v3d/registers/7.1.5.1/v3d.h"
+#else
+#if V3D_VERSION == 41 || V3D_VERSION == 42
+#include "libs/core/v3d/registers/4.2.14.0/v3d.h"
#else
#include "libs/core/v3d/registers/3.3.0.0/v3d.h"
#endif
+#endif
#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val)
#define V3D_READ(reg) v3d_hw_read_reg(v3d, reg)
@@ -178,38 +182,48 @@ v3d_flush_caches(struct v3d_hw *v3d)
v3d_flush_l2t(v3d);
}
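+/* On v7.x the TFU registers are prefixed V3D_IFC_ rather than V3D_TFU_, so
+ * go through a version-neutral wrapper.
+ */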
+#if V3D_VERSION < 71
+#define TFU_REG(NAME) V3D_TFU_ ## NAME
+#else
+#define TFU_REG(NAME) V3D_IFC_ ## NAME
+#endif
+
+
int
v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d,
struct drm_v3d_submit_tfu *args)
{
- int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET;
-
- V3D_WRITE(V3D_TFU_IIA, args->iia);
- V3D_WRITE(V3D_TFU_IIS, args->iis);
- V3D_WRITE(V3D_TFU_ICA, args->ica);
- V3D_WRITE(V3D_TFU_IUA, args->iua);
- V3D_WRITE(V3D_TFU_IOA, args->ioa);
- V3D_WRITE(V3D_TFU_IOS, args->ios);
- V3D_WRITE(V3D_TFU_COEF0, args->coef[0]);
- V3D_WRITE(V3D_TFU_COEF1, args->coef[1]);
- V3D_WRITE(V3D_TFU_COEF2, args->coef[2]);
- V3D_WRITE(V3D_TFU_COEF3, args->coef[3]);
-
- V3D_WRITE(V3D_TFU_ICFG, args->icfg);
-
- while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) {
+ int last_vtct = V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET;
+
+ V3D_WRITE(TFU_REG(IIA), args->iia);
+ V3D_WRITE(TFU_REG(IIS), args->iis);
+ V3D_WRITE(TFU_REG(ICA), args->ica);
+ V3D_WRITE(TFU_REG(IUA), args->iua);
+ V3D_WRITE(TFU_REG(IOA), args->ioa);
+#if V3D_VERSION >= 71
+ V3D_WRITE(TFU_REG(IOC), args->v71.ioc);
+#endif
+ V3D_WRITE(TFU_REG(IOS), args->ios);
+ V3D_WRITE(TFU_REG(COEF0), args->coef[0]);
+ V3D_WRITE(TFU_REG(COEF1), args->coef[1]);
+ V3D_WRITE(TFU_REG(COEF2), args->coef[2]);
+ V3D_WRITE(TFU_REG(COEF3), args->coef[3]);
+
+ V3D_WRITE(TFU_REG(ICFG), args->icfg);
+
+ while ((V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET) == last_vtct) {
v3d_hw_tick(v3d);
}
return 0;
}
-#if V3D_VERSION >= 41
int
v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
struct drm_v3d_submit_csd *args,
uint32_t gmp_ofs)
{
+#if V3D_VERSION >= 41
int last_completed_jobs = (V3D_READ(V3D_CSD_0_STATUS) &
V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET);
g_gmp_ofs = gmp_ofs;
@@ -223,6 +237,9 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]);
V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]);
V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]);
+#if V3D_VERSION >= 71
+ V3D_WRITE(V3D_CSD_0_QUEUED_CFG7, 0);
+#endif
/* CFG0 kicks off the job */
V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]);
@@ -239,8 +256,10 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
v3d_flush_caches(v3d);
return 0;
-}
+#else
+ return -1;
#endif
+}
int
v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d,
@@ -310,16 +329,17 @@ v3d_isr_core(struct v3d_hw *v3d,
return;
}
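+ /* From v7.x the GMP violation interrupt is reported through the hub
+ * rather than the core, so the handling below moves to v3d_isr_hub().
+ */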
+#if V3D_VERSION <= 42
if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) {
fprintf(stderr, "GMP violation at 0x%08x\n",
V3D_READ(V3D_GMP_VIO_ADDR));
- abort();
} else {
fprintf(stderr,
"Unexpected ISR with core status 0x%08x\n",
core_status);
}
abort();
+#endif
}
static void
@@ -396,6 +416,18 @@ v3d_isr_hub(struct v3d_hw *v3d)
}
handle_mmu_interruptions(v3d, hub_status);
+
+#if V3D_VERSION == 71
+ if (hub_status & V3D_HUB_CTL_INT_STS_INT_GMPV_SET) {
+ fprintf(stderr, "GMP violation at 0x%08x\n",
+ V3D_READ(V3D_GMP_VIO_ADDR));
+ } else {
+ fprintf(stderr,
+ "Unexpected ISR with status 0x%08x\n",
+ hub_status);
+ }
+ abort();
+#endif
}
static void
@@ -436,8 +468,11 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d)
* for tracing. Perhaps we should evaluate to do the same here and add
* some debug options.
*/
- uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET |
- V3D_CTL_0_INT_STS_INT_OUTOMEM_SET);
+ uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_OUTOMEM_SET;
+#if V3D_VERSION <= 42
+ core_interrupts |= V3D_CTL_0_INT_STS_INT_GMPV_SET;
+#endif
+
V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts);
V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts);
@@ -447,6 +482,9 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d)
V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */
V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */
+#if V3D_VERSION == 71
+ hub_interrupts |= V3D_HUB_CTL_INT_STS_INT_GMPV_SET;
+#endif
V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts);
V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts);
@@ -509,7 +547,8 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d,
#define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x))
#define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8)
#define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \
- V3D_PCTR_0_SRC_N_SHIFT(x) + 6))
+ V3D_PCTR_0_SRC_N_SHIFT(x) + \
+ V3D_PCTR_0_SRC_0_3_PCTRS0_MSB))
#endif
void
diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build
index ad032d832ad..182388a35b4 100644
--- a/src/broadcom/vulkan/meson.build
+++ b/src/broadcom/vulkan/meson.build
@@ -27,6 +27,7 @@ v3dv_entrypoints = custom_target(
'--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv',
'--beta', with_vulkan_beta.to_string(),
'--device-prefix', 'ver42',
+ '--device-prefix', 'ver71',
],
depend_files : vk_entrypoints_gen_depend_files,
)
@@ -64,13 +65,11 @@ files_per_version = files(
'v3dvx_meta_common.c',
'v3dvx_pipeline.c',
+ 'v3dvx_query.c',
'v3dvx_queue.c',
)
-# The vulkan driver only supports version >= 42, which is the version present in
-# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d
-# driver.
-v3d_versions = ['42']
+v3d_versions = ['42', '71']
v3dv_flags = []
diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index a14db073b4f..c6462735fe4 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -348,6 +348,7 @@ job_compute_frame_tiling(struct v3dv_job *job,
uint32_t layers,
uint32_t render_target_count,
uint8_t max_internal_bpp,
+ uint8_t total_color_bpp,
bool msaa,
bool double_buffer)
{
@@ -360,13 +361,16 @@ job_compute_frame_tiling(struct v3dv_job *job,
tiling->render_target_count = render_target_count;
tiling->msaa = msaa;
tiling->internal_bpp = max_internal_bpp;
+ tiling->total_color_bpp = total_color_bpp;
tiling->double_buffer = double_buffer;
/* Double-buffer is incompatible with MSAA */
assert(!tiling->msaa || !tiling->double_buffer);
- v3d_choose_tile_size(render_target_count, max_internal_bpp,
- tiling->msaa, tiling->double_buffer,
+ v3d_choose_tile_size(&job->device->devinfo,
+ render_target_count,
+ max_internal_bpp, total_color_bpp, msaa,
+ tiling->double_buffer,
&tiling->tile_width, &tiling->tile_height);
tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
@@ -457,6 +461,7 @@ v3dv_job_start_frame(struct v3dv_job *job,
bool allocate_tile_state_now,
uint32_t render_target_count,
uint8_t max_internal_bpp,
+ uint8_t total_color_bpp,
bool msaa)
{
assert(job);
@@ -467,7 +472,7 @@ v3dv_job_start_frame(struct v3dv_job *job,
const struct v3dv_frame_tiling *tiling =
job_compute_frame_tiling(job, width, height, layers,
render_target_count, max_internal_bpp,
- msaa, false);
+ total_color_bpp, msaa, false);
v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
v3dv_return_if_oom(NULL, job);
@@ -528,6 +533,7 @@ cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
job->frame_tiling.layers,
job->frame_tiling.render_target_count,
job->frame_tiling.internal_bpp,
+ job->frame_tiling.total_color_bpp,
job->frame_tiling.msaa,
true);
@@ -1374,7 +1380,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
}
uint32_t att_count = 0;
- VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */
+ VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */
/* We only need to emit subpass clears as draw calls for color attachments
* if the render area is not aligned to tile boundaries.
@@ -1672,10 +1678,11 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
const struct v3dv_framebuffer *framebuffer = state->framebuffer;
- uint8_t internal_bpp;
+ uint8_t max_internal_bpp, total_color_bpp;
bool msaa;
v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa)
- (framebuffer, state->attachments, subpass, &internal_bpp, &msaa);
+ (framebuffer, state->attachments, subpass,
+ &max_internal_bpp, &total_color_bpp, &msaa);
/* From the Vulkan spec:
*
@@ -1699,7 +1706,8 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
layers,
true, false,
subpass->color_count,
- internal_bpp,
+ max_internal_bpp,
+ total_color_bpp,
msaa);
}
@@ -2062,6 +2070,14 @@ cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
}
}
+ if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BOUNDS)) {
+ if (memcmp(&dest->depth_bounds, &src->depth_bounds,
+ sizeof(src->depth_bounds))) {
+ memcpy(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds));
+ dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS;
+ }
+ }
+
if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) {
if (dest->line_width != src->line_width) {
dest->line_width = src->line_width;
@@ -2131,39 +2147,6 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
}
}
-/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */
-void
-v3dv_viewport_compute_xform(const VkViewport *viewport,
- float scale[3],
- float translate[3])
-{
- float x = viewport->x;
- float y = viewport->y;
- float half_width = 0.5f * viewport->width;
- float half_height = 0.5f * viewport->height;
- double n = viewport->minDepth;
- double f = viewport->maxDepth;
-
- scale[0] = half_width;
- translate[0] = half_width + x;
- scale[1] = half_height;
- translate[1] = half_height + y;
-
- scale[2] = (f - n);
- translate[2] = n;
-
- /* It seems that if the scale is small enough the hardware won't clip
- * correctly so we work around this my choosing the smallest scale that
- * seems to work.
- *
- * This case is exercised by CTS:
- * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero
- */
- const float min_abs_scale = 0.000009f;
- if (fabs(scale[2]) < min_abs_scale)
- scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
-}
-
/* Considers the pipeline's negative_one_to_one state and applies it to the
* current viewport transform if needed to produce the resulting Z translate
* and scale parameters.
@@ -2216,9 +2199,10 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
viewportCount * sizeof(*pViewports));
for (uint32_t i = firstViewport; i < total_count; i++) {
- v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i],
- state->dynamic.viewport.scale[i],
- state->dynamic.viewport.translate[i]);
+ v3dv_X(cmd_buffer->device, viewport_compute_xform)
+ (&state->dynamic.viewport.viewports[i],
+ state->dynamic.viewport.scale[i],
+ state->dynamic.viewport.translate[i]);
}
cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT;
@@ -2699,6 +2683,7 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
true, false,
old_job->frame_tiling.render_target_count,
old_job->frame_tiling.internal_bpp,
+ old_job->frame_tiling.total_color_bpp,
true /* msaa */);
v3dv_job_destroy(old_job);
@@ -2963,6 +2948,9 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS))
v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer);
+ if (*dirty & V3DV_CMD_DIRTY_DEPTH_BOUNDS)
+ v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer);
+
if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS))
v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer);
@@ -3410,9 +3398,11 @@ v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
float minDepthBounds,
float maxDepthBounds)
{
- /* We do not support depth bounds testing so we just ignore this. We are
- * already asserting that pipelines don't enable the feature anyway.
- */
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds;
+ cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds;
+ cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS;
}
VKAPI_ATTR void VKAPI_CALL
@@ -3844,6 +3834,7 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
void
v3dv_cmd_buffer_rewrite_indirect_csd_job(
+ struct v3dv_device *device,
struct v3dv_csd_indirect_cpu_job_info *info,
const uint32_t *wg_counts)
{
@@ -3863,8 +3854,15 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job(
submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
- submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) *
- (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
+ uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) *
+ (wg_counts[0] * wg_counts[1] * wg_counts[2]);
+ /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
+ if (device->devinfo.ver < 71 ||
+ (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
+ submit->cfg[4] = num_batches - 1;
+ } else {
+ submit->cfg[4] = num_batches;
+ }
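+ /* e.g. wg_size = 16 and workgroup counts (2, 1, 1) give
+ * num_batches = 2: hw before 7.1.6 wants cfg[4] = 1, 7.1.6+ wants 2.
+ */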
assert(submit->cfg[4] != ~0);
if (info->needs_wg_uniform_rewrite) {
@@ -3897,6 +3895,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t **wg_uniform_offsets_out,
uint32_t *wg_size_out)
{
+ struct v3dv_device *device = cmd_buffer->device;
struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
struct v3dv_shader_variant *cs_variant =
@@ -3955,18 +3954,26 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
if (wg_size_out)
*wg_size_out = wg_size;
- submit->cfg[4] = num_batches - 1;
+ /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
+ if (device->devinfo.ver < 71 ||
+ (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
+ submit->cfg[4] = num_batches - 1;
+ } else {
+ submit->cfg[4] = num_batches;
+ }
assert(submit->cfg[4] != ~0);
assert(pipeline->shared_data->assembly_bo);
struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo;
submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset;
- submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
if (cs_variant->prog_data.base->single_seg)
submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
if (cs_variant->prog_data.base->threads == 4)
submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
+ /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */
+ if (device->devinfo.ver < 71)
+ submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
if (cs_variant->prog_data.cs->shared_size > 0) {
job->csd.shared_memory =
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index 3bad290e8c5..d013edaa63d 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -91,7 +91,7 @@ static const struct vk_instance_extension_table instance_extensions = {
.KHR_display = true,
.KHR_get_display_properties2 = true,
.EXT_direct_mode_display = true,
- .EXT_acquire_drm_display = true,
+ .EXT_acquire_drm_display = false,
#endif
.KHR_external_fence_capabilities = true,
.KHR_external_memory_capabilities = true,
@@ -214,7 +214,7 @@ get_features(const struct v3dv_physical_device *physical_device,
*features = (struct vk_features) {
/* Vulkan 1.0 */
.robustBufferAccess = true, /* This feature is mandatory */
- .fullDrawIndexUint32 = false, /* Only available since V3D 4.4.9.1 */
+ .fullDrawIndexUint32 = physical_device->devinfo.ver >= 71,
.imageCubeArray = true,
.independentBlend = true,
.geometryShader = true,
@@ -224,10 +224,10 @@ get_features(const struct v3dv_physical_device *physical_device,
.logicOp = true,
.multiDrawIndirect = false,
.drawIndirectFirstInstance = true,
- .depthClamp = false, /* Only available since V3D 4.5.1.1 */
+ .depthClamp = physical_device->devinfo.ver >= 71,
.depthBiasClamp = true,
.fillModeNonSolid = true,
- .depthBounds = false, /* Only available since V3D 4.3.16.2 */
+ .depthBounds = physical_device->devinfo.ver >= 71,
.wideLines = true,
.largePoints = true,
.alphaToOne = true,
@@ -304,7 +304,7 @@ get_features(const struct v3dv_physical_device *physical_device,
* problematic, we would always have to scalarize. Overall, this would
* not lead to best performance so let's just not support it.
*/
- .scalarBlockLayout = false,
+ .scalarBlockLayout = physical_device->devinfo.ver >= 71,
/* This tells applications 2 things:
*
* 1. If they can select just one aspect for barriers. For us barriers
@@ -1123,8 +1123,10 @@ create_physical_device(struct v3dv_instance *instance,
device->next_program_id = 0;
ASSERTED int len =
- asprintf(&device->name, "V3D %d.%d",
- device->devinfo.ver / 10, device->devinfo.ver % 10);
+ asprintf(&device->name, "V3D %d.%d.%d",
+ device->devinfo.ver / 10,
+ device->devinfo.ver % 10,
+ device->devinfo.rev);
assert(len != -1);
v3dv_physical_device_init_disk_cache(device);
@@ -1212,6 +1214,12 @@ create_physical_device(struct v3dv_instance *instance,
list_addtail(&device->vk.link, &instance->vk.physical_devices.list);
+ if (device->devinfo.ver != 42) {
+ fprintf(stderr, "WARNING: v3dv support for hw version %i is neither "
+ "a complete nor a conformant Vulkan implementation. Testing "
+ "use only.\n", device->devinfo.ver);
+ }
+
return VK_SUCCESS;
fail:
@@ -1279,7 +1287,8 @@ enumerate_devices(struct vk_instance *vk_instance)
if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) {
char **compat = devices[i]->deviceinfo.platform->compatible;
while (*compat) {
- if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) {
+ if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 ||
+ strncmp(*compat, "brcm,2712-v3d", 13) == 0) {
v3d_idx = i;
break;
}
@@ -1288,8 +1297,9 @@ enumerate_devices(struct vk_instance *vk_instance)
} else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) {
char **compat = devices[i]->deviceinfo.platform->compatible;
while (*compat) {
- if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
- strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) {
+ if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 ||
+ strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
+ strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) {
vc4_idx = i;
break;
}
@@ -1326,6 +1336,8 @@ v3dv_physical_device_device_id(struct v3dv_physical_device *dev)
switch (dev->devinfo.ver) {
case 42:
return 0xBE485FD3; /* Broadcom deviceID for 2711 */
+ case 71:
+ return 0x55701C33; /* Broadcom deviceID for 2712 */
default:
unreachable("Unsupported V3D version");
}
@@ -1354,6 +1366,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
const VkSampleCountFlags supported_sample_counts =
VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT;
+ const uint8_t max_rts = V3D_MAX_RENDER_TARGETS(pdevice->devinfo.ver);
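+ /* 4 render targets on V3D 4.x, 8 on V3D 7.x */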
+
struct timespec clock_res;
clock_getres(CLOCK_MONOTONIC, &clock_res);
const float timestamp_period =
@@ -1424,7 +1438,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.maxFragmentInputComponents = max_varying_components,
.maxFragmentOutputAttachments = 4,
.maxFragmentDualSrcAttachments = 0,
- .maxFragmentCombinedOutputResources = MAX_RENDER_TARGETS +
+ .maxFragmentCombinedOutputResources = max_rts +
MAX_STORAGE_BUFFERS +
MAX_STORAGE_IMAGES,
@@ -1437,7 +1451,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.subPixelPrecisionBits = V3D_COORD_SHIFT,
.subTexelPrecisionBits = 8,
.mipmapPrecisionBits = 8,
- .maxDrawIndexedIndexValue = 0x00ffffff,
+ .maxDrawIndexedIndexValue = pdevice->devinfo.ver >= 71 ?
+ 0xffffffff : 0x00ffffff,
.maxDrawIndirectCount = 0x7fffffff,
.maxSamplerLodBias = 14.0f,
.maxSamplerAnisotropy = 16.0f,
@@ -1464,7 +1479,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.framebufferDepthSampleCounts = supported_sample_counts,
.framebufferStencilSampleCounts = supported_sample_counts,
.framebufferNoAttachmentsSampleCounts = supported_sample_counts,
- .maxColorAttachments = MAX_RENDER_TARGETS,
+ .maxColorAttachments = max_rts,
.sampledImageColorSampleCounts = supported_sample_counts,
.sampledImageIntegerSampleCounts = supported_sample_counts,
.sampledImageDepthSampleCounts = supported_sample_counts,
@@ -2031,7 +2046,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0,
device->instance->default_pipeline_cache_enabled);
device->default_attribute_float =
- v3dv_pipeline_create_default_attribute_values(device, NULL);
+ v3dv_X(device, create_default_attribute_values)(device, NULL);
device->device_address_mem_ctx = ralloc_context(NULL);
util_dynarray_init(&device->device_address_bo_list,
@@ -2975,7 +2990,7 @@ v3dv_CreateSampler(VkDevice _device,
}
}
- v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo, bc_info);
+ v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info);
*pSampler = v3dv_sampler_to_handle(sampler);
diff --git a/src/broadcom/vulkan/v3dv_image.c b/src/broadcom/vulkan/v3dv_image.c
index ebbd60e4c03..e01e2e1bd19 100644
--- a/src/broadcom/vulkan/v3dv_image.c
+++ b/src/broadcom/vulkan/v3dv_image.c
@@ -671,7 +671,6 @@ create_image_view(struct v3dv_device *device,
* makes sense to implement swizzle composition using VkSwizzle directly.
*/
VkFormat format;
- uint8_t image_view_swizzle[4];
if (pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT &&
range->aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
format = VK_FORMAT_R8G8B8A8_UINT;
@@ -682,11 +681,11 @@ create_image_view(struct v3dv_device *device,
vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, view_swizzle);
util_format_compose_swizzles(stencil_aspect_swizzle, view_swizzle,
- image_view_swizzle);
+ iview->view_swizzle);
} else {
format = pCreateInfo->format;
vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle,
- image_view_swizzle);
+ iview->view_swizzle);
}
iview->vk.view_format = format;
@@ -711,7 +710,7 @@ create_image_view(struct v3dv_device *device,
const uint8_t *format_swizzle =
v3dv_get_format_swizzle(device, format, plane);
- util_format_compose_swizzles(format_swizzle, image_view_swizzle,
+ util_format_compose_swizzles(format_swizzle, iview->view_swizzle,
iview->planes[plane].swizzle);
iview->planes[plane].swap_rb = v3dv_format_swizzle_needs_rb_swap(format_swizzle);
diff --git a/src/broadcom/vulkan/v3dv_limits.h b/src/broadcom/vulkan/v3dv_limits.h
index 9cda9f0d6d2..8ac99724105 100644
--- a/src/broadcom/vulkan/v3dv_limits.h
+++ b/src/broadcom/vulkan/v3dv_limits.h
@@ -50,8 +50,6 @@
#define MAX_DYNAMIC_BUFFERS (MAX_DYNAMIC_UNIFORM_BUFFERS + \
MAX_DYNAMIC_STORAGE_BUFFERS)
-#define MAX_RENDER_TARGETS 4
-
#define MAX_MULTIVIEW_VIEW_COUNT 16
/* These are tunable parameters in the HW design, but all the V3D
diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c
index a200298a898..0b64653000d 100644
--- a/src/broadcom/vulkan/v3dv_meta_clear.c
+++ b/src/broadcom/vulkan/v3dv_meta_clear.c
@@ -127,6 +127,7 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
v3dv_job_start_frame(job, width, height, max_layer,
false, true, 1, internal_bpp,
+ 4 * v3d_internal_bpp_words(internal_bpp),
image->vk.samples > VK_SAMPLE_COUNT_1_BIT);
struct v3dv_meta_framebuffer framebuffer;
@@ -747,7 +748,7 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx,
uint32_t bit_offset = 0;
key |= rt_idx;
- bit_offset += 2;
+ bit_offset += 3;
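+ /* v71 raises the render target count to 8, so rt_idx needs 3 bits now */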
key |= ((uint64_t) format) << bit_offset;
bit_offset += 32;
@@ -1189,9 +1190,11 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer,
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
- /* We can only clear attachments in the current subpass */
- assert(attachmentCount <= 5); /* 4 color + D/S */
+ /* We can have at most max_color_RTs + 1 D/S attachments */
+ assert(attachmentCount <=
+ V3D_MAX_RENDER_TARGETS(cmd_buffer->device->devinfo.ver) + 1);
+ /* We can only clear attachments in the current subpass */
struct v3dv_render_pass *pass = cmd_buffer->state.pass;
assert(cmd_buffer->state.subpass_idx < pass->subpass_count);
diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c
index c0ec888b8c7..2d30c611e17 100644
--- a/src/broadcom/vulkan/v3dv_meta_copy.c
+++ b/src/broadcom/vulkan/v3dv_meta_copy.c
@@ -453,8 +453,9 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
- v3dv_job_start_frame(job, width, height, num_layers, false, true,
- 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
@@ -1323,8 +1324,8 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
- v3dv_job_start_frame(job, width, height, num_layers,
- false, true, 1, internal_bpp,
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
src->vk.samples > VK_SAMPLE_COUNT_1_BIT);
struct v3dv_meta_framebuffer framebuffer;
@@ -1978,8 +1979,9 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
- v3dv_job_start_frame(job, width, height, num_layers, false, true,
- 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
@@ -4884,8 +4886,9 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
(fb_format, region->srcSubresource.aspectMask,
&internal_type, &internal_bpp);
- v3dv_job_start_frame(job, width, height, num_layers, false, true,
- 1, internal_bpp, true);
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ true);
struct v3dv_meta_framebuffer framebuffer;
v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c
index 20f5014268d..0583faf6f9a 100644
--- a/src/broadcom/vulkan/v3dv_pass.c
+++ b/src/broadcom/vulkan/v3dv_pass.c
@@ -236,11 +236,13 @@ v3dv_CreateRenderPass2(VkDevice _device,
/* GFXH-1461: if depth is cleared but stencil is loaded (or vice versa),
* the clear might get lost. If a subpass has this then we can't emit
- * the clear using the TLB and we have to do it as a draw call.
+ * the clear using the TLB and we have to do it as a draw call. This
+ * issue is fixed since V3D 4.3.18.
*
* FIXME: separate stencil.
*/
- if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
+ if (device->devinfo.ver == 42 &&
+ subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
struct v3dv_render_pass_attachment *att =
&pass->attachments[subpass->ds_attachment.attachment];
if (att->desc.format == VK_FORMAT_D24_UNORM_S8_UINT) {
@@ -320,11 +322,12 @@ subpass_get_granularity(struct v3dv_device *device,
/* Granularity is defined by the tile size */
assert(subpass_idx < pass->subpass_count);
struct v3dv_subpass *subpass = &pass->subpasses[subpass_idx];
- const uint32_t color_attachment_count = subpass->color_count;
+ const uint32_t color_count = subpass->color_count;
bool msaa = false;
- uint32_t max_bpp = 0;
- for (uint32_t i = 0; i < color_attachment_count; i++) {
+ uint32_t max_internal_bpp = 0;
+ uint32_t total_color_bpp = 0;
+ for (uint32_t i = 0; i < color_count; i++) {
uint32_t attachment_idx = subpass->color_attachments[i].attachment;
if (attachment_idx == VK_ATTACHMENT_UNUSED)
continue;
@@ -337,7 +340,8 @@ subpass_get_granularity(struct v3dv_device *device,
v3dv_X(device, get_internal_type_bpp_for_output_format)
(format->planes[0].rt_type, &internal_type, &internal_bpp);
- max_bpp = MAX2(max_bpp, internal_bpp);
+ max_internal_bpp = MAX2(max_internal_bpp, internal_bpp);
+ total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
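+ /* v3d_internal_bpp_words() is presumably 1/2/4 TLB words for
+ * 32/64/128 bpp, so this accumulates bytes per pixel across the
+ * color attachments, e.g. two 32bpp RTs contribute 8.
+ */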
if (desc->samples > VK_SAMPLE_COUNT_1_BIT)
msaa = true;
@@ -347,7 +351,8 @@ subpass_get_granularity(struct v3dv_device *device,
* heuristics so we choose a conservative granularity here, with it disabled.
*/
uint32_t width, height;
- v3d_choose_tile_size(color_attachment_count, max_bpp, msaa,
+ v3d_choose_tile_size(&device->devinfo, color_count,
+ max_internal_bpp, total_color_bpp, msaa,
false /* double-buffer */, &width, &height);
*granularity = (VkExtent2D) {
.width = width,
diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c
index 99fe8c16bfa..d6629c9a4a0 100644
--- a/src/broadcom/vulkan/v3dv_pipeline.c
+++ b/src/broadcom/vulkan/v3dv_pipeline.c
@@ -2608,13 +2608,8 @@ v3dv_dynamic_state_mask(VkDynamicState state)
return V3DV_DYNAMIC_LINE_WIDTH;
case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
return V3DV_DYNAMIC_COLOR_WRITE_ENABLE;
-
- /* Depth bounds testing is not available in in V3D 4.2 so here we are just
- * ignoring this dynamic state. We are already asserting at pipeline creation
- * time that depth bounds testing is not enabled.
- */
case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
- return 0;
+ return V3DV_DYNAMIC_DEPTH_BOUNDS;
default:
unreachable("Unhandled dynamic state");
@@ -2632,6 +2627,7 @@ pipeline_init_dynamic_state(
const VkPipelineColorWriteCreateInfoEXT *pColorWriteState)
{
/* Initialize to default values */
+ const struct v3d_device_info *devinfo = &pipeline->device->devinfo;
struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state;
memset(dynamic, 0, sizeof(*dynamic));
dynamic->stencil_compare_mask.front = ~0;
@@ -2639,7 +2635,9 @@ pipeline_init_dynamic_state(
dynamic->stencil_write_mask.front = ~0;
dynamic->stencil_write_mask.back = ~0;
dynamic->line_width = 1.0f;
- dynamic->color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1;
+ dynamic->color_write_enable =
+ (1ull << (4 * V3D_MAX_RENDER_TARGETS(devinfo->ver))) - 1;
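+ /* one enable bit per channel: 16 bits for 4 RTs on v4.2, 32 bits for
+ * 8 RTs on v7.1
+ */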
+ dynamic->depth_bounds.max = 1.0f;
/* Create a mask of enabled dynamic states */
uint32_t dynamic_states = 0;
@@ -2661,9 +2659,10 @@ pipeline_init_dynamic_state(
pViewportState->viewportCount);
for (uint32_t i = 0; i < dynamic->viewport.count; i++) {
- v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i],
- dynamic->viewport.scale[i],
- dynamic->viewport.translate[i]);
+ v3dv_X(pipeline->device, viewport_compute_xform)
+ (&dynamic->viewport.viewports[i],
+ dynamic->viewport.scale[i],
+ dynamic->viewport.translate[i]);
}
}
@@ -2691,6 +2690,11 @@ pipeline_init_dynamic_state(
dynamic->stencil_reference.front = pDepthStencilState->front.reference;
dynamic->stencil_reference.back = pDepthStencilState->back.reference;
}
+
+ if (!(dynamic_states & V3DV_DYNAMIC_DEPTH_BOUNDS)) {
+ dynamic->depth_bounds.min = pDepthStencilState->minDepthBounds;
+ dynamic->depth_bounds.max = pDepthStencilState->maxDepthBounds;
+ }
}
if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) {
@@ -2802,62 +2806,6 @@ pipeline_set_ez_state(struct v3dv_pipeline *pipeline,
}
}
-static bool
-pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
-{
- for (uint8_t i = 0; i < pipeline->va_count; i++) {
- if (vk_format_is_int(pipeline->va[i].vk_format))
- return true;
- }
- return false;
-}
-
-/* @pipeline can be NULL. We assume in that case that all the attributes have
- * a float format (we only create an all-float BO once and we reuse it with
- * all float pipelines), otherwise we look at the actual type of each
- * attribute used with the specific pipeline passed in.
- */
-struct v3dv_bo *
-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
- struct v3dv_pipeline *pipeline)
-{
- uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
- struct v3dv_bo *bo;
-
- bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
-
- if (!bo) {
- fprintf(stderr, "failed to allocate memory for the default "
- "attribute values\n");
- return NULL;
- }
-
- bool ok = v3dv_bo_map(device, bo, size);
- if (!ok) {
- fprintf(stderr, "failed to map default attribute values buffer\n");
- return false;
- }
-
- uint32_t *attrs = bo->map;
- uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
- for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
- attrs[i * 4 + 0] = 0;
- attrs[i * 4 + 1] = 0;
- attrs[i * 4 + 2] = 0;
- VkFormat attr_format =
- pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
- if (i < va_count && vk_format_is_int(attr_format)) {
- attrs[i * 4 + 3] = 1;
- } else {
- attrs[i * 4 + 3] = fui(1.0);
- }
- }
-
- v3dv_bo_unmap(device, bo);
-
- return bo;
-}
-
static void
pipeline_set_sample_mask(struct v3dv_pipeline *pipeline,
const VkPipelineMultisampleStateCreateInfo *ms_info)
@@ -2960,7 +2908,9 @@ pipeline_init(struct v3dv_pipeline *pipeline,
/* V3D 4.2 doesn't support depth bounds testing so we don't advertise that
* feature and it shouldn't be used by any pipeline.
*/
- assert(!ds_info || !ds_info->depthBoundsTestEnable);
+ assert(device->devinfo.ver >= 71 ||
+ !ds_info || !ds_info->depthBoundsTestEnable);
+ pipeline->depth_bounds_test_enabled = ds_info && ds_info->depthBoundsTestEnable;
enable_depth_bias(pipeline, rs_info);
@@ -2992,9 +2942,10 @@ pipeline_init(struct v3dv_pipeline *pipeline,
v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info);
- if (pipeline_has_integer_vertex_attrib(pipeline)) {
+ if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) {
pipeline->default_attribute_values =
- v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline);
+ v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline);
+
if (!pipeline->default_attribute_values)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
} else {
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index c6707211529..89e2f1c7e5c 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -123,6 +123,9 @@ struct v3d_simulator_file;
/* Minimum required by the Vulkan 1.1 spec */
#define MAX_MEMORY_ALLOCATION_SIZE (1ull << 30)
+/* Maximum performance counters number */
+#define V3D_MAX_PERFCNT 93
+
struct v3dv_physical_device {
struct vk_physical_device vk;
@@ -581,6 +584,10 @@ struct v3dv_device {
* being float, allowing us to reuse the same BO for all
* pipelines matching this requirement. Pipelines that need integer
* attributes will create their own BO.
+ *
+ * Note that since v71 the default attribute values are not needed, so this
+ * can be NULL.
+ *
*/
struct v3dv_bo *default_attribute_float;
@@ -772,6 +779,8 @@ struct v3dv_image_view {
const struct v3dv_format *format;
+ uint8_t view_swizzle[4];
+
uint8_t plane_count;
struct {
uint8_t image_plane;
@@ -782,8 +791,8 @@ struct v3dv_image_view {
uint32_t internal_type;
uint32_t offset;
- /* Precomputed (composed from createinfo->components and formar swizzle)
- * swizzles to pass in to the shader key.
+ /* Precomputed swizzle (composed from the view swizzle and the format
+ * swizzle).
*
* This could be also included on the descriptor bo, but the shader state
* packet doesn't need it on a bo, so we can just avoid a memory copy
@@ -946,6 +955,7 @@ struct v3dv_frame_tiling {
uint32_t layers;
uint32_t render_target_count;
uint32_t internal_bpp;
+ uint32_t total_color_bpp;
bool msaa;
bool double_buffer;
uint32_t tile_width;
@@ -1040,7 +1050,8 @@ enum v3dv_dynamic_state_bits {
V3DV_DYNAMIC_DEPTH_BIAS = 1 << 6,
V3DV_DYNAMIC_LINE_WIDTH = 1 << 7,
V3DV_DYNAMIC_COLOR_WRITE_ENABLE = 1 << 8,
- V3DV_DYNAMIC_ALL = (1 << 9) - 1,
+ V3DV_DYNAMIC_DEPTH_BOUNDS = 1 << 9,
+ V3DV_DYNAMIC_ALL = (1 << 10) - 1,
};
/* Flags for dirty pipeline state.
@@ -1065,6 +1076,7 @@ enum v3dv_cmd_dirty_bits {
V3DV_CMD_DIRTY_LINE_WIDTH = 1 << 16,
V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 17,
V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE = 1 << 18,
+ V3DV_CMD_DIRTY_DEPTH_BOUNDS = 1 << 19,
};
struct v3dv_dynamic_state {
@@ -1101,6 +1113,11 @@ struct v3dv_dynamic_state {
float slope_factor;
} depth_bias;
+ struct {
+ float min;
+ float max;
+ } depth_bounds;
+
float line_width;
uint32_t color_write_enable;
@@ -1196,7 +1213,7 @@ struct v3dv_timestamp_query_cpu_job_info {
};
/* Number of perfmons required to handle all supported performance counters */
-#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_PERFCNT_NUM, \
+#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_MAX_PERFCNT, \
DRM_V3D_MAX_PERF_COUNTERS)
struct v3dv_perf_query {
@@ -1369,6 +1386,7 @@ void v3dv_job_start_frame(struct v3dv_job *job,
bool allocate_tile_state_now,
uint32_t render_target_count,
uint8_t max_internal_bpp,
+ uint8_t total_color_bpp,
bool msaa);
bool v3dv_job_type_is_gpu(struct v3dv_job *job);
@@ -1667,7 +1685,7 @@ struct v3dv_query_pool {
/* Only used with performance queries */
struct {
uint32_t ncounters;
- uint8_t counters[V3D_PERFCNT_NUM];
+ uint8_t counters[V3D_MAX_PERFCNT];
/* V3D has a limit on the number of counters we can track in a
* single performance monitor, so if too many counters are requested
@@ -1803,7 +1821,8 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
struct drm_v3d_submit_tfu *tfu);
-void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info,
+void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_device *device,
+ struct v3dv_csd_indirect_cpu_job_info *info,
const uint32_t *wg_counts);
void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
@@ -2289,11 +2308,15 @@ struct v3dv_pipeline {
unsigned char sha1[20];
/* In general we can reuse v3dv_device->default_attribute_float, so note
- * that the following can be NULL.
+ * that the following can be NULL. In 7.x this is not used, so it will be
+ * NULL.
*
* FIXME: the content of this BO will be small, so it could be improved to
* be uploaded to a common BO. But as in most cases it will be NULL, it is
* not a priority.
*/
struct v3dv_bo *default_attribute_values;
@@ -2323,6 +2346,9 @@ struct v3dv_pipeline {
bool is_z16;
} depth_bias;
+ /* Depth bounds */
+ bool depth_bounds_test_enabled;
+
struct {
void *mem_ctx;
struct util_dynarray data; /* Array of v3dv_pipeline_executable_data */
@@ -2338,6 +2364,13 @@ struct v3dv_pipeline {
uint8_t stencil_cfg[2][V3DV_STENCIL_CFG_LENGTH];
};
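+/* The TEXTURE_SHADER_STATE R/B-swap and reverse bits only exist from hw
+ * revision 7.1.5 onwards.
+ */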
+static inline bool
+v3dv_texture_shader_state_has_rb_swap_reverse_bits(const struct v3dv_device *device)
+{
+ return device->devinfo.ver > 71 ||
+ (device->devinfo.ver == 71 && device->devinfo.rev >= 5);
+}
+
static inline VkPipelineBindPoint
v3dv_pipeline_get_binding_point(struct v3dv_pipeline *pipeline)
{
@@ -2500,10 +2533,6 @@ void
v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_cache *cache);
-struct v3dv_bo *
-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
- struct v3dv_pipeline *pipeline);
-
VkResult
v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device,
nir_shader *nir,
@@ -2608,12 +2637,32 @@ u64_compare(const void *key1, const void *key2)
case 42: \
v3d_X_thing = &v3d42_##thing; \
break; \
+ case 71: \
+ v3d_X_thing = &v3d71_##thing; \
+ break; \
default: \
unreachable("Unsupported hardware generation"); \
} \
v3d_X_thing; \
})
+/* Helper to get hw-specific macro values */
+#define V3DV_X(device, thing) ({ \
+ __typeof(V3D42_##thing) V3D_X_THING; \
+ switch (device->devinfo.ver) { \
+ case 42: \
+ V3D_X_THING = V3D42_##thing; \
+ break; \
+ case 71: \
+ V3D_X_THING = V3D71_##thing; \
+ break; \
+ default: \
+ unreachable("Unsupported hardware generation"); \
+ } \
+ V3D_X_THING; \
+})
+
+
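+/* A minimal usage sketch of the two dispatch macros above (the variables
+ * here are assumed to be in scope; both call patterns appear later in this
+ * patch):
+ *
+ *    // Runtime dispatch on devinfo.ver: resolves to
+ *    // v3d42_pack_sampler_state or v3d71_pack_sampler_state.
+ *    v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info);
+ *
+ *    // Per-version constant: resolves to V3D42_CLIPPER_XY_GRANULARITY
+ *    // (256.0f) or V3D71_CLIPPER_XY_GRANULARITY (64.0f).
+ *    float granularity = V3DV_X(device, CLIPPER_XY_GRANULARITY);
+ */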
/* v3d_macros from common requires v3dX and V3DX definitions. Below we need to
* define v3dX for each version supported, because when we compile code that
@@ -2626,6 +2675,10 @@ u64_compare(const void *key1, const void *key2)
# define v3dX(x) v3d42_##x
# include "v3dvx_private.h"
# undef v3dX
+
+# define v3dX(x) v3d71_##x
+# include "v3dvx_private.h"
+# undef v3dX
#endif
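+
+/* For reference, re-including v3dvx_private.h under each v3dX definition
+ * stamps out per-generation symbols: a prototype written as
+ *
+ *    void v3dX(job_emit_noop)(struct v3dv_job *job);
+ *
+ * declares v3d42_job_emit_noop() on the first include and
+ * v3d71_job_emit_noop() on the second.
+ */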
#ifdef ANDROID
diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c
index 3284c467d74..deb7821f02b 100644
--- a/src/broadcom/vulkan/v3dv_query.c
+++ b/src/broadcom/vulkan/v3dv_query.c
@@ -23,7 +23,6 @@
#include "v3dv_private.h"
-#include "common/v3d_performance_counters.h"
#include "util/timespec.h"
#include "compiler/nir/nir_builder.h"
@@ -48,7 +47,7 @@ kperfmon_create(struct v3dv_device *device,
DRM_IOCTL_V3D_PERFMON_CREATE,
&req);
if (ret)
- fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret));
+ fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret));
pool->queries[query].perf.kperfmon_ids[i] = req.id;
}
@@ -303,7 +302,6 @@ v3dv_CreateQueryPool(VkDevice _device,
QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
assert(pq_info);
- assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM);
pool->perfmon.ncounters = pq_info->counterIndexCount;
for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
@@ -592,7 +590,7 @@ write_performance_query_result(struct v3dv_device *device,
assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
struct v3dv_query *q = &pool->queries[query];
- uint64_t counter_values[V3D_PERFCNT_NUM];
+ uint64_t counter_values[V3D_MAX_PERFCNT];
for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
struct drm_v3d_perfmon_get_values req = {
@@ -1284,40 +1282,11 @@ v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
VkPerformanceCounterKHR *pCounters,
VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
{
- uint32_t desc_count = *pCounterCount;
+ V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice);
- VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
- out, pCounters, pCounterCount);
- VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
- out_desc, pCounterDescriptions, &desc_count);
-
- for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) {
- vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
- counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
- counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
- counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
-
- unsigned char sha1_result[20];
- _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME],
- strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]),
- sha1_result);
-
- memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
- }
-
- vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
- &out_desc, desc) {
- desc->flags = 0;
- snprintf(desc->name, sizeof(desc->name), "%s",
- v3d_performance_counters[i][V3D_PERFCNT_NAME]);
- snprintf(desc->category, sizeof(desc->category), "%s",
- v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]);
- snprintf(desc->description, sizeof(desc->description), "%s",
- v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]);
- }
- }
-
- return vk_outarray_status(&out);
+ return v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount,
+ pCounters,
+ pCounterDescriptions);
}
VKAPI_ATTR void VKAPI_CALL
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index b4aae195180..429d14a9196 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -408,7 +408,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
if (memcmp(group_counts, info->csd_job->csd.wg_count,
sizeof(info->csd_job->csd.wg_count)) != 0) {
- v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);
+ v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts);
}
return VK_SUCCESS;
diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c
index 72fa9a1b39c..0e681cc4ee2 100644
--- a/src/broadcom/vulkan/v3dv_uniforms.c
+++ b/src/broadcom/vulkan/v3dv_uniforms.c
@@ -497,7 +497,8 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect);
struct v3dv_cl_out *uniforms = cl_start(&job->indirect);
-
+ float clipper_xy_granularity =
+ V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY);
for (int i = 0; i < uinfo->count; i++) {
uint32_t data = uinfo->data[i];
@@ -520,11 +521,11 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
break;
case QUNIFORM_VIEWPORT_X_SCALE:
- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f);
+ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity);
break;
case QUNIFORM_VIEWPORT_Y_SCALE:
- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f);
+ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity);
break;
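+      /* A sketch of the effect of the granularity switch (values from the
+       * defines added in v3dvx_private.h): the clipper works in 1/256th of
+       * a pixel on v42 and in 1/64th on v71, so e.g. for X:
+       *
+       *    v42: viewport.scale[0][0] * 256.0f
+       *    v71: viewport.scale[0][0] * 64.0f
+       */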
case QUNIFORM_VIEWPORT_Z_OFFSET: {
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index f182b790d36..1bd634f5027 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -56,10 +56,15 @@ v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job)
};
config.width_in_pixels = tiling->width;
config.height_in_pixels = tiling->height;
+#if V3D_VERSION == 42
config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr;
cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config);
@@ -82,10 +87,22 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
config.width_in_pixels = tiling->width;
config.height_in_pixels = tiling->height;
+#if V3D_VERSION == 42
config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+      /* FIXME: ideally the following assert would live in the packet header
+       * (it is generic, so it also applies to GL), but that would require
+       * expanding gen_pack_header.
+       */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
}
/* There's definitely nothing in the VCD cache we want. */
@@ -345,6 +362,11 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer,
iview->vk.base_array_layer + layer,
image_plane);
+ /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it
+ * is broken in earlier V3D versions.
+ */
+ assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear);
+
cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = buffer;
store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
@@ -467,6 +489,30 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
const VkImageAspectFlags aspects =
vk_format_aspects(ds_attachment->desc.format);
+#if V3D_VERSION <= 42
+ /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
+ * for depth/stencil.
+ *
+ * There used to be some confusion regarding the Clear Tile Buffers
+ * Z/S bit also being broken, but we confirmed with Broadcom that this
+ * is not the case, it was just that some other hardware bugs (that we
+ * need to work around, such as GFXH-1461) could cause this bit to behave
+ * incorrectly.
+ *
+ * There used to be another issue where the RTs bit in the Clear Tile
+ * Buffers packet also cleared Z/S, but Broadcom confirmed this is
+ * fixed since V3D 4.1.
+ *
+ * So if we have to emit a clear of depth or stencil we don't use
+ * the per-buffer store clear bit, even if we need to store the buffers,
+ * instead we always have to use the Clear Tile Buffers Z/S bit.
+ * If we have configured the job to do early Z/S clearing, then we
+ * don't want to emit any Clear Tile Buffers command at all here.
+ *
+ * Note that GFXH-1689 is not reproduced in the simulator, where
+ * using the clear buffer bit in depth/stencil stores works fine.
+ */
+
/* Only clear once on the first subpass that uses the attachment */
uint32_t ds_first_subpass = !state->pass->multiview_enabled ?
ds_attachment->first_subpass :
@@ -486,6 +532,17 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
ds_attachment->desc.stencilLoadOp,
subpass->do_stencil_clear_with_draw);
+ use_global_zs_clear = !state->job->early_zs_clear &&
+ (needs_depth_clear || needs_stencil_clear);
+#endif
+#if V3D_VERSION >= 71
+   /* The store command's clear buffer bit cannot be used for Z/S on 7.x, and
+    * since V3D 4.5.6 Z/S buffers are automatically cleared between tiles
+    * anyway, we don't want to emit redundant clears here.
+    */
+ use_global_zs_clear = false;
+#endif
+
/* Skip the last store if it is not required */
uint32_t ds_last_subpass = !pass->multiview_enabled ?
ds_attachment->last_subpass :
@@ -528,30 +585,6 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
needs_stencil_store = subpass->resolve_stencil;
}
- /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
- * for depth/stencil.
- *
- * There used to be some confusion regarding the Clear Tile Buffers
- * Z/S bit also being broken, but we confirmed with Broadcom that this
- * is not the case, it was just that some other hardware bugs (that we
- * need to work around, such as GFXH-1461) could cause this bit to behave
- * incorrectly.
- *
- * There used to be another issue where the RTs bit in the Clear Tile
- * Buffers packet also cleared Z/S, but Broadcom confirmed this is
- * fixed since V3D 4.1.
- *
- * So if we have to emit a clear of depth or stencil we don't use
- * the per-buffer store clear bit, even if we need to store the buffers,
- * instead we always have to use the Clear Tile Buffers Z/S bit.
- * If we have configured the job to do early Z/S clearing, then we
- * don't want to emit any Clear Tile Buffers command at all here.
- *
- * Note that GFXH-1689 is not reproduced in the simulator, where
- * using the clear buffer bit in depth/stencil stores works fine.
- */
- use_global_zs_clear = !state->job->early_zs_clear &&
- (needs_depth_clear || needs_stencil_clear);
if (needs_depth_store || needs_stencil_store) {
const uint32_t zs_buffer =
v3dv_zs_buffer(needs_depth_store, needs_stencil_store);
@@ -649,10 +682,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
* bit and instead we have to emit a single clear of all tile buffers.
*/
if (use_global_zs_clear || use_global_rt_clear) {
+#if V3D_VERSION == 42
cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = use_global_zs_clear;
clear.clear_all_render_targets = use_global_rt_clear;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
+#endif
}
}
@@ -778,6 +816,103 @@ set_rcl_early_z_config(struct v3dv_job *job,
}
}
+/* Note that for v71 the render target cfg packets have just one field that
+ * combines the internal type and the clamp mode. For simplicity we keep just
+ * one helper.
+ *
+ * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
+ *
+ * FIXME: for v71 we are not returning all the possible combinations of
+ * render target internal type and clamp. For example, for int types we are
+ * always using int clamp, and for 16f we are using clamp none or norm (which
+ * seem to be the equivalent of no-clamp on 4.2), but not pq or hlg. In
+ * summary, right now we are just porting what we were doing on 4.2.
+ */
+uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ VkFormat vk_format)
+{
+#if V3D_VERSION == 42
+ if (vk_format_is_int(vk_format))
+ return V3D_RENDER_TARGET_CLAMP_INT;
+ else if (vk_format_is_srgb(vk_format))
+ return V3D_RENDER_TARGET_CLAMP_NORM;
+ else
+ return V3D_RENDER_TARGET_CLAMP_NONE;
+#endif
+#if V3D_VERSION >= 71
+ switch (rt_type) {
+ case V3D_INTERNAL_TYPE_8I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
+ case V3D_INTERNAL_TYPE_8UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_8:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8;
+ case V3D_INTERNAL_TYPE_16I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
+ case V3D_INTERNAL_TYPE_16UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_16F:
+ return vk_format_is_srgb(vk_format) ?
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F;
+ case V3D_INTERNAL_TYPE_32I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
+ case V3D_INTERNAL_TYPE_32UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_32F:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
+ default:
+ unreachable("Unknown internal render target type");
+ }
+
+ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
+#endif
+}
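+
+/* Usage sketch of the helper (the internal type and format here are
+ * illustrative):
+ *
+ *    uint32_t clamp =
+ *       v3dX(clamp_for_format_and_type)(V3D_INTERNAL_TYPE_16F,
+ *                                       VK_FORMAT_R16G16B16A16_SFLOAT);
+ *    // -> V3D_RENDER_TARGET_TYPE_CLAMP_16F on v71 (no clamping),
+ *    //    V3D_RENDER_TARGET_CLAMP_NONE on v42.
+ */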
+
+static void
+cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer,
+ int rt,
+ uint32_t *rt_bpp,
+#if V3D_VERSION == 42
+ uint32_t *rt_type,
+ uint32_t *rt_clamp)
+#else
+ uint32_t *rt_type_clamp)
+#endif
+{
+ const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+
+ assert(state->subpass_idx < state->pass->subpass_count);
+ const struct v3dv_subpass *subpass =
+ &state->pass->subpasses[state->subpass_idx];
+
+ if (rt >= subpass->color_count)
+ return;
+
+ struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
+ const uint32_t attachment_idx = attachment->attachment;
+ if (attachment_idx == VK_ATTACHMENT_UNUSED)
+ return;
+
+ assert(attachment_idx < state->framebuffer->attachment_count &&
+ attachment_idx < state->attachment_alloc_count);
+ struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
+ assert(vk_format_is_color(iview->vk.format));
+
+ assert(iview->plane_count == 1);
+ *rt_bpp = iview->planes[0].internal_bpp;
+#if V3D_VERSION == 42
+ *rt_type = iview->planes[0].internal_type;
+ *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
+ iview->vk.format);
+#endif
+#if V3D_VERSION >= 71
+ *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
+ iview->vk.format);
+#endif
+}
+
void
v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -824,7 +959,19 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
config.number_of_render_targets = MAX2(subpass->color_count, 1);
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+      /* FIXME: ideally the following assert would live in the packet header
+       * (it is generic, so it also applies to GL), but that would require
+       * expanding gen_pack_header.
+       */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
const struct v3dv_image_view *iview =
@@ -851,6 +998,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
* Early-Z/S clearing is independent of Early Z/S testing, so it is
* possible to enable one but not the other so long as their
* respective requirements are met.
+ *
+ * From V3D 4.5.6, Z/S buffers are always cleared automatically
+ * between tiles, but we still want to enable early ZS clears
+ * when Z/S are not loaded or stored.
*/
struct v3dv_render_pass_attachment *ds_attachment =
&pass->attachments[ds_attachment_idx];
@@ -858,21 +1009,33 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
const VkImageAspectFlags ds_aspects =
vk_format_aspects(ds_attachment->desc.format);
- bool needs_depth_clear =
- check_needs_clear(state,
- ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
- ds_attachment->first_subpass,
- ds_attachment->desc.loadOp,
- subpass->do_depth_clear_with_draw);
-
bool needs_depth_store =
v3dv_cmd_buffer_check_needs_store(state,
ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
ds_attachment->last_subpass,
ds_attachment->desc.storeOp) ||
subpass->resolve_depth;
+#if V3D_VERSION <= 42
+ bool needs_depth_clear =
+ check_needs_clear(state,
+ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_attachment->first_subpass,
+ ds_attachment->desc.loadOp,
+ subpass->do_depth_clear_with_draw);
do_early_zs_clear = needs_depth_clear && !needs_depth_store;
+#endif
+#if V3D_VERSION >= 71
+ bool needs_depth_load =
+ v3dv_cmd_buffer_check_needs_load(state,
+ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_attachment->first_subpass,
+ ds_attachment->desc.loadOp,
+ ds_attachment->last_subpass,
+ ds_attachment->desc.storeOp);
+ do_early_zs_clear = !needs_depth_load && !needs_depth_store;
+#endif
+
if (do_early_zs_clear &&
vk_format_has_stencil(ds_attachment->desc.format)) {
bool needs_stencil_load =
@@ -905,10 +1068,20 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
*/
job->early_zs_clear = do_early_zs_clear;
+#if V3D_VERSION >= 71
+ uint32_t base_addr = 0;
+#endif
for (uint32_t i = 0; i < subpass->color_count; i++) {
uint32_t attachment_idx = subpass->color_attachments[i].attachment;
- if (attachment_idx == VK_ATTACHMENT_UNUSED)
+ if (attachment_idx == VK_ATTACHMENT_UNUSED) {
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.render_target_number = i;
+ rt.stride = 1; /* Unused */
+ }
+#endif
continue;
+ }
struct v3dv_image_view *iview =
state->attachments[attachment_idx].image_view;
@@ -920,10 +1093,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
const struct v3d_resource_slice *slice =
&image->planes[plane].slices[iview->vk.base_mip_level];
- const uint32_t *clear_color =
+ UNUSED const uint32_t *clear_color =
&state->attachments[attachment_idx].clear_value.color[0];
- uint32_t clear_pad = 0;
+ UNUSED uint32_t clear_pad = 0;
if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
slice->tiling == V3D_TILING_UIF_XOR) {
int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2;
@@ -937,6 +1110,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
}
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
clear.clear_color_low_32_bits = clear_color[0];
clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
@@ -960,22 +1134,74 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
clear.render_target_number = i;
};
}
+#endif
+
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.clear_color_low_bits = clear_color[0];
+ cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp,
+ &rt.internal_type_and_clamping);
+ rt.stride =
+ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
+ v3d_internal_bpp_words(rt.internal_bpp));
+ rt.base_address = base_addr;
+ rt.render_target_number = i;
+
+ /* base_addr in multiples of 512 bits. We divide by 8 because stride
+ * is in 128-bit units, but it is packing 2 rows worth of data, so we
+ * need to divide it by 2 so it is only 1 row, and then again by 4 so
+ * it is in 512-bit units.
+ */
+ base_addr += (tiling->tile_height * rt.stride) / 8;
+ }
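+      /* Worked example of the unit conversion above (illustrative numbers):
+       * with tile_height = 32 and rt.stride = 16 (128-bit units, each
+       * packing two rows), tile_height * stride = 512; dividing by 2
+       * (single row) and then by 4 (128-bit to 512-bit units), i.e. by 8,
+       * advances base_addr by 64 multiples of 512 bits.
+       */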
+
+ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+ ((uint64_t) clear_color[1]) |
+ (((uint64_t) (clear_color[2] & 0xff)) << 32);
+ rt.render_target_number = i;
+ }
+ }
+
+ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+ rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+ (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) |
+ (((uint64_t) (clear_color[3])) << 24);
+ rt.render_target_number = i;
+ }
+ }
+#endif
}
+#if V3D_VERSION >= 71
+   /* If we don't have any color RTs, we still need to emit one and flag
+    * it as unused by setting stride = 1.
+    */
+ if (subpass->color_count == 0) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.stride = 1;
+ }
+ }
+#endif
+
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 0, &rt.render_target_0_internal_bpp,
&rt.render_target_0_internal_type, &rt.render_target_0_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 1, &rt.render_target_1_internal_bpp,
&rt.render_target_1_internal_type, &rt.render_target_1_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 2, &rt.render_target_2_internal_bpp,
&rt.render_target_2_internal_type, &rt.render_target_2_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 3, &rt.render_target_3_internal_bpp,
&rt.render_target_3_internal_type, &rt.render_target_3_clamp);
}
+#endif
/* Ends rendering mode config. */
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
@@ -1036,10 +1262,15 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
}
if (cmd_buffer->state.tile_aligned_render_area &&
(i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
+#if V3D_VERSION == 42
cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = !job->early_zs_clear;
clear.clear_all_render_targets = true;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt);
+#endif
}
cl_emit(rcl, END_OF_TILE_MARKER, end);
}
@@ -1054,6 +1285,43 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
cl_emit(rcl, END_OF_RENDERING, end);
}
+void
+v3dX(viewport_compute_xform)(const VkViewport *viewport,
+ float scale[3],
+ float translate[3])
+{
+ float x = viewport->x;
+ float y = viewport->y;
+ float half_width = 0.5f * viewport->width;
+ float half_height = 0.5f * viewport->height;
+ double n = viewport->minDepth;
+ double f = viewport->maxDepth;
+
+ scale[0] = half_width;
+ translate[0] = half_width + x;
+ scale[1] = half_height;
+ translate[1] = half_height + y;
+
+ scale[2] = (f - n);
+ translate[2] = n;
+
+   /* It seems that if the scale is small enough the hardware won't clip
+    * correctly, so we work around this by choosing the smallest scale that
+    * seems to work.
+ *
+ * This case is exercised by CTS:
+ * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero
+ *
+ * V3D 7.x fixes this by using the new
+ * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND.
+ */
+#if V3D_VERSION <= 42
+ const float min_abs_scale = 0.0005f;
+ if (fabs(scale[2]) < min_abs_scale)
+ scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
+#endif
+}
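+
+/* Worked example of the transform above (illustrative values): a VkViewport
+ * of x = 0, y = 0, width = 800, height = 600, minDepth = 0.0, maxDepth = 1.0
+ * yields scale = { 400, 300, 1 } and translate = { 400, 300, 0 }, i.e. the
+ * usual NDC [-1, 1] to window-coordinate mapping.
+ */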
+
void
v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -1078,19 +1346,45 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
v3dv_return_if_oom(cmd_buffer, NULL);
+#if V3D_VERSION == 42
cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+ clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f;
+ clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f;
+ }
+#endif
float translate_z, scale_z;
v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0,
&translate_z, &scale_z);
+#if V3D_VERSION == 42
cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
clip.viewport_z_offset_zc_to_zs = translate_z;
clip.viewport_z_scale_zc_to_zs = scale_z;
}
+#endif
+
+#if V3D_VERSION >= 71
+ /* If the Z scale is too small guardband clipping may not clip correctly */
+ if (fabsf(scale_z) < 0.01f) {
+ cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) {
+ clip.viewport_z_offset_zc_to_zs = translate_z;
+ clip.viewport_z_scale_zc_to_zs = scale_z;
+ }
+ } else {
+ cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
+ clip.viewport_z_offset_zc_to_zs = translate_z;
+ clip.viewport_z_scale_zc_to_zs = scale_z;
+ }
+ }
+#endif
+
cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
/* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled,
* we are using OpenGL's [-1, 1] instead.
@@ -1205,14 +1499,48 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer)
cl_emit(&job->bcl, DEPTH_OFFSET, bias) {
bias.depth_offset_factor = dynamic->depth_bias.slope_factor;
bias.depth_offset_units = dynamic->depth_bias.constant_factor;
+#if V3D_VERSION <= 42
if (pipeline->depth_bias.is_z16)
bias.depth_offset_units *= 256.0f;
+#endif
bias.limit = dynamic->depth_bias.depth_bias_clamp;
}
cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS;
}
+void
+v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   /* No depthBounds support for v42, so this method is empty in that case.
+    *
+    * Note that this method is still called because v3dv_job_init flags all
+    * state as dirty. See the FIXME note at v3dv_job_init.
+    */
+
+#if V3D_VERSION >= 71
+ struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
+ assert(pipeline);
+
+ if (!pipeline->depth_bounds_test_enabled)
+ return;
+
+ struct v3dv_job *job = cmd_buffer->state.job;
+ assert(job);
+
+ v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS));
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
+ cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) {
+ bounds.lower_test_limit = dynamic->depth_bounds.min;
+ bounds.upper_test_limit = dynamic->depth_bounds.max;
+ }
+
+ cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BOUNDS;
+#endif
+}
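+
+/* For context, the limits consumed above come from standard Vulkan dynamic
+ * state; a minimal sketch (commandBuffer is an assumed handle):
+ *
+ *    // Recorded into cmd_buffer->state.dynamic.depth_bounds and emitted
+ *    // as DEPTH_BOUNDS_TEST_LIMITS on v71.
+ *    vkCmdSetDepthBounds(commandBuffer, 0.1f, 0.9f);
+ */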
+
void
v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -1256,10 +1584,13 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
assert(pipeline);
+ const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo;
+ const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver);
+
const uint32_t blend_packets_size =
cl_packet_length(BLEND_ENABLES) +
cl_packet_length(BLEND_CONSTANT_COLOR) +
- cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS;
+ cl_packet_length(BLEND_CFG) * max_color_rts;
v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size);
v3dv_return_if_oom(cmd_buffer, NULL);
@@ -1271,7 +1602,7 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
}
}
- for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
+ for (uint32_t i = 0; i < max_color_rts; i++) {
if (pipeline->blend.enables & (1 << i))
cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]);
}
@@ -1298,9 +1629,15 @@ v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
+ uint32_t color_write_mask = ~dynamic->color_write_enable |
+ pipeline->blend.color_write_masks;
+#if V3D_VERSION <= 42
+ /* Only 4 RTs */
+ color_write_mask &= 0xffff;
+#endif
+
cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
- mask.mask = (~dynamic->color_write_enable |
- pipeline->blend.color_write_masks) & 0xffff;
+ mask.mask = color_write_mask;
}
cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
@@ -1591,15 +1928,16 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
assert(pipeline);
- bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
-
v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
v3dv_return_if_oom(cmd_buffer, NULL);
cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
+#if V3D_VERSION == 42
+ bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
config.early_z_enable = enable_ez;
config.early_z_updates_enable = config.early_z_enable &&
pipeline->z_updates_enable;
+#endif
}
}
@@ -1845,7 +2183,9 @@ emit_gs_shader_state_record(struct v3dv_job *job,
gs_bin->prog_data.gs->base.threads == 4;
shader.geometry_bin_mode_shader_start_in_final_thread_section =
gs_bin->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
shader.geometry_bin_mode_shader_propagate_nans = true;
+#endif
shader.geometry_bin_mode_shader_uniforms_address =
gs_bin_uniforms;
@@ -1855,7 +2195,9 @@ emit_gs_shader_state_record(struct v3dv_job *job,
gs->prog_data.gs->base.threads == 4;
shader.geometry_render_mode_shader_start_in_final_thread_section =
gs->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
shader.geometry_render_mode_shader_propagate_nans = true;
+#endif
shader.geometry_render_mode_shader_uniforms_address =
gs_render_uniforms;
}
@@ -2031,10 +2373,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
pipeline->vpm_cfg.Gv);
}
+#if V3D_VERSION == 42
struct v3dv_bo *default_attribute_values =
pipeline->default_attribute_values != NULL ?
pipeline->default_attribute_values :
pipeline->device->default_attribute_float;
+#endif
cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
pipeline->shader_state_record, shader) {
@@ -2060,8 +2404,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
+#if V3D_VERSION == 42
shader.address_of_default_attribute_values =
v3dv_cl_address(default_attribute_values, 0);
+#endif
shader.any_shader_reads_hardware_written_primitive_id =
(pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
@@ -2370,40 +2716,3 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
buffer->mem_offset + offset);
}
}
-
-void
-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
- int rt,
- uint32_t *rt_bpp,
- uint32_t *rt_type,
- uint32_t *rt_clamp)
-{
- const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-
- assert(state->subpass_idx < state->pass->subpass_count);
- const struct v3dv_subpass *subpass =
- &state->pass->subpasses[state->subpass_idx];
-
- if (rt >= subpass->color_count)
- return;
-
- struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
- const uint32_t attachment_idx = attachment->attachment;
- if (attachment_idx == VK_ATTACHMENT_UNUSED)
- return;
-
- assert(attachment_idx < state->framebuffer->attachment_count &&
- attachment_idx < state->attachment_alloc_count);
- struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
- assert(vk_format_is_color(iview->vk.format));
-
- assert(iview->plane_count == 1);
- *rt_bpp = iview->planes[0].internal_bpp;
- *rt_type = iview->planes[0].internal_type;
- if (vk_format_is_int(iview->vk.view_format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
- else if (vk_format_is_srgb(iview->vk.view_format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
- else
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
-}
diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c
index e235983864c..1b50d51e19f 100644
--- a/src/broadcom/vulkan/v3dvx_device.c
+++ b/src/broadcom/vulkan/v3dvx_device.c
@@ -49,8 +49,8 @@ vk_to_v3d_compare_func[] = {
[VK_COMPARE_OP_ALWAYS] = V3D_COMPARE_FUNC_ALWAYS,
};
-
static union pipe_color_union encode_border_color(
+ const struct v3dv_device *device,
const VkSamplerCustomBorderColorCreateInfoEXT *bc_info)
{
const struct util_format_description *desc =
@@ -77,12 +77,28 @@ static union pipe_color_union encode_border_color(
* colors so we need to fix up the swizzle manually for this case.
*/
uint8_t swizzle[4];
- if (v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) &&
+ const bool v3d_has_reverse_swap_rb_bits =
+ v3dv_texture_shader_state_has_rb_swap_reverse_bits(device);
+ if (!v3d_has_reverse_swap_rb_bits &&
+ v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) &&
v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle)) {
swizzle[0] = PIPE_SWIZZLE_W;
swizzle[1] = PIPE_SWIZZLE_X;
swizzle[2] = PIPE_SWIZZLE_Y;
swizzle[3] = PIPE_SWIZZLE_Z;
+ }
+ /* In v3d 7.x we no longer have a reverse flag for the border color. Instead
+    * we have to use the new reverse and swap_r/b flags in the texture shader
+    * state, which will apply the format swizzle automatically when sampling
+    * the border color too, so we should not apply it manually here.
+ */
+ else if (v3d_has_reverse_swap_rb_bits &&
+ (v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle) ||
+ v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle))) {
+ swizzle[0] = PIPE_SWIZZLE_X;
+ swizzle[1] = PIPE_SWIZZLE_Y;
+ swizzle[2] = PIPE_SWIZZLE_Z;
+ swizzle[3] = PIPE_SWIZZLE_W;
} else {
memcpy(swizzle, format->planes[0].swizzle, sizeof (swizzle));
}
@@ -118,7 +134,11 @@ static union pipe_color_union encode_border_color(
(1 << (desc->channel[i].size - 1)) - 1);
}
- /* convert from float to expected format */
+#if V3D_VERSION <= 42
+ /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions
+ * for us. In V3D 4.x we need to manually convert floating point color
+ * values to the expected format.
+ */
if (vk_format_is_srgb(bc_info->format) ||
vk_format_is_compressed(bc_info->format)) {
for (int i = 0; i < 4; i++)
@@ -170,12 +190,14 @@ static union pipe_color_union encode_border_color(
}
}
}
+#endif
return border;
}
void
-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
+v3dX(pack_sampler_state)(const struct v3dv_device *device,
+ struct v3dv_sampler *sampler,
const VkSamplerCreateInfo *pCreateInfo,
const VkSamplerCustomBorderColorCreateInfoEXT *bc_info)
{
@@ -217,7 +239,7 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
s.border_color_mode = border_color_mode;
if (s.border_color_mode == V3D_BORDER_COLOR_FOLLOWS) {
- union pipe_color_union border = encode_border_color(bc_info);
+ union pipe_color_union border = encode_border_color(device, bc_info);
s.border_color_word_0 = border.ui[0];
s.border_color_word_1 = border.ui[1];
@@ -253,11 +275,13 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
const struct v3dv_framebuffer *framebuffer,
const struct v3dv_cmd_buffer_attachment_state *attachments,
const struct v3dv_subpass *subpass,
- uint8_t *max_bpp,
+ uint8_t *max_internal_bpp,
+ uint8_t *total_color_bpp,
bool *msaa)
{
STATIC_ASSERT(V3D_INTERNAL_BPP_32 == 0);
- *max_bpp = V3D_INTERNAL_BPP_32;
+ *max_internal_bpp = V3D_INTERNAL_BPP_32;
+ *total_color_bpp = 0;
*msaa = false;
if (subpass) {
@@ -270,8 +294,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
assert(att);
assert(att->plane_count == 1);
- if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT)
- *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp);
+ if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
+ const uint32_t internal_bpp = att->planes[0].internal_bpp;
+ *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp);
+ *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
+ }
if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
*msaa = true;
@@ -285,7 +312,6 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
*msaa = true;
}
-
return;
}
@@ -295,8 +321,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
assert(att);
assert(att->plane_count == 1);
- if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT)
- *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp);
+ if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
+ const uint32_t internal_bpp = att->planes[0].internal_bpp;
+ *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp);
+ *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
+ }
if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
*msaa = true;
diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c
index 80a3e5bfde8..de984e81220 100644
--- a/src/broadcom/vulkan/v3dvx_image.c
+++ b/src/broadcom/vulkan/v3dvx_image.c
@@ -76,8 +76,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]);
tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]);
- tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse;
-
tex.texture_type = image_view->format->planes[plane].tex_type;
if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
@@ -110,8 +108,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64;
- tex.srgb = vk_format_is_srgb(image_view->vk.view_format);
-
/* At this point we don't have the job. That's the reason the first
* parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
* add the bo to the job. This also means that we need to add manually
@@ -122,6 +118,51 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
v3dv_layer_offset(image, 0, image_view->vk.base_array_layer,
iplane);
tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+ bool is_srgb = vk_format_is_srgb(image_view->vk.format);
+
+ /* V3D 4.x doesn't have the reverse and swap_r/b bits, so we compose
+ * the reverse and/or swap_r/b swizzle from the format table with the
+ * image view swizzle. This, however, doesn't work for border colors,
+ * for that there is the reverse_standard_border_color.
+ *
+ * In v3d 7.x, however, there is no reverse_standard_border_color bit,
+ * since the reverse and swap_r/b bits also affect border colors. It is
+ * because of this that we absolutely need to use these bits with
+       * reversed and swapped formats, since that's the only way to ensure
+ * correct border colors. In that case we don't want to program the
+ * swizzle to the composition of the format swizzle and the view
+ * swizzle like we do in v3d 4.x, since the format swizzle is applied
+ * via the reverse and swap_r/b bits.
+ */
+#if V3D_VERSION == 42
+ tex.srgb = is_srgb;
+ tex.reverse_standard_border_color =
+ image_view->planes[plane].channel_reverse;
+#endif
+#if V3D_VERSION >= 71
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+
+ tex.reverse = image_view->planes[plane].channel_reverse;
+ tex.r_b_swap = image_view->planes[plane].swap_rb;
+
+ if (tex.reverse || tex.r_b_swap) {
+ tex.swizzle_r =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[0]);
+ tex.swizzle_g =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[1]);
+ tex.swizzle_b =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[2]);
+ tex.swizzle_a =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[3]);
+ }
+
+ tex.chroma_offset_x = 1;
+ tex.chroma_offset_y = 1;
+ /* See comment in XML field definition for rationale of the shifts */
+ tex.texture_base_pointer_cb = base_offset >> 6;
+ tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
}
}
}
@@ -166,7 +207,14 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
assert(buffer_view->format->plane_count == 1);
tex.texture_type = buffer_view->format->planes[0].tex_type;
- tex.srgb = vk_format_is_srgb(buffer_view->vk_format);
+
+ bool is_srgb = vk_format_is_srgb(buffer_view->vk_format);
+#if V3D_VERSION == 42
+ tex.srgb = is_srgb;
+#endif
+#if V3D_VERSION >= 71
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+#endif
/* At this point we don't have the job. That's the reason the first
* parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
@@ -179,5 +227,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
buffer_view->offset;
tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+#if V3D_VERSION >= 71
+ tex.chroma_offset_x = 1;
+ tex.chroma_offset_y = 1;
+ /* See comment in XML field definition for rationale of the shifts */
+ tex.texture_base_pointer_cb = base_offset >> 6;
+ tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
}
}
diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
index 04147b82cbd..858096f9e4b 100644
--- a/src/broadcom/vulkan/v3dvx_meta_common.c
+++ b/src/broadcom/vulkan/v3dvx_meta_common.c
@@ -26,6 +26,7 @@
#include "broadcom/common/v3d_macros.h"
#include "broadcom/common/v3d_tfu.h"
+#include "broadcom/common/v3d_util.h"
#include "broadcom/cle/v3dx_pack.h"
#include "broadcom/compiler/v3d_compiler.h"
@@ -58,12 +59,25 @@ emit_rcl_prologue(struct v3dv_job *job,
config.number_of_render_targets = 1;
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+   /* FIXME: ideally the following assert would live in the packet header
+    * (it is generic, so it also applies to GL), but that would require
+    * expanding gen_pack_header.
+    */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
config.internal_depth_type = fb->internal_depth_type;
}
+ const uint32_t *color = NULL;
if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) {
- uint32_t clear_pad = 0;
+ UNUSED uint32_t clear_pad = 0;
if (clear_info->image) {
const struct v3dv_image *image = clear_info->image;
@@ -88,7 +102,9 @@ emit_rcl_prologue(struct v3dv_job *job,
}
}
- const uint32_t *color = &clear_info->clear_value->color[0];
+ color = &clear_info->clear_value->color[0];
+
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
clear.clear_color_low_32_bits = color[0];
clear.clear_color_next_24_bits = color[1] & 0x00ffffff;
@@ -112,13 +128,49 @@ emit_rcl_prologue(struct v3dv_job *job,
clear.render_target_number = 0;
};
}
+#endif
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
rt.render_target_0_internal_bpp = tiling->internal_bpp;
rt.render_target_0_internal_type = fb->internal_type;
rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
}
+#endif
+
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ if (color)
+ rt.clear_color_low_bits = color[0];
+ rt.internal_bpp = tiling->internal_bpp;
+ rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type,
+ fb->vk_format);
+ rt.stride =
+ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
+ v3d_internal_bpp_words(rt.internal_bpp));
+ rt.base_address = 0;
+ rt.render_target_number = 0;
+ }
+
+ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+ ((uint64_t) color[1]) |
+ (((uint64_t) (color[2] & 0xff)) << 32);
+ rt.render_target_number = 0;
+ }
+ }
+
+ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_128) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+ rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+ (((uint64_t) (color[2] & 0xffffff00)) >> 8) |
+ (((uint64_t) (color[3])) << 24);
+ rt.render_target_number = 0;
+ }
+ }
+#endif
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f;
@@ -179,10 +231,15 @@ emit_frame_setup(struct v3dv_job *job,
*/
if (clear_value &&
(i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
+#if V3D_VERSION == 42
cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = true;
clear.clear_all_render_targets = true;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear);
+#endif
}
cl_emit(rcl, END_OF_TILE_MARKER, end);
}
@@ -893,6 +950,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
tfu.iia |= src_offset;
+#if V3D_VERSION <= 42
if (src_tiling == V3D_TILING_RASTER) {
tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT;
} else {
@@ -901,12 +959,46 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
V3D33_TFU_ICFG_FORMAT_SHIFT;
}
tfu.icfg |= format_plane->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT;
+#endif
+#if V3D_VERSION >= 71
+ if (src_tiling == V3D_TILING_RASTER) {
+ tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT;
+ } else {
+ tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE +
+ (src_tiling - V3D_TILING_LINEARTILE)) <<
+ V3D71_TFU_ICFG_IFORMAT_SHIFT;
+ }
+ tfu.icfg |= format_plane->tex_type << V3D71_TFU_ICFG_OTYPE_SHIFT;
+#endif
tfu.ioa = dst_offset;
+#if V3D_VERSION <= 42
tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE +
(dst_tiling - V3D_TILING_LINEARTILE)) <<
V3D33_TFU_IOA_FORMAT_SHIFT;
+#endif
+
+#if V3D_VERSION >= 71
+ tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE +
+ (dst_tiling - V3D_TILING_LINEARTILE)) <<
+ V3D71_TFU_IOC_FORMAT_SHIFT;
+
+ switch (dst_tiling) {
+ case V3D_TILING_UIF_NO_XOR:
+ case V3D_TILING_UIF_XOR:
+ tfu.v71.ioc |=
+ (dst_padded_height_or_stride / (2 * v3d_utile_height(dst_cpp))) <<
+ V3D71_TFU_IOC_STRIDE_SHIFT;
+ break;
+ case V3D_TILING_RASTER:
+ tfu.v71.ioc |= (dst_padded_height_or_stride / dst_cpp) <<
+ V3D71_TFU_IOC_STRIDE_SHIFT;
+ break;
+ default:
+ break;
+ }
+#endif
switch (src_tiling) {
case V3D_TILING_UIF_NO_XOR:
@@ -923,6 +1015,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
/* The TFU can handle raster sources but always produces UIF results */
assert(dst_tiling != V3D_TILING_RASTER);
+#if V3D_VERSION <= 42
/* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
* OPAD field for the destination (how many extra UIF blocks beyond
* those necessary to cover the height).
@@ -934,6 +1027,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
uif_block_h;
tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT;
}
+#endif
v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
}
@@ -1314,8 +1408,9 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t width, height;
framebuffer_size_for_pixel_count(num_items, &width, &height);
- v3dv_job_start_frame(job, width, height, 1, true, true,
- 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, 1, true, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type,
@@ -1361,8 +1456,9 @@ v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t width, height;
framebuffer_size_for_pixel_count(num_items, &width, &height);
- v3dv_job_start_frame(job, width, height, 1, true, true,
- 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, 1, true, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT,
diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
index 5d32d414ed8..ad22add155d 100644
--- a/src/broadcom/vulkan/v3dvx_pipeline.c
+++ b/src/broadcom/vulkan/v3dvx_pipeline.c
@@ -227,6 +227,45 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
ds_info ? ds_info->stencilTestEnable && has_ds_attachment: false;
pipeline->z_updates_enable = config.z_updates_enable;
+
+#if V3D_VERSION >= 71
+ /* From the Vulkan spec:
+ *
+ * "depthClampEnable controls whether to clamp the fragments depth
+ * values as described in Depth Test. If the pipeline is not created
+ * with VkPipelineRasterizationDepthClipStateCreateInfoEXT present
+ * then enabling depth clamp will also disable clipping primitives to
+    *     the z planes of the frustum as described in Primitive Clipping.
+ * Otherwise depth clipping is controlled by the state set in
+ * VkPipelineRasterizationDepthClipStateCreateInfoEXT."
+ *
+ * Note: neither depth clamping nor VK_EXT_depth_clip_enable are actually
+ * supported in the driver yet, so in practice we are always enabling Z
+ * clipping for now.
+ */
+ bool z_clamp_enable = rs_info && rs_info->depthClampEnable;
+ bool z_clip_enable = false;
+ const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info =
+ ds_info ? vk_find_struct_const(ds_info->pNext,
+ PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT) :
+ NULL;
+ if (clip_info)
+ z_clip_enable = clip_info->depthClipEnable;
+ else if (!z_clamp_enable)
+ z_clip_enable = true;
+
+ if (z_clip_enable) {
+ config.z_clipping_mode = pipeline->negative_one_to_one ?
+ V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_ZERO_TO_ONE;
+ } else {
+ config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE;
+ }
+
+ config.z_clamp_mode = z_clamp_enable;
+
+ config.depth_bounds_test_enable =
+ ds_info && ds_info->depthBoundsTestEnable && has_ds_attachment;
+#endif
};
}
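+
+/* A condensed view of the clip-enable decision above (restating the logic,
+ * not new behaviour):
+ *
+ *    clip_info present         -> z_clip_enable = clip_info->depthClipEnable
+ *    no clip_info, no z clamp  -> z_clip_enable = true
+ *    no clip_info, z clamp on  -> z_clip_enable = false
+ */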
@@ -360,7 +399,7 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline,
static void
pack_shader_state_record(struct v3dv_pipeline *pipeline)
{
- assert(sizeof(pipeline->shader_state_record) ==
+ assert(sizeof(pipeline->shader_state_record) >=
cl_packet_length(GL_SHADER_STATE_RECORD));
struct v3d_fs_prog_data *prog_data_fs =
@@ -435,15 +474,16 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
shader.number_of_varyings_in_fragment_shader =
prog_data_fs->num_inputs;
- shader.coordinate_shader_propagate_nans = true;
- shader.vertex_shader_propagate_nans = true;
- shader.fragment_shader_propagate_nans = true;
-
/* Note: see previous note about addresses */
/* shader.coordinate_shader_code_address */
/* shader.vertex_shader_code_address */
/* shader.fragment_shader_code_address */
+#if V3D_VERSION == 42
+ shader.coordinate_shader_propagate_nans = true;
+ shader.vertex_shader_propagate_nans = true;
+ shader.fragment_shader_propagate_nans = true;
+
/* FIXME: Use combined input/output size flag in the common case (also
* on v3d, see v3dx_draw).
*/
@@ -451,13 +491,25 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
prog_data_vs_bin->separate_segments;
shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
prog_data_vs->separate_segments;
-
shader.coordinate_shader_input_vpm_segment_size =
prog_data_vs_bin->separate_segments ?
prog_data_vs_bin->vpm_input_size : 1;
shader.vertex_shader_input_vpm_segment_size =
prog_data_vs->separate_segments ?
prog_data_vs->vpm_input_size : 1;
+#endif
+
+   /* On V3D 7.1 there isn't a specific flag to select whether we are using
+    * shared or separate segments. We just set vpm_input_size to 0 and the
+    * output size to the maximum needed, which should already be properly
+    * set on prog_data_vs_bin.
+    */
+#if V3D_VERSION == 71
+ shader.coordinate_shader_input_vpm_segment_size =
+ prog_data_vs_bin->vpm_input_size;
+ shader.vertex_shader_input_vpm_segment_size =
+ prog_data_vs->vpm_input_size;
+#endif
shader.coordinate_shader_output_vpm_segment_size =
prog_data_vs_bin->vpm_output_size;
@@ -659,3 +711,76 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
}
}
}
+
+#if V3D_VERSION == 42
+static bool
+pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
+{
+ for (uint8_t i = 0; i < pipeline->va_count; i++) {
+ if (vk_format_is_int(pipeline->va[i].vk_format))
+ return true;
+ }
+ return false;
+}
+#endif
+
+bool
+v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline)
+{
+#if V3D_VERSION == 42
+ return pipeline_has_integer_vertex_attrib(pipeline);
+#endif
+
+ return false;
+}
+
+/* @pipeline can be NULL, in which case we assume the most common case. For
+ * example, for v42 we then assume that all the attributes have a float
+ * format (we only create an all-float BO once and we reuse it with all
+ * float pipelines); otherwise we look at the actual type of each attribute
+ * used with the specific pipeline passed in.
+ */
+struct v3dv_bo *
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
+ struct v3dv_pipeline *pipeline)
+{
+#if V3D_VERSION >= 71
+ return NULL;
+#endif
+
+ uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
+ struct v3dv_bo *bo;
+
+ bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
+
+ if (!bo) {
+ fprintf(stderr, "failed to allocate memory for the default "
+ "attribute values\n");
+ return NULL;
+ }
+
+ bool ok = v3dv_bo_map(device, bo, size);
+ if (!ok) {
+ fprintf(stderr, "failed to map default attribute values buffer\n");
+ return NULL;
+ }
+
+ uint32_t *attrs = bo->map;
+ uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
+ for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
+ attrs[i * 4 + 0] = 0;
+ attrs[i * 4 + 1] = 0;
+ attrs[i * 4 + 2] = 0;
+ VkFormat attr_format =
+ pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
+ if (i < va_count && vk_format_is_int(attr_format)) {
+ attrs[i * 4 + 3] = 1;
+ } else {
+ attrs[i * 4 + 3] = fui(1.0);
+ }
+ }
+
+ v3dv_bo_unmap(device, bo);
+
+ return bo;
+}
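+
+/* Layout sketch of the BO created above (v42 only): MAX_VERTEX_ATTRIBS vec4
+ * slots, each defaulting to (0, 0, 0, 1), where the w component is the
+ * integer 1 for integer attributes and the bit pattern of 1.0f (fui(1.0))
+ * otherwise.
+ */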
diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
index ad8ddfa5731..0f5887eab93 100644
--- a/src/broadcom/vulkan/v3dvx_private.h
+++ b/src/broadcom/vulkan/v3dvx_private.h
@@ -54,6 +54,9 @@ v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer);
void
v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer);
+void
+v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer);
+
void
v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer);
@@ -125,17 +128,11 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color,
uint32_t internal_size,
uint32_t *hw_color);
-void
-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
- int rt,
- uint32_t *rt_bpp,
- uint32_t *rt_type,
- uint32_t *rt_clamp);
-
/* Used at v3dv_device */
void
-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
+v3dX(pack_sampler_state)(const struct v3dv_device *device,
+ struct v3dv_sampler *sampler,
const VkSamplerCreateInfo *pCreateInfo,
const VkSamplerCustomBorderColorCreateInfoEXT *bc_info);
@@ -143,7 +140,9 @@ void
v3dX(framebuffer_compute_internal_bpp_msaa)(const struct v3dv_framebuffer *framebuffer,
const struct v3dv_cmd_buffer_attachment_state *attachments,
const struct v3dv_subpass *subpass,
- uint8_t *max_bpp, bool *msaa);
+ uint8_t *max_internal_bpp,
+ uint8_t *total_color_bpp,
+ bool *msaa);
#ifdef DEBUG
void
@@ -313,10 +312,24 @@ void
v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
const VkPipelineVertexInputStateCreateInfo *vi_info,
const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info);
+
+bool
+v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline);
+
+struct v3dv_bo *
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
+ struct v3dv_pipeline *pipeline);
+
/* Used at v3dv_queue */
void
v3dX(job_emit_noop)(struct v3dv_job *job);
+/* Used at v3dv_query */
+VkResult
+v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount,
+ VkPerformanceCounterKHR *pCounters,
+ VkPerformanceCounterDescriptionKHR *pCounterDescriptions);
+
/* Used at v3dv_descriptor_set, and other descriptor set utils */
uint32_t v3dX(descriptor_bo_size)(VkDescriptorType type);
@@ -325,3 +338,21 @@ uint32_t v3dX(max_descriptor_bo_size)(void);
uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane);
uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane);
+
+/* General utils */
+
+uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ VkFormat vk_format);
+
+#define V3D42_CLIPPER_XY_GRANULARITY 256.0f
+#define V3D71_CLIPPER_XY_GRANULARITY 64.0f
+
+void
+v3dX(viewport_compute_xform)(const VkViewport *viewport,
+ float scale[3],
+ float translate[3]);
diff --git a/src/broadcom/vulkan/v3dvx_query.c b/src/broadcom/vulkan/v3dvx_query.c
new file mode 100644
index 00000000000..e59a1e84ff6
--- /dev/null
+++ b/src/broadcom/vulkan/v3dvx_query.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright © 2023 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3dv_private.h"
+
+#include "common/v3d_performance_counters.h"
+
+VkResult
+v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount,
+ VkPerformanceCounterKHR *pCounters,
+ VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
+{
+ uint32_t desc_count = *pCounterCount;
+
+ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
+ out, pCounters, pCounterCount);
+ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
+ out_desc, pCounterDescriptions, &desc_count);
+
+ for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) {
+ vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
+ counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
+ counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
+ counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
+
+ unsigned char sha1_result[20];
+ _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME],
+ strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]),
+ sha1_result);
+
+ memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
+ }
+
+ vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
+ &out_desc, desc) {
+ desc->flags = 0;
+ snprintf(desc->name, sizeof(desc->name), "%s",
+ v3d_performance_counters[i][V3D_PERFCNT_NAME]);
+ snprintf(desc->category, sizeof(desc->category), "%s",
+ v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]);
+ snprintf(desc->description, sizeof(desc->description), "%s",
+ v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]);
+ }
+ }
+
+ return vk_outarray_status(&out);
+}
diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c
index efe63de425c..6eed2de9d54 100644
--- a/src/broadcom/vulkan/v3dvx_queue.c
+++ b/src/broadcom/vulkan/v3dvx_queue.c
@@ -29,7 +29,8 @@
void
v3dX(job_emit_noop)(struct v3dv_job *job)
{
- v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, V3D_INTERNAL_BPP_32, false);
+ v3dv_job_start_frame(job, 1, 1, 1, true, true, 1,
+ V3D_INTERNAL_BPP_32, 4, false);
v3dX(job_emit_binning_flush)(job);
struct v3dv_cl *rcl = &job->rcl;
@@ -42,14 +43,29 @@ v3dX(job_emit_noop)(struct v3dv_job *job)
config.image_height_pixels = 1;
config.number_of_render_targets = 1;
config.multisample_mode_4x = false;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = 3; /* Tile size 64 */
+ config.log2_tile_height = 3; /* Tile size 64 */
+#endif
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32;
rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8;
rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.internal_bpp = V3D_INTERNAL_BPP_32;
+ rt.internal_type_and_clamping = V3D_RENDER_TARGET_TYPE_CLAMP_8;
+ rt.stride = 1; /* Unused RT */
+ }
+#endif
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
clear.z_clear_value = 1.0f;
diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py
index e6383b67737..46395d79a89 100644
--- a/src/compiler/nir/nir_constant_expressions.py
+++ b/src/compiler/nir/nir_constant_expressions.py
@@ -62,6 +62,8 @@ template = """\
#include "util/softfloat.h"
#include "util/bigmath.h"
#include "util/format/format_utils.h"
+#include "util/format_r11g11b10f.h"
+#include "util/u_math.h"
#include "nir_constant_expressions.h"
/**
@@ -277,6 +279,110 @@ unpack_half_1x16(uint16_t u)
return _mesa_half_to_float(u);
}
+/* Broadcom v3d specific instructions */
+/**
+ * Packs three f16 components, taken from two 2x16 sources, into r11g11b10f
+ */
+static uint32_t v11fpack_v3d(const uint32_t src0,
+ const uint32_t src1)
+{
+ float rgb[3];
+
+ rgb[0] = unpack_half_1x16((src0 & 0xffff));
+ rgb[1] = unpack_half_1x16((src0 >> 16));
+ rgb[2] = unpack_half_1x16((src1 & 0xffff));
+
+ return float3_to_r11g11b10f(rgb);
+}
+
+/**
+ * The three helpers below are basically wrappers over pack_s/unorm_1x8/1x16,
+ * except that they receive the value as raw half/float bits instead of a float.
+ */
+static uint8_t _mesa_half_to_snorm8(uint16_t val)
+{
+ float x = _mesa_half_to_float(val);
+
+ return pack_snorm_1x8(x);
+}
+
+static uint16_t _mesa_float_to_snorm16(uint32_t val)
+{
+ union fi aux;
+ aux.ui = val;
+ return pack_snorm_1x16(aux.f);
+}
+
+static uint16_t _mesa_float_to_unorm16(uint32_t val)
+{
+ union fi aux;
+ aux.ui = val;
+ return pack_unorm_1x16(aux.f);
+}
+
+/* FIXME: the implementation of vftounorm10hi/lo below is somewhat verbose.
+ * There is likely a simpler way to implement it.
+ */
+static uint32_t float_pack16_v3d(uint32_t f32)
+{
+ float f = uif(f32);
+ return _mesa_float_to_half(f);
+}
+
+static uint32_t float_unpack16_v3d(uint32_t f16)
+{
+ float f = _mesa_half_to_float(f16);
+ return fui(f);
+}
+
+static uint32_t vfpack_v3d(uint32_t a, uint32_t b)
+{
+ return float_pack16_v3d(b) << 16 | float_pack16_v3d(a);
+}
+
+static uint32_t vfsat_v3d(uint32_t a)
+{
+ return vfpack_v3d(
+ fui(SATURATE(_mesa_half_to_float(a & 0xffff))),
+ fui(SATURATE(_mesa_half_to_float(a >> 16))));
+}
+
+static uint32_t fmul_v3d(uint32_t a, uint32_t b)
+{
+ float f = uif(a);
+ float g = uif(b);
+
+ float x = f * g;
+
+ return fui(x);
+}
+
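+/* Helpers to apply a scalar f32 operation to both 16-bit halves of a 2x16
+ * value: L/H unpack the low/high half to f32 bits, V applies the operation
+ * per half and repacks the results as 2x16.
+ */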
+#define L(x) float_unpack16_v3d((x) & 0xffff)
+#define H(x) float_unpack16_v3d((x) >> 16)
+#define V(f,a,b) vfpack_v3d(f(L(a), L(b)), f(H(a), H(b)))
+
+static uint32_t vfmul_v3d(uint32_t a, uint32_t b)
+{
+ return V(fmul_v3d, a, b);
+}
+
+/* Convert 2x16-bit floating point to 2x10-bit unorm */
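+/* Note (an observation based on the f16 encoding): multiplying the saturated
+ * input by 0x03ff (the f16 subnormal 1023 * 2^-24) gives a subnormal result
+ * whose mantissa bits are round(x * 1023), so the bit pattern of each half
+ * is directly the 10-bit unorm value.
+ */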
+static uint32_t vftounorm10lo(uint32_t src0)
+{
+ return vfmul_v3d(vfsat_v3d(src0), 0x03ff03ff);
+}
+
+/*
+ * Convert 2x16-bit floating point to one 2-bit and one
+ * 10-bit unorm
+ */
+static uint32_t vftounorm10hi(uint32_t src0)
+{
+ return vfmul_v3d(vfsat_v3d(src0), 0x000303ff);
+}
+
+
/* Some typed vector structures to make things like src0.y work */
typedef int8_t int1_t;
typedef uint8_t uint1_t;
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index e4d87aa6126..63aa7cfa315 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -1393,6 +1393,50 @@ for (int i = 0; i < 32; i += 8) {
}
""")
+# v3d-specific opcodes
+
+# v3d-specific (v71) instruction that packs three f16 components from two 2x16
+# floating point sources into r11g11b10, rounding to nearest even
+binop_convert("v11fpack_v3d", tuint32, tuint32, "",
+ "v11fpack_v3d(src0, src1)")
+
+# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit integer. The
+# difference with pack_32_2x16_split is that the sources are 32 bit too: it
+# receives two 32-bit integers and packs their lower halfwords as 2x16 into a
+# 32-bit result.
+binop_horiz("vpack_v3d", 1, tuint32, 1, tuint32, 1, tuint32,
+ "(src0.x & 0xffff) | (src1.x << 16)")
+
+# v3d-specific (v71) instruction that packs parts of 2 2x16 integers into r10g10b10a2
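+# dst[9:0]   = src0[9:0]
+# dst[19:10] = src0[25:16]
+# dst[29:20] = src1[9:0]
+# dst[31:30] = src1[17:16]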
+binop_convert("v10pack_v3d", tuint32, tuint32, "",
+ "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3ff) << 30")
+
+# v3d-specific (v71) instruction that packs 2 2x16 bit integers into 4x8 bits:
+# dst[7:0] = src0[7:0]
+# dst[15:8] = src0[23:16]
+# dst[23:16] = src1[7:0]
+# dst[31:24] = src1[23:16]
+opcode("v8pack_v3d", 0, tuint32, [0, 0], [tuint32, tuint32],
+ False, "",
+ "(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | (src1 & 0x000000ff) << 16 | (src1 & 0x00ff0000) << 8")
+
+# v3d-specific (v71) instructions to convert 2x16 floating point to 2x8 bit unorm/snorm
+unop("vftounorm8_v3d", tuint32,
+ "_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 8) << 16)")
+unop("vftosnorm8_v3d", tuint32,
+ "_mesa_half_to_snorm(src0 & 0xffff, 8) | (_mesa_half_to_snorm(src0 >> 16, 8) << 16)")
+
+# v3d-specific (v71) instructions to convert 32-bit floating point to 16 bit unorm/snorm
+unop("ftounorm16_v3d", tuint32, "_mesa_float_to_unorm16(src0)")
+unop("ftosnorm16_v3d", tuint32, "_mesa_float_to_snorm16(src0)")
+
+# v3d-specific (v71) instructions to convert 2x16 bit floating points to 2x10 bit unorm
+unop("vftounorm10lo_v3d", tuint32, "vftounorm10lo(src0)")
+
+# v3d-specific (v71) instructions to convert 2x16 bit floating points to one 2-bit
+# and one 10 bit unorm
+unop("vftounorm10hi_v3d", tuint32, "vftounorm10hi(src0)")
+
# Mali-specific opcodes
unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)"))
diff --git a/src/gallium/drivers/v3d/driinfo_v3d.h b/src/gallium/drivers/v3d/driinfo_v3d.h
index 147ad0b49bd..8f989e8aa57 100644
--- a/src/gallium/drivers/v3d/driinfo_v3d.h
+++ b/src/gallium/drivers/v3d/driinfo_v3d.h
@@ -2,4 +2,6 @@
DRI_CONF_SECTION_MISCELLANEOUS
DRI_CONF_V3D_NONMSAA_TEXTURE_SIZE_LIMIT(false)
+ DRI_CONF_V3D_MAINTAIN_IGNORABLE_SCANOUT(false)
+ DRI_CONF_V3D_IS_XSERVER_PROCESS(false)
DRI_CONF_SECTION_END
diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build
index dfa1e88097b..e47682db1aa 100644
--- a/src/gallium/drivers/v3d/meson.build
+++ b/src/gallium/drivers/v3d/meson.build
@@ -34,7 +34,6 @@ files_libv3d = files(
'v3d_query.c',
'v3d_query.h',
'v3d_query_pipe.c',
- 'v3d_query_perfcnt.c',
'v3d_resource.c',
'v3d_resource.h',
'v3d_screen.c',
@@ -47,8 +46,10 @@ files_per_version = files(
'v3dx_emit.c',
'v3dx_format_table.c',
'v3dx_job.c',
+ 'v3dx_query_perfcnt.c',
'v3dx_rcl.c',
'v3dx_state.c',
+ 'v3dx_tfu.c',
)
v3d_args = ['-DV3D_BUILD_NEON']
@@ -58,7 +59,17 @@ if dep_v3dv3.found()
v3d_args += '-DUSE_V3D_SIMULATOR'
endif
-v3d_versions = ['33', '42']
+v3d_versions = ['33', '42', '71']
+
+v3d_deps = [dep_v3dv3, dep_libdrm, dep_valgrind, idep_nir_headers]
+
+if with_platform_x11
+ v3d_deps += dep_xcb
+endif
+
+if with_platform_wayland
+ v3d_deps += dep_wayland_client
+endif
per_version_libs = []
foreach ver : v3d_versions
@@ -71,7 +82,7 @@ foreach ver : v3d_versions
],
c_args : [v3d_args, '-DV3D_VERSION=' + ver],
gnu_symbol_visibility : 'hidden',
- dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind, idep_nir_headers],
+ dependencies : v3d_deps,
)
endforeach
@@ -94,10 +105,7 @@ libv3d = static_library(
c_args : [v3d_args],
cpp_args : [v3d_args],
gnu_symbol_visibility : 'hidden',
- dependencies : [
- dep_v3dv3, dep_libdrm, dep_valgrind,
- idep_nir_headers, idep_mesautil,
- ],
+ dependencies : v3d_deps + idep_mesautil,
link_with: [per_version_libs],
)
diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c
index b7dc56a044e..ee3c14b154c 100644
--- a/src/gallium/drivers/v3d/v3d_blit.c
+++ b/src/gallium/drivers/v3d/v3d_blit.c
@@ -210,140 +210,6 @@ v3d_stencil_blit(struct pipe_context *ctx, struct pipe_blit_info *info)
info->mask &= ~PIPE_MASK_S;
}
-static bool
-v3d_tfu(struct pipe_context *pctx,
- struct pipe_resource *pdst,
- struct pipe_resource *psrc,
- unsigned int src_level,
- unsigned int base_level,
- unsigned int last_level,
- unsigned int src_layer,
- unsigned int dst_layer,
- bool for_mipmap)
-{
- struct v3d_context *v3d = v3d_context(pctx);
- struct v3d_screen *screen = v3d->screen;
- struct v3d_resource *src = v3d_resource(psrc);
- struct v3d_resource *dst = v3d_resource(pdst);
- struct v3d_resource_slice *src_base_slice = &src->slices[src_level];
- struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level];
- int msaa_scale = pdst->nr_samples > 1 ? 2 : 1;
- int width = u_minify(pdst->width0, base_level) * msaa_scale;
- int height = u_minify(pdst->height0, base_level) * msaa_scale;
- enum pipe_format pformat;
-
- if (psrc->format != pdst->format)
- return false;
- if (psrc->nr_samples != pdst->nr_samples)
- return false;
-
- /* Can't write to raster. */
- if (dst_base_slice->tiling == V3D_TILING_RASTER)
- return false;
-
- /* When using TFU for blit, we are doing exact copies (both input and
- * output format must be the same, no scaling, etc), so there is no
- * pixel format conversions. Thus we can rewrite the format to use one
- * that is TFU compatible based on its texel size.
- */
- if (for_mipmap) {
- pformat = pdst->format;
- } else {
- switch (dst->cpp) {
- case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT; break;
- case 8: pformat = PIPE_FORMAT_R16G16B16A16_FLOAT; break;
- case 4: pformat = PIPE_FORMAT_R32_FLOAT; break;
- case 2: pformat = PIPE_FORMAT_R16_FLOAT; break;
- case 1: pformat = PIPE_FORMAT_R8_UNORM; break;
- default: unreachable("unsupported format bit-size"); break;
- };
- }
-
- uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat);
- struct v3d_device_info *devinfo = &screen->devinfo;
-
- if (!v3d_X(devinfo, tfu_supports_tex_format)(tex_format, for_mipmap)) {
- assert(for_mipmap);
- return false;
- }
-
- v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false);
- v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false);
-
- struct drm_v3d_submit_tfu tfu = {
- .ios = (height << 16) | width,
- .bo_handles = {
- dst->bo->handle,
- src != dst ? src->bo->handle : 0
- },
- .in_sync = v3d->out_sync,
- .out_sync = v3d->out_sync,
- };
- uint32_t src_offset = (src->bo->offset +
- v3d_layer_offset(psrc, src_level, src_layer));
- tfu.iia |= src_offset;
- if (src_base_slice->tiling == V3D_TILING_RASTER) {
- tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER <<
- V3D33_TFU_ICFG_FORMAT_SHIFT);
- } else {
- tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE +
- (src_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
- V3D33_TFU_ICFG_FORMAT_SHIFT);
- }
-
- uint32_t dst_offset = (dst->bo->offset +
- v3d_layer_offset(pdst, base_level, dst_layer));
- tfu.ioa |= dst_offset;
- if (last_level != base_level)
- tfu.ioa |= V3D33_TFU_IOA_DIMTW;
- tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE +
- (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
- V3D33_TFU_IOA_FORMAT_SHIFT);
-
- tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT;
- tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT;
-
- switch (src_base_slice->tiling) {
- case V3D_TILING_UIF_NO_XOR:
- case V3D_TILING_UIF_XOR:
- tfu.iis |= (src_base_slice->padded_height /
- (2 * v3d_utile_height(src->cpp)));
- break;
- case V3D_TILING_RASTER:
- tfu.iis |= src_base_slice->stride / src->cpp;
- break;
- case V3D_TILING_LINEARTILE:
- case V3D_TILING_UBLINEAR_1_COLUMN:
- case V3D_TILING_UBLINEAR_2_COLUMN:
- break;
- }
-
- /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
- * OPAD field for the destination (how many extra UIF blocks beyond
- * those necessary to cover the height). When filling mipmaps, the
- * miplevel 1+ tiling state is inferred.
- */
- if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR ||
- dst_base_slice->tiling == V3D_TILING_UIF_XOR) {
- int uif_block_h = 2 * v3d_utile_height(dst->cpp);
- int implicit_padded_height = align(height, uif_block_h);
-
- tfu.icfg |= (((dst_base_slice->padded_height -
- implicit_padded_height) / uif_block_h) <<
- V3D33_TFU_ICFG_OPAD_SHIFT);
- }
-
- int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu);
- if (ret != 0) {
- fprintf(stderr, "Failed to submit TFU job: %d\n", ret);
- return false;
- }
-
- dst->writes++;
-
- return true;
-}
-
bool
v3d_generate_mipmap(struct pipe_context *pctx,
struct pipe_resource *prsc,
@@ -362,12 +228,16 @@ v3d_generate_mipmap(struct pipe_context *pctx,
if (first_layer != last_layer)
return false;
- return v3d_tfu(pctx,
- prsc, prsc,
- base_level,
- base_level, last_level,
- first_layer, first_layer,
- true);
+ struct v3d_context *v3d = v3d_context(pctx);
+ struct v3d_screen *screen = v3d->screen;
+ struct v3d_device_info *devinfo = &screen->devinfo;
+
+ return v3d_X(devinfo, tfu)(pctx,
+ prsc, prsc,
+ base_level,
+ base_level, last_level,
+ first_layer, first_layer,
+ true);
}
static void
@@ -396,11 +266,15 @@ v3d_tfu_blit(struct pipe_context *pctx, struct pipe_blit_info *info)
if (info->dst.format != info->src.format)
return;
- if (v3d_tfu(pctx, info->dst.resource, info->src.resource,
- info->src.level,
- info->dst.level, info->dst.level,
- info->src.box.z, info->dst.box.z,
- false)) {
+ struct v3d_context *v3d = v3d_context(pctx);
+ struct v3d_screen *screen = v3d->screen;
+ struct v3d_device_info *devinfo = &screen->devinfo;
+
+ if (v3d_X(devinfo, tfu)(pctx, info->dst.resource, info->src.resource,
+ info->src.level,
+ info->dst.level, info->dst.level,
+ info->src.box.z, info->dst.box.z,
+ false)) {
info->mask &= ~PIPE_MASK_RGBA;
}
}
@@ -495,7 +369,7 @@ v3d_tlb_blit(struct pipe_context *pctx, struct pipe_blit_info *info)
bool double_buffer = V3D_DBG(DOUBLE_BUFFER) && !msaa;
uint32_t tile_width, tile_height, max_bpp;
- v3d_get_tile_buffer_size(msaa, double_buffer,
+ v3d_get_tile_buffer_size(devinfo, msaa, double_buffer,
is_color_blit ? 1 : 0, surfaces, src_surf,
&tile_width, &tile_height, &max_bpp);
diff --git a/src/gallium/drivers/v3d/v3d_context.c b/src/gallium/drivers/v3d/v3d_context.c
index f12e8c92139..1dc4bd017fe 100644
--- a/src/gallium/drivers/v3d/v3d_context.c
+++ b/src/gallium/drivers/v3d/v3d_context.c
@@ -220,7 +220,8 @@ v3d_flag_dirty_sampler_state(struct v3d_context *v3d,
}
void
-v3d_get_tile_buffer_size(bool is_msaa,
+v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo,
+ bool is_msaa,
bool double_buffer,
uint32_t nr_cbufs,
struct pipe_surface **cbufs,
@@ -232,11 +233,13 @@ v3d_get_tile_buffer_size(bool is_msaa,
assert(!is_msaa || !double_buffer);
uint32_t max_cbuf_idx = 0;
+ uint32_t total_bpp = 0;
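+        /* total_bpp accumulates the bytes per pixel of every color buffer;
+         * v3d_choose_tile_size() takes it into account when picking tile
+         * dimensions on newer HW.
+         */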
*max_bpp = 0;
for (int i = 0; i < nr_cbufs; i++) {
if (cbufs[i]) {
struct v3d_surface *surf = v3d_surface(cbufs[i]);
*max_bpp = MAX2(*max_bpp, surf->internal_bpp);
+ total_bpp += 4 * v3d_internal_bpp_words(surf->internal_bpp);
max_cbuf_idx = MAX2(i, max_cbuf_idx);
}
}
@@ -245,9 +248,11 @@ v3d_get_tile_buffer_size(bool is_msaa,
struct v3d_surface *bsurf = v3d_surface(bbuf);
assert(bbuf->texture->nr_samples <= 1 || is_msaa);
*max_bpp = MAX2(*max_bpp, bsurf->internal_bpp);
+ total_bpp += 4 * v3d_internal_bpp_words(bsurf->internal_bpp);
}
- v3d_choose_tile_size(max_cbuf_idx + 1, *max_bpp,
+ v3d_choose_tile_size(devinfo, max_cbuf_idx + 1,
+ *max_bpp, total_bpp,
is_msaa, double_buffer,
tile_width, tile_height);
}
diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h
index 97850b0363e..eb184b4b203 100644
--- a/src/gallium/drivers/v3d/v3d_context.h
+++ b/src/gallium/drivers/v3d/v3d_context.h
@@ -265,6 +265,7 @@ struct v3d_vertex_stateobj {
unsigned num_elements;
uint8_t attrs[16 * (V3D_MAX_VS_INPUTS / 4)];
+        /* defaults can be NULL for some hw generations */
struct pipe_resource *defaults;
uint32_t defaults_offset;
};
@@ -794,7 +795,8 @@ void v3d_ensure_prim_counts_allocated(struct v3d_context *ctx);
void v3d_flag_dirty_sampler_state(struct v3d_context *v3d,
enum pipe_shader_type shader);
-void v3d_get_tile_buffer_size(bool is_msaa,
+void v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo,
+ bool is_msaa,
bool double_buffer,
uint32_t nr_cbufs,
struct pipe_surface **cbufs,
@@ -818,16 +820,52 @@ void v3d_disk_cache_store(struct v3d_context *v3d,
/* Helper to call hw ver specific functions */
#define v3d_X(devinfo, thing) ({ \
- __typeof(&v3d42_##thing) v3d_X_thing; \
- if ((devinfo)->ver >= 42) \
- v3d_X_thing = &v3d42_##thing; \
- else if ((devinfo)->ver >= 33) \
+ __typeof(&v3d33_##thing) v3d_X_thing; \
+ switch (devinfo->ver) { \
+ case 33: \
+ case 40: \
v3d_X_thing = &v3d33_##thing; \
- else \
+ break; \
+ case 42: \
+ v3d_X_thing = &v3d42_##thing; \
+ break; \
+ case 71: \
+ v3d_X_thing = &v3d71_##thing; \
+ break; \
+ default: \
unreachable("Unsupported hardware generation"); \
+ } \
v3d_X_thing; \
})
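+
+/* For example (illustrative), v3d_X(devinfo, emit_rcl)(job) resolves to
+ * v3d42_emit_rcl(job) on a V3D 4.2 device.
+ */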
+/* FIXME: the same definitions exist for Vulkan and OpenGL. Find a common
+ * place for them; maybe define them in the v3d_packet files?
+ */
+#define V3D33_CLIPPER_XY_GRANULARITY 256.0f
+#define V3D42_CLIPPER_XY_GRANULARITY 256.0f
+#define V3D71_CLIPPER_XY_GRANULARITY 64.0f
+
+/* Helper to get hw-specific macro values */
+#define V3DV_X(devinfo, thing) ({ \
+ __typeof(V3D33_##thing) V3D_X_THING; \
+ switch (devinfo->ver) { \
+ case 33: \
+ case 40: \
+ V3D_X_THING = V3D33_##thing; \
+ break; \
+ case 41: \
+ case 42: \
+ V3D_X_THING = V3D42_##thing; \
+ break; \
+ case 71: \
+ V3D_X_THING = V3D71_##thing; \
+ break; \
+ default: \
+ unreachable("Unsupported hardware generation"); \
+ } \
+ V3D_X_THING; \
+})
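+
+/* For example (illustrative), V3DV_X(devinfo, CLIPPER_XY_GRANULARITY)
+ * evaluates to 64.0f on a 7.1 device and to 256.0f on 3.3/4.x devices.
+ */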
+
#ifdef v3dX
# include "v3dx_context.h"
#else
@@ -838,6 +876,10 @@ void v3d_disk_cache_store(struct v3d_context *v3d,
# define v3dX(x) v3d42_##x
# include "v3dx_context.h"
# undef v3dX
+
+# define v3dX(x) v3d71_##x
+# include "v3dx_context.h"
+# undef v3dX
#endif
#endif /* V3D_CONTEXT_H */
diff --git a/src/gallium/drivers/v3d/v3d_job.c b/src/gallium/drivers/v3d/v3d_job.c
index b022ed45073..577890a06c3 100644
--- a/src/gallium/drivers/v3d/v3d_job.c
+++ b/src/gallium/drivers/v3d/v3d_job.c
@@ -383,9 +383,11 @@ v3d_get_job_for_fbo(struct v3d_context *v3d)
job->double_buffer = false;
}
- v3d_get_tile_buffer_size(job->msaa, job->double_buffer,
+ v3d_get_tile_buffer_size(&v3d->screen->devinfo,
+ job->msaa, job->double_buffer,
job->nr_cbufs, job->cbufs, job->bbuf,
- &job->tile_width, &job->tile_height,
+ &job->tile_width,
+ &job->tile_height,
&job->internal_bpp);
/* The dirty flags are tracking what's been updated while v3d->job has
diff --git a/src/gallium/drivers/v3d/v3d_query.c b/src/gallium/drivers/v3d/v3d_query.c
index db98c89625f..83f82e44a3d 100644
--- a/src/gallium/drivers/v3d/v3d_query.c
+++ b/src/gallium/drivers/v3d/v3d_query.c
@@ -28,8 +28,11 @@ v3d_get_driver_query_group_info(struct pipe_screen *pscreen, unsigned index,
struct pipe_driver_query_group_info *info)
{
struct v3d_screen *screen = v3d_screen(pscreen);
+ struct v3d_device_info *devinfo = &screen->devinfo;
- return v3d_get_driver_query_group_info_perfcnt(screen, index, info);
+ return v3d_X(devinfo, get_driver_query_group_info_perfcnt)(screen,
+ index,
+ info);
}
int
@@ -37,8 +40,11 @@ v3d_get_driver_query_info(struct pipe_screen *pscreen, unsigned index,
struct pipe_driver_query_info *info)
{
struct v3d_screen *screen = v3d_screen(pscreen);
+ struct v3d_device_info *devinfo = &screen->devinfo;
- return v3d_get_driver_query_info_perfcnt(screen, index, info);
+ return v3d_X(devinfo, get_driver_query_info_perfcnt)(screen,
+ index,
+ info);
}
static struct pipe_query *
@@ -53,9 +59,13 @@ static struct pipe_query *
v3d_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
unsigned *query_types)
{
- return v3d_create_batch_query_perfcnt(v3d_context(pctx),
- num_queries,
- query_types);
+ struct v3d_context *v3d = v3d_context(pctx);
+ struct v3d_screen *screen = v3d->screen;
+ struct v3d_device_info *devinfo = &screen->devinfo;
+
+ return v3d_X(devinfo, create_batch_query_perfcnt)(v3d_context(pctx),
+ num_queries,
+ query_types);
}
static void
diff --git a/src/gallium/drivers/v3d/v3d_query.h b/src/gallium/drivers/v3d/v3d_query.h
index 3e1426b8d86..605ed1a12f9 100644
--- a/src/gallium/drivers/v3d/v3d_query.h
+++ b/src/gallium/drivers/v3d/v3d_query.h
@@ -42,11 +42,5 @@ struct v3d_query
};
struct pipe_query *v3d_create_query_pipe(struct v3d_context *v3d, unsigned query_type, unsigned index);
-struct pipe_query *v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries,
- unsigned *query_types);
-int v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index,
- struct pipe_driver_query_group_info *info);
-int v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index,
- struct pipe_driver_query_info *info);
#endif /* V3D_QUERY_H */
diff --git a/src/gallium/drivers/v3d/v3d_resource.c b/src/gallium/drivers/v3d/v3d_resource.c
index a0a210ccad5..8e31acb0ff0 100644
--- a/src/gallium/drivers/v3d/v3d_resource.c
+++ b/src/gallium/drivers/v3d/v3d_resource.c
@@ -439,7 +439,7 @@ v3d_resource_get_handle(struct pipe_screen *pscreen,
case WINSYS_HANDLE_TYPE_SHARED:
return v3d_bo_flink(bo, &whandle->handle);
case WINSYS_HANDLE_TYPE_KMS:
- if (screen->ro) {
+ if (screen->ro && rsc->scanout) {
if (renderonly_get_handle(rsc->scanout, whandle)) {
whandle->stride = rsc->slices[0].stride;
return true;
@@ -785,6 +785,27 @@ v3d_resource_setup(struct pipe_screen *pscreen,
return rsc;
}
+static bool
+v3d_resource_should_scanout(struct pipe_screen *pscreen,
+ const struct pipe_resource *tmpl,
+ const uint64_t *modifiers,
+ int count)
+{
+ struct v3d_screen *screen = v3d_screen(pscreen);
+
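+        /* Honor PIPE_BIND_SCANOUT unless we are in an X session, the caller
+         * supplied a UIF modifier and the user asked to ignore scanout
+         * usages; in that case we prefer the tiled (UIF) layout.
+         */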
+ if (tmpl->bind & PIPE_BIND_SCANOUT) {
+ if (screen->maintain_ignorable_scanout)
+ return true;
+ if (screen->has_x_session && screen->ignore_scanout_usages) {
+ if (drm_find_modifier(DRM_FORMAT_MOD_BROADCOM_UIF,
+ modifiers, count))
+ return false;
+ }
+ return true;
+ }
+ return false;
+}
+
static struct pipe_resource *
v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
const struct pipe_resource *tmpl,
@@ -798,6 +819,8 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
struct pipe_resource *prsc = &rsc->base;
/* Use a tiled layout if we can, for better 3D performance. */
bool should_tile = true;
+ bool should_scanout = v3d_resource_should_scanout(pscreen, tmpl,
+ modifiers, count);
assert(tmpl->target != PIPE_BUFFER ||
(tmpl->format == PIPE_FORMAT_NONE ||
@@ -827,7 +850,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
/* If using the old-school SCANOUT flag, we don't know what the screen
* might support other than linear. Just force linear.
*/
- if (tmpl->bind & PIPE_BIND_SCANOUT)
+ if ((tmpl->bind & PIPE_BIND_SCANOUT) && should_scanout)
should_tile = false;
/* No user-specified modifier; determine our own. */
@@ -849,7 +872,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
v3d_setup_slices(rsc, 0, tmpl->bind & PIPE_BIND_SHARED);
- if (screen->ro && (tmpl->bind & PIPE_BIND_SCANOUT)) {
+ if (screen->ro && should_scanout) {
struct winsys_handle handle;
struct pipe_resource scanout_tmpl = {
.target = prsc->target,
@@ -979,7 +1002,7 @@ v3d_resource_from_handle(struct pipe_screen *pscreen,
}
}
- if (screen->ro) {
+ if (screen->ro && !rsc->tiled) {
/* Make sure that renderonly has a handle to our buffer in the
* display's fd, so that a later renderonly_get_handle()
* returns correct handles or GEM names.
@@ -1025,7 +1048,9 @@ v3d_update_shadow_texture(struct pipe_context *pctx,
assert(view->texture != pview->texture);
- if (shadow->writes == orig->writes && orig->bo->private)
+ if (shadow->writes == orig->writes &&
+ orig->base.sync_status == 0 &&
+ (orig->bo->private || orig->base.sync_condition))
return;
perf_debug("Updating %dx%d@%d shadow for linear texture\n",
@@ -1068,6 +1093,7 @@ v3d_update_shadow_texture(struct pipe_context *pctx,
}
shadow->writes = orig->writes;
+ orig->base.sync_status = 0;
}
static struct pipe_surface *
diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c
index bce1eeafcd9..4d2478b130d 100644
--- a/src/gallium/drivers/v3d/v3d_screen.c
+++ b/src/gallium/drivers/v3d/v3d_screen.c
@@ -47,6 +47,42 @@
#include "compiler/v3d_compiler.h"
#include "drm-uapi/drm_fourcc.h"
+#ifdef HAVE_WAYLAND_PLATFORM
+#include <wayland-client.h>
+#endif
+
+#ifdef HAVE_X11_PLATFORM
+#include <xcb/xcb.h>
+#endif
+
+static bool
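+/* Heuristic: if a Wayland display is reachable we assume the session is not
+ * an X session (this also covers XWayland); otherwise we probe for an X
+ * server via XCB.
+ */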
+check_x_session()
+{
+ bool xcb_connection = false;
+
+#ifdef HAVE_WAYLAND_PLATFORM
+ struct wl_display *display;
+
+ display = wl_display_connect(NULL);
+
+ if (display) {
+ wl_display_disconnect(display);
+ return xcb_connection;
+ }
+#endif
+
+#ifdef HAVE_X11_PLATFORM
+ xcb_connection_t *conn;
+
+ conn = xcb_connect(NULL, NULL);
+
+ if (!xcb_connection_has_error(conn))
+ xcb_connection = true;
+ xcb_disconnect(conn);
+#endif
+ return xcb_connection;
+}
+
static const char *
v3d_screen_get_name(struct pipe_screen *pscreen)
{
@@ -255,9 +291,8 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
return V3D_MAX_ARRAY_LAYERS;
- /* Render targets. */
case PIPE_CAP_MAX_RENDER_TARGETS:
- return 4;
+ return V3D_MAX_RENDER_TARGETS(screen->devinfo.ver);
case PIPE_CAP_VENDOR_ID:
return 0x14E4;
@@ -919,6 +954,12 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config,
if (!v3d_get_device_info(screen->fd, &screen->devinfo, &v3d_ioctl))
goto fail;
+ if (screen->devinfo.ver >= 71) {
+ fprintf(stderr, "WARNING: v3d support for hw version %i is neither "
+ "a complete nor a conformant OpenGL implementation. Testing "
+ "use only.\n", screen->devinfo.ver);
+ }
+
driParseConfigFiles(config->options, config->options_info, 0, "v3d",
NULL, NULL, NULL, 0, NULL, 0);
@@ -937,6 +978,29 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config,
v3d_has_feature(screen, DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH);
screen->has_perfmon = v3d_has_feature(screen, DRM_V3D_PARAM_SUPPORTS_PERFMON);
+ screen->ignore_scanout_usages = getenv("V3D_IGNORE_SCANOUT_USAGES");
+
+ const char *is_xserver_process =
+ "v3d_is_xserver_process";
+ screen->is_xserver_process =
+ driCheckOption(config->options,
+ is_xserver_process,
+ DRI_BOOL) &&
+ driQueryOptionb(config->options,
+ is_xserver_process);
+
+ const char *maintain_ignorable_scanout_name =
+ "v3d_maintain_ignorable_scanout";
+ screen->maintain_ignorable_scanout =
+ driCheckOption(config->options,
+ maintain_ignorable_scanout_name,
+ DRI_BOOL) &&
+ driQueryOptionb(config->options,
+ maintain_ignorable_scanout_name);
+
+ screen->has_x_session = !screen->is_xserver_process &&
+ check_x_session();
+
v3d_fence_init(screen);
v3d_process_debug_variable();
diff --git a/src/gallium/drivers/v3d/v3d_screen.h b/src/gallium/drivers/v3d/v3d_screen.h
index 1da9b83c965..c0f22707075 100644
--- a/src/gallium/drivers/v3d/v3d_screen.h
+++ b/src/gallium/drivers/v3d/v3d_screen.h
@@ -83,6 +83,12 @@ struct v3d_screen {
bool has_cache_flush;
bool has_perfmon;
bool nonmsaa_texture_size_limit;
+ bool ignore_scanout_usages;
+ bool is_xserver_process;
+ bool maintain_ignorable_scanout;
+
+ /* Are we running in an X session? */
+ bool has_x_session;
struct v3d_simulator_file *sim_file;
diff --git a/src/gallium/drivers/v3d/v3d_uniforms.c b/src/gallium/drivers/v3d/v3d_uniforms.c
index 95eb838954f..1b8758bae7d 100644
--- a/src/gallium/drivers/v3d/v3d_uniforms.c
+++ b/src/gallium/drivers/v3d/v3d_uniforms.c
@@ -261,6 +261,7 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job,
struct v3d_compiled_shader *shader,
enum pipe_shader_type stage)
{
+ struct v3d_device_info *devinfo = &v3d->screen->devinfo;
struct v3d_constbuf_stateobj *cb = &v3d->constbuf[stage];
struct v3d_texture_stateobj *texstate = &v3d->tex[stage];
struct v3d_uniform_list *uinfo = &shader->prog_data.base->uniforms;
@@ -282,6 +283,9 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job,
struct v3d_cl_out *uniforms =
cl_start(&job->indirect);
+ float clipper_xy_granularity =
+ V3DV_X(devinfo, CLIPPER_XY_GRANULARITY);
+
for (int i = 0; i < uinfo->count; i++) {
uint32_t data = uinfo->data[i];
@@ -293,10 +297,10 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job,
cl_aligned_u32(&uniforms, gallium_uniforms[data]);
break;
case QUNIFORM_VIEWPORT_X_SCALE:
- cl_aligned_f(&uniforms, v3d->viewport.scale[0] * 256.0f);
+ cl_aligned_f(&uniforms, v3d->viewport.scale[0] * clipper_xy_granularity);
break;
case QUNIFORM_VIEWPORT_Y_SCALE:
- cl_aligned_f(&uniforms, v3d->viewport.scale[1] * 256.0f);
+ cl_aligned_f(&uniforms, v3d->viewport.scale[1] * clipper_xy_granularity);
break;
case QUNIFORM_VIEWPORT_Z_OFFSET:
diff --git a/src/gallium/drivers/v3d/v3dx_context.h b/src/gallium/drivers/v3d/v3dx_context.h
index 03d7c244ea2..c487ac3b996 100644
--- a/src/gallium/drivers/v3d/v3dx_context.h
+++ b/src/gallium/drivers/v3d/v3dx_context.h
@@ -51,3 +51,23 @@ void v3dX(get_internal_type_bpp_for_output_format)(uint32_t format,
*/
bool v3dX(tfu_supports_tex_format)(uint32_t tex_format,
bool for_mipmap);
+
+bool v3dX(tfu)(struct pipe_context *pctx,
+ struct pipe_resource *pdst,
+ struct pipe_resource *psrc,
+ unsigned int src_level,
+ unsigned int base_level,
+ unsigned int last_level,
+ unsigned int src_layer,
+ unsigned int dst_layer,
+ bool for_mipmap);
+
+int v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen,
+ unsigned index,
+ struct pipe_driver_query_group_info *info);
+int v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen,
+ unsigned index,
+ struct pipe_driver_query_info *info);
+struct pipe_query *v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d,
+ unsigned num_queries,
+ unsigned *query_types);
diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
index 0640dab1884..85083035ea6 100644
--- a/src/gallium/drivers/v3d/v3dx_draw.c
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -95,7 +95,25 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job)
#endif
assert(!job->msaa || !job->double_buffer);
-#if V3D_VERSION >= 40
+#if V3D_VERSION >= 71
+ cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
+ config.width_in_pixels = job->draw_width;
+ config.height_in_pixels = job->draw_height;
+
+ config.log2_tile_width = log2_tile_size(job->tile_width);
+ config.log2_tile_height = log2_tile_size(job->tile_height);
+
+                /* FIXME: ideally we would like the next assert to be on the
+                 * packet header (it is generic, so it also applies to GL).
+                 * We would need to expand gen_pack_header for that.
+                 */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+ }
+
+#endif
+
+#if V3D_VERSION >= 40 && V3D_VERSION <= 42
cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
config.width_in_pixels = job->draw_width;
config.height_in_pixels = job->draw_height;
@@ -107,7 +125,8 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job)
config.maximum_bpp_of_all_render_targets = job->internal_bpp;
}
-#else /* V3D_VERSION < 40 */
+#endif
+#if V3D_VERSION < 40
/* "Binning mode lists start with a Tile Binning Mode Configuration
* item (120)"
*
@@ -134,7 +153,7 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job)
config.maximum_bpp_of_all_render_targets = job->internal_bpp;
}
-#endif /* V3D_VERSION < 40 */
+#endif
/* There's definitely nothing in the VCD cache we want. */
cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
@@ -377,7 +396,9 @@ v3d_emit_gs_state_record(struct v3d_job *job,
gs_bin->prog_data.gs->base.threads == 4;
shader.geometry_bin_mode_shader_start_in_final_thread_section =
gs_bin->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
shader.geometry_bin_mode_shader_propagate_nans = true;
+#endif
shader.geometry_bin_mode_shader_uniforms_address =
gs_bin_uniforms;
@@ -387,7 +408,9 @@ v3d_emit_gs_state_record(struct v3d_job *job,
gs->prog_data.gs->base.threads == 4;
shader.geometry_render_mode_shader_start_in_final_thread_section =
gs->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
shader.geometry_render_mode_shader_propagate_nans = true;
+#endif
shader.geometry_render_mode_shader_uniforms_address =
gs_render_uniforms;
}
@@ -638,10 +661,6 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
shader.number_of_varyings_in_fragment_shader =
v3d->prog.fs->prog_data.fs->num_inputs;
- shader.coordinate_shader_propagate_nans = true;
- shader.vertex_shader_propagate_nans = true;
- shader.fragment_shader_propagate_nans = true;
-
shader.coordinate_shader_code_address =
cl_address(v3d_resource(v3d->prog.cs->resource)->bo,
v3d->prog.cs->offset);
@@ -652,6 +671,11 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
cl_address(v3d_resource(v3d->prog.fs->resource)->bo,
v3d->prog.fs->offset);
+#if V3D_VERSION <= 42
+ shader.coordinate_shader_propagate_nans = true;
+ shader.vertex_shader_propagate_nans = true;
+ shader.fragment_shader_propagate_nans = true;
+
/* XXX: Use combined input/output size flag in the common
* case.
*/
@@ -659,13 +683,24 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
v3d->prog.cs->prog_data.vs->separate_segments;
shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
v3d->prog.vs->prog_data.vs->separate_segments;
-
shader.coordinate_shader_input_vpm_segment_size =
v3d->prog.cs->prog_data.vs->separate_segments ?
v3d->prog.cs->prog_data.vs->vpm_input_size : 1;
shader.vertex_shader_input_vpm_segment_size =
v3d->prog.vs->prog_data.vs->separate_segments ?
v3d->prog.vs->prog_data.vs->vpm_input_size : 1;
+#endif
+        /* On V3D 7.1 there isn't a specific flag to select whether we are
+         * using shared or separate input/output segments. We just set
+         * vpm_input_size to 0 and set the output size to the maximum needed;
+         * that should already be properly set up on prog_data_vs_bin.
+         */
+#if V3D_VERSION == 71
+ shader.coordinate_shader_input_vpm_segment_size =
+ v3d->prog.cs->prog_data.vs->vpm_input_size;
+ shader.vertex_shader_input_vpm_segment_size =
+ v3d->prog.vs->prog_data.vs->vpm_input_size;
+#endif
shader.coordinate_shader_output_vpm_segment_size =
v3d->prog.cs->prog_data.vs->vpm_output_size;
@@ -724,9 +759,11 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
shader.instance_id_read_by_vertex_shader =
v3d->prog.vs->prog_data.vs->uses_iid;
+#if V3D_VERSION <= 42
shader.address_of_default_attribute_values =
cl_address(v3d_resource(vtx->defaults)->bo,
vtx->defaults_offset);
+#endif
}
bool cs_loaded_any = false;
@@ -1436,8 +1473,15 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info)
submit.cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
- /* Number of batches the dispatch will invoke (minus 1). */
- submit.cfg[4] = num_batches - 1;
+ /* Number of batches the dispatch will invoke.
+ * V3D 7.1.6 and later don't subtract 1 from the number of batches
+ */
+ if (v3d->screen->devinfo.ver < 71 ||
+ (v3d->screen->devinfo.ver == 71 && v3d->screen->devinfo.rev < 6)) {
+ submit.cfg[4] = num_batches - 1;
+ } else {
+ submit.cfg[4] = num_batches;
+ }
/* Make sure we didn't accidentally underflow. */
assert(submit.cfg[4] != ~0);
@@ -1445,7 +1489,8 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info)
v3d_job_add_bo(job, v3d_resource(v3d->prog.compute->resource)->bo);
submit.cfg[5] = (v3d_resource(v3d->prog.compute->resource)->bo->offset +
v3d->prog.compute->offset);
- submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
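+        /* On V3D 7.x NaN propagation is the default and the CFG5 bit is
+         * gone. */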
+ if (v3d->screen->devinfo.ver < 71)
+ submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
if (v3d->prog.compute->prog_data.base->single_seg)
submit.cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
if (v3d->prog.compute->prog_data.base->threads == 4)
@@ -1560,9 +1605,10 @@ v3d_tlb_clear(struct v3d_job *job, unsigned buffers,
/* GFXH-1461: If we were to emit a load of just depth or just stencil,
* then the clear for the other may get lost. We need to decide now
* if it would be possible to need to emit a load of just one after
- * we've set up our TLB clears.
+ * we've set up our TLB clears. This issue is fixed since V3D 4.3.18.
*/
- if (buffers & PIPE_CLEAR_DEPTHSTENCIL &&
+ if (v3d->screen->devinfo.ver <= 42 &&
+ buffers & PIPE_CLEAR_DEPTHSTENCIL &&
(buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL &&
job->zsbuf &&
util_format_is_depth_and_stencil(job->zsbuf->texture->format)) {
diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c
index 0ad3fb68b1e..82a45e44f82 100644
--- a/src/gallium/drivers/v3d/v3dx_emit.c
+++ b/src/gallium/drivers/v3d/v3dx_emit.c
@@ -512,13 +512,17 @@ v3dX(emit_state)(struct pipe_context *pctx)
/* Note: EZ state may update based on the compiled FS,
* along with ZSA
*/
+#if V3D_VERSION <= 42
config.early_z_updates_enable =
(job->ez_state != V3D_EZ_DISABLED);
+#endif
if (v3d->zsa->base.depth_enabled) {
config.z_updates_enable =
v3d->zsa->base.depth_writemask;
+#if V3D_VERSION <= 42
config.early_z_enable =
config.early_z_updates_enable;
+#endif
config.depth_test_function =
v3d->zsa->base.depth_func;
} else {
@@ -535,13 +539,27 @@ v3dX(emit_state)(struct pipe_context *pctx)
v3d_line_smoothing_enabled(v3d) ?
V3D_LINE_RASTERIZATION_PERP_END_CAPS :
V3D_LINE_RASTERIZATION_DIAMOND_EXIT;
- }
+#if V3D_VERSION >= 71
+                        /* The following follows the logic implemented in
+                         * v3dv, plus the definitions of depth_clip_near/far
+                         * and depth_clamp.
+                         *
+                         * Note: v3d doesn't support some extensions that
+                         * would affect this (like ARB_depth_clamp), but the
+                         * rasterizer values already take that into account.
+                         */
+ config.z_clipping_mode = v3d->rasterizer->base.depth_clip_near ||
+ v3d->rasterizer->base.depth_clip_far;
+#endif
+ }
}
if (v3d->dirty & V3D_DIRTY_RASTERIZER &&
v3d->rasterizer->base.offset_tri) {
- if (job->zsbuf &&
+ if (v3d->screen->devinfo.ver <= 42 &&
+ job->zsbuf &&
job->zsbuf->format == PIPE_FORMAT_Z16_UNORM) {
cl_emit_prepacked_sized(&job->bcl,
v3d->rasterizer->depth_offset_z16,
@@ -564,12 +582,23 @@ v3dX(emit_state)(struct pipe_context *pctx)
}
if (v3d->dirty & V3D_DIRTY_VIEWPORT) {
+#if V3D_VERSION <= 42
cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
clip.viewport_half_width_in_1_256th_of_pixel =
v3d->viewport.scale[0] * 256.0f;
clip.viewport_half_height_in_1_256th_of_pixel =
v3d->viewport.scale[1] * 256.0f;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+ clip.viewport_half_width_in_1_64th_of_pixel =
+ v3d->viewport.scale[0] * 64.0f;
+ clip.viewport_half_height_in_1_64th_of_pixel =
+ v3d->viewport.scale[1] * 64.0f;
+ }
+#endif
+
cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
clip.viewport_z_offset_zc_to_zs =
@@ -633,8 +662,10 @@ v3dX(emit_state)(struct pipe_context *pctx)
}
#endif
+ const uint32_t max_rts =
+ V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver);
if (blend->base.independent_blend_enable) {
- for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
+ for (int i = 0; i < max_rts; i++)
emit_rt_blend(v3d, job, &blend->base, i,
(1 << i),
v3d->blend_dst_alpha_one & (1 << i));
@@ -650,16 +681,16 @@ v3dX(emit_state)(struct pipe_context *pctx)
* RTs without.
*/
emit_rt_blend(v3d, job, &blend->base, 0,
- ((1 << V3D_MAX_DRAW_BUFFERS) - 1) &
+ ((1 << max_rts) - 1) &
v3d->blend_dst_alpha_one,
true);
emit_rt_blend(v3d, job, &blend->base, 0,
- ((1 << V3D_MAX_DRAW_BUFFERS) - 1) &
+ ((1 << max_rts) - 1) &
~v3d->blend_dst_alpha_one,
false);
} else {
emit_rt_blend(v3d, job, &blend->base, 0,
- (1 << V3D_MAX_DRAW_BUFFERS) - 1,
+ (1 << max_rts) - 1,
v3d->blend_dst_alpha_one);
}
}
@@ -668,8 +699,10 @@ v3dX(emit_state)(struct pipe_context *pctx)
if (v3d->dirty & V3D_DIRTY_BLEND) {
struct pipe_blend_state *blend = &v3d->blend->base;
+ const uint32_t max_rts =
+ V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver);
cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
- for (int i = 0; i < 4; i++) {
+ for (int i = 0; i < max_rts; i++) {
int rt = blend->independent_blend_enable ? i : 0;
int rt_mask = blend->rt[rt].colormask;
diff --git a/src/gallium/drivers/v3d/v3d_query_perfcnt.c b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c
similarity index 94%
rename from src/gallium/drivers/v3d/v3d_query_perfcnt.c
rename to src/gallium/drivers/v3d/v3dx_query_perfcnt.c
index e00d84e375f..431aad14b4f 100644
--- a/src/gallium/drivers/v3d/v3d_query_perfcnt.c
+++ b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c
@@ -52,8 +52,8 @@ kperfmon_destroy(struct v3d_context *v3d, struct v3d_perfmon_state *perfmon)
}
int
-v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index,
- struct pipe_driver_query_group_info *info)
+v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen, unsigned index,
+ struct pipe_driver_query_group_info *info)
{
if (!screen->has_perfmon)
return 0;
@@ -72,8 +72,8 @@ v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned inde
}
int
-v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index,
- struct pipe_driver_query_info *info)
+v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen, unsigned index,
+ struct pipe_driver_query_info *info)
{
if (!screen->has_perfmon)
return 0;
@@ -222,8 +222,8 @@ static const struct v3d_query_funcs perfcnt_query_funcs = {
};
struct pipe_query *
-v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries,
- unsigned *query_types)
+v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d, unsigned num_queries,
+ unsigned *query_types)
{
struct v3d_query_perfcnt *pquery = NULL;
struct v3d_query *query;
diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c
index 82547437c25..d3fbc9aff5d 100644
--- a/src/gallium/drivers/v3d/v3dx_rcl.c
+++ b/src/gallium/drivers/v3d/v3dx_rcl.c
@@ -23,8 +23,9 @@
#include "util/format/u_format.h"
#include "v3d_context.h"
-#include "broadcom/common/v3d_tiling.h"
#include "broadcom/common/v3d_macros.h"
+#include "broadcom/common/v3d_tiling.h"
+#include "broadcom/common/v3d_util.h"
#include "broadcom/cle/v3dx_pack.h"
#define PIPE_CLEAR_COLOR_BUFFERS (PIPE_CLEAR_COLOR0 | \
@@ -419,10 +420,16 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer)
* clearing Z/S.
*/
if (job->clear) {
+#if V3D_VERSION <= 42
cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = !job->early_zs_clear;
clear.clear_all_render_targets = true;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
+#endif
+
}
#endif /* V3D_VERSION >= 40 */
}
@@ -483,10 +490,64 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer)
}
}
-#if V3D_VERSION >= 40
+#if V3D_VERSION > 33
+/* Note that for v71, render target cfg packets have just one field that
+ * combines the internal type and clamp mode. For simplicity we keep just one
+ * helper.
+ *
+ * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
+ */
+static uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ enum pipe_format format)
+{
+#if V3D_VERSION == 42
+ if (util_format_is_pure_integer(format)) {
+ return V3D_RENDER_TARGET_CLAMP_INT;
+ } else if (util_format_is_srgb(format)) {
+ return V3D_RENDER_TARGET_CLAMP_NORM;
+ } else {
+ return V3D_RENDER_TARGET_CLAMP_NONE;
+ }
+#endif
+#if V3D_VERSION >= 71
+ switch (rt_type) {
+ case V3D_INTERNAL_TYPE_8I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
+ case V3D_INTERNAL_TYPE_8UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_8:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8;
+ case V3D_INTERNAL_TYPE_16I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
+ case V3D_INTERNAL_TYPE_16UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_16F:
+ return util_format_is_srgb(format) ?
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F;
+ case V3D_INTERNAL_TYPE_32I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
+ case V3D_INTERNAL_TYPE_32UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_32F:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
+ default:
+ unreachable("Unknown internal render target type");
+ }
+ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
+#endif
+ return 0;
+}
+#endif
+
+#if V3D_VERSION >= 71
static void
-v3d_setup_render_target(struct v3d_job *job, int cbuf,
- uint32_t *rt_bpp, uint32_t *rt_type, uint32_t *rt_clamp)
+v3d_setup_render_target(struct v3d_job *job,
+ int cbuf,
+ uint32_t *rt_bpp,
+ uint32_t *rt_type_clamp)
{
if (!job->cbufs[cbuf])
return;
@@ -497,19 +558,35 @@ v3d_setup_render_target(struct v3d_job *job, int cbuf,
struct v3d_surface *bsurf = v3d_surface(job->bbuf);
*rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp);
}
- *rt_type = surf->internal_type;
- if (util_format_is_srgb(surf->base.format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
-#if V3D_VERSION >= 42
- else if (util_format_is_pure_integer(surf->base.format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
-#endif
- else
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
+ *rt_type_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type,
+ surf->base.format);
}
+#endif
-#else /* V3D_VERSION < 40 */
+#if V3D_VERSION >= 40 && V3D_VERSION <= 42
+static void
+v3d_setup_render_target(struct v3d_job *job,
+ int cbuf,
+ uint32_t *rt_bpp,
+ uint32_t *rt_type,
+ uint32_t *rt_clamp)
+{
+ if (!job->cbufs[cbuf])
+ return;
+
+ struct v3d_surface *surf = v3d_surface(job->cbufs[cbuf]);
+ *rt_bpp = surf->internal_bpp;
+ if (job->bbuf) {
+ struct v3d_surface *bsurf = v3d_surface(job->bbuf);
+ *rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp);
+ }
+ *rt_type = surf->internal_type;
+ *rt_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type,
+ surf->base.format);
+}
+#endif
+#if V3D_VERSION < 40
static void
v3d_emit_z_stencil_config(struct v3d_job *job, struct v3d_surface *surf,
struct v3d_resource *rsc, bool is_separate_stencil)
@@ -656,7 +733,8 @@ emit_render_layer(struct v3d_job *job, uint32_t layer)
cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = NONE;
}
-#else
+#endif
+#if V3D_VERSION >= 40
for (int i = 0; i < 2; i++) {
if (i > 0)
cl_emit(&job->rcl, TILE_COORDINATES, coords);
@@ -664,16 +742,20 @@ emit_render_layer(struct v3d_job *job, uint32_t layer)
cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = NONE;
}
+
if (i == 0 || do_double_initial_tile_clear(job)) {
+#if V3D_VERSION < 71
cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = !job->early_zs_clear;
clear.clear_all_render_targets = true;
}
+#else
+ cl_emit(&job->rcl, CLEAR_RENDER_TARGETS, clear);
+#endif
}
cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
}
#endif
-
cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
v3d_rcl_emit_generic_per_tile_list(job, layer);
@@ -775,18 +857,52 @@ v3dX(emit_rcl)(struct v3d_job *job)
config.multisample_mode_4x = job->msaa;
config.double_buffer_in_non_ms_mode = job->double_buffer;
+#if V3D_VERSION <= 42
config.maximum_bpp_of_all_render_targets = job->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(job->tile_width);
+ config.log2_tile_height = log2_tile_size(job->tile_height);
+
+        /* FIXME: ideally we would like the next assert to be on the packet
+         * header (it is generic, so it also applies to GL). We would need to
+         * expand gen_pack_header for that.
+         */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
+
}
+#if V3D_VERSION >= 71
+ uint32_t base_addr = 0;
+
+        /* If we don't have any color RTs, we still need to emit one and flag
+         * it as unused with stride = 1.
+         */
+ if (job->nr_cbufs == 0) {
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.stride = 1; /* Unused */
+ }
+ }
+#endif
for (int i = 0; i < job->nr_cbufs; i++) {
struct pipe_surface *psurf = job->cbufs[i];
- if (!psurf)
+ if (!psurf) {
+#if V3D_VERSION >= 71
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.render_target_number = i;
+ rt.stride = 1; /* Unused */
+ }
+#endif
continue;
+ }
+
struct v3d_surface *surf = v3d_surface(psurf);
struct v3d_resource *rsc = v3d_resource(psurf->texture);
UNUSED uint32_t config_pad = 0;
- uint32_t clear_pad = 0;
+ UNUSED uint32_t clear_pad = 0;
/* XXX: Set the pad for raster. */
if (surf->tiling == V3D_TILING_UIF_NO_XOR ||
@@ -819,6 +935,7 @@ v3dX(emit_rcl)(struct v3d_job *job)
}
#endif /* V3D_VERSION < 40 */
+#if V3D_VERSION <= 42
cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1,
clear) {
clear.clear_color_low_32_bits = job->clear_color[i][0];
@@ -847,9 +964,42 @@ v3dX(emit_rcl)(struct v3d_job *job)
clear.render_target_number = i;
};
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.clear_color_low_bits = job->clear_color[i][0];
+ v3d_setup_render_target(job, i, &rt.internal_bpp,
+ &rt.internal_type_and_clamping);
+ rt.stride =
+ v3d_compute_rt_row_row_stride_128_bits(job->tile_width,
+ v3d_internal_bpp_words(rt.internal_bpp));
+ rt.base_address = base_addr;
+ rt.render_target_number = i;
+
+ base_addr += (job->tile_height * rt.stride) / 8;
+ }
+
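+                /* The 128-bit clear color is split across up to three config
+                 * packets: the low 32 bits go in PART1, the next 40 bits in
+                 * PART2 (64 bpp and larger) and the top 56 bits in PART3
+                 * (128 bpp only).
+                 */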
+ if (surf->internal_bpp >= V3D_INTERNAL_BPP_64) {
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+ ((uint64_t) job->clear_color[i][1]) |
+ (((uint64_t) (job->clear_color[i][2] & 0xff)) << 32);
+ rt.render_target_number = i;
+ }
+ }
+
+ if (surf->internal_bpp >= V3D_INTERNAL_BPP_128) {
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+ rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+ (((uint64_t) (job->clear_color[i][2] & 0xffffff00)) >> 8) |
+ (((uint64_t) (job->clear_color[i][3])) << 24);
+ rt.render_target_number = i;
+ }
+ }
+#endif
}
-#if V3D_VERSION >= 40
+#if V3D_VERSION >= 40 && V3D_VERSION <= 42
cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
v3d_setup_render_target(job, 0,
&rt.render_target_0_internal_bpp,
diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
index 0f1735fee66..a7fad572a2d 100644
--- a/src/gallium/drivers/v3d/v3dx_state.c
+++ b/src/gallium/drivers/v3d/v3dx_state.c
@@ -111,9 +111,10 @@ v3d_create_rasterizer_state(struct pipe_context *pctx,
#endif
}
- /* The HW treats polygon offset units based on a Z24 buffer, so we
+        /* V3D 4.x treats polygon offset units based on a Z24 buffer, so we
* need to scale up offset_units if we're only Z16.
*/
+#if V3D_VERSION <= 42
v3dx_pack(&so->depth_offset_z16, DEPTH_OFFSET, depth) {
depth.depth_offset_factor = cso->offset_scale;
depth.depth_offset_units = cso->offset_units * 256.0;
@@ -121,6 +122,7 @@ v3d_create_rasterizer_state(struct pipe_context *pctx,
depth.limit = cso->offset_clamp;
#endif
}
+#endif
return so;
}
@@ -138,8 +140,9 @@ v3d_create_blend_state(struct pipe_context *pctx,
so->base = *cso;
+ uint32_t max_rts = V3D_MAX_RENDER_TARGETS(V3D_VERSION);
if (cso->independent_blend_enable) {
- for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
+ for (int i = 0; i < max_rts; i++) {
so->blend_enables |= cso->rt[i].blend_enable << i;
/* V3D 4.x is when we got independent blend enables. */
@@ -148,7 +151,7 @@ v3d_create_blend_state(struct pipe_context *pctx,
}
} else {
if (cso->rt[0].blend_enable)
- so->blend_enables = (1 << V3D_MAX_DRAW_BUFFERS) - 1;
+ so->blend_enables = (1 << max_rts) - 1;
}
return so;
@@ -337,6 +340,20 @@ v3d_zsa_state_bind(struct pipe_context *pctx, void *hwcso)
v3d->dirty |= V3D_DIRTY_ZSA;
}
+
+static bool
+needs_default_attribute_values(void)
+{
+#if V3D_VERSION <= 42
+        /* FIXME: on Vulkan we are able to refine this even further, as we
+         * know at pipeline creation time whether we have an integer vertex
+         * attrib. We should check whether we could do something similar here.
+         */
+ return true;
+#endif
+ return false;
+}
+
static void *
v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements,
const struct pipe_vertex_element *elements)
@@ -414,24 +431,29 @@ v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements,
}
}
- /* Set up the default attribute values in case any of the vertex
- * elements use them.
- */
- uint32_t *attrs;
- u_upload_alloc(v3d->state_uploader, 0,
- V3D_MAX_VS_INPUTS * sizeof(float), 16,
- &so->defaults_offset, &so->defaults, (void **)&attrs);
-
- for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) {
- attrs[i * 4 + 0] = 0;
- attrs[i * 4 + 1] = 0;
- attrs[i * 4 + 2] = 0;
- if (i < so->num_elements &&
- util_format_is_pure_integer(so->pipe[i].src_format)) {
- attrs[i * 4 + 3] = 1;
- } else {
- attrs[i * 4 + 3] = fui(1.0);
+ if (needs_default_attribute_values()) {
+ /* Set up the default attribute values in case any of the vertex
+ * elements use them.
+ */
+ uint32_t *attrs;
+ u_upload_alloc(v3d->state_uploader, 0,
+ V3D_MAX_VS_INPUTS * sizeof(float), 16,
+ &so->defaults_offset, &so->defaults, (void **)&attrs);
+
+ for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) {
+ attrs[i * 4 + 0] = 0;
+ attrs[i * 4 + 1] = 0;
+ attrs[i * 4 + 2] = 0;
+ if (i < so->num_elements &&
+ util_format_is_pure_integer(so->pipe[i].src_format)) {
+ attrs[i * 4 + 3] = 1;
+ } else {
+ attrs[i * 4 + 3] = fui(1.0);
+ }
}
+ } else {
+ so->defaults = NULL;
+ so->defaults_offset = 0;
}
u_upload_unmap(v3d->state_uploader);
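For reference, each default attribute slot written in the loop above is the vec4 (0, 0, 0, 1); only the representation of the trailing 1 depends on whether the vertex format is pure-integer. A standalone sketch (assumes Mesa's fui() from util/u_math.h, which returns the bit pattern of a float):

#include <stdbool.h>
#include <stdint.h>
#include "util/u_math.h" /* fui() */

/* Write one default attribute as (0, 0, 0, 1): an integer 1 for
 * pure-integer vertex formats, the IEEE-754 bits of 1.0f otherwise.
 */
static void
write_default_attrib(uint32_t attr[4], bool is_pure_integer)
{
        attr[0] = attr[1] = attr[2] = 0;
        attr[3] = is_pure_integer ? 1 : fui(1.0f);
}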
@@ -699,21 +721,22 @@ v3d_upload_sampler_state_variant(void *map,
break;
}
- if (variant >= V3D_SAMPLER_STATE_32) {
- sampler.border_color_word_0 = border.ui[0];
- sampler.border_color_word_1 = border.ui[1];
- sampler.border_color_word_2 = border.ui[2];
- sampler.border_color_word_3 = border.ui[3];
- } else {
- sampler.border_color_word_0 =
- _mesa_float_to_half(border.f[0]);
- sampler.border_color_word_1 =
- _mesa_float_to_half(border.f[1]);
- sampler.border_color_word_2 =
- _mesa_float_to_half(border.f[2]);
- sampler.border_color_word_3 =
- _mesa_float_to_half(border.f[3]);
+#if V3D_VERSION <= 42
+ /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions
+ * for us. In V3D 4.x we need to manually convert floating point color
+ * values to the expected format.
+ */
+ if (variant < V3D_SAMPLER_STATE_32) {
+ border.ui[0] = _mesa_float_to_half(border.f[0]);
+ border.ui[1] = _mesa_float_to_half(border.f[1]);
+ border.ui[2] = _mesa_float_to_half(border.f[2]);
+ border.ui[3] = _mesa_float_to_half(border.f[3]);
}
+#endif
+ sampler.border_color_word_0 = border.ui[0];
+ sampler.border_color_word_1 = border.ui[1];
+ sampler.border_color_word_2 = border.ui[2];
+ sampler.border_color_word_3 = border.ui[3];
}
}
}
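The 4.x-only block above folds the old per-word conversion into a single step: for sampler state variants below V3D_SAMPLER_STATE_32, the border color is narrowed to half floats before being written out, while V3D 7.x TMUs take 32-bit floats directly. A minimal standalone sketch of that narrowing (assumes Mesa's _mesa_float_to_half() from util/half_float.h):

#include <stdint.h>
#include "util/half_float.h" /* _mesa_float_to_half() */

/* V3D 4.x only: convert a float border color to f16, one component per
 * 32-bit border color word (the f16 value sits in the low 16 bits).
 */
static void
border_color_to_f16(uint32_t words[4], const float rgba[4])
{
        for (int i = 0; i < 4; i++)
                words[i] = _mesa_float_to_half(rgba[i]);
}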
@@ -869,7 +892,8 @@ v3d_setup_texture_shader_state_from_buffer(struct V3DX(TEXTURE_SHADER_STATE) *te
}
static void
-v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex,
+v3d_setup_texture_shader_state(const struct v3d_device_info *devinfo,
+ struct V3DX(TEXTURE_SHADER_STATE) *tex,
struct pipe_resource *prsc,
int base_level, int last_level,
int first_layer, int last_layer,
@@ -917,19 +941,29 @@ v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex,
}
tex->base_level = base_level;
+
#if V3D_VERSION >= 40
tex->max_level = last_level;
/* Note that we don't have a job to reference the texture's sBO
* at state create time, so any time this sampler view is used
* we need to add the texture to the job.
*/
- tex->texture_base_pointer =
- cl_address(NULL,
- rsc->bo->offset +
- v3d_layer_offset(prsc, 0, first_layer));
+ const uint32_t base_offset = rsc->bo->offset +
+ v3d_layer_offset(prsc, 0, first_layer);
+
+ tex->texture_base_pointer = cl_address(NULL, base_offset);
#endif
+
tex->array_stride_64_byte_aligned = rsc->cube_map_stride / 64;
+#if V3D_VERSION >= 71
+ tex->chroma_offset_x = 1;
+ tex->chroma_offset_y = 1;
+ /* See the comment in the XML field definition for the rationale behind these shifts */
+ tex->texture_base_pointer_cb = base_offset >> 6;
+ tex->texture_base_pointer_cr = base_offset >> 6;
+#endif
+
/* Since other platform devices may produce UIF images even
* when they're not big enough for V3D to assume they're UIF,
* we force images with level 0 as UIF to be always treated
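The ">> 6" in the V3D 7.1 block above stores the CB/CR base pointers in 64-byte units, which presumes the base offset is 64-byte aligned. A hedged sketch of that packing (helper name illustrative, not from the patch):

#include <assert.h>
#include <stdint.h>

/* Pack a texture base address into a 64-byte-granular pointer field;
 * assumes, as the shift above implies, that the offset is 64B aligned.
 */
static inline uint32_t
pack_base_pointer_64b(uint32_t base_offset)
{
        assert((base_offset & 0x3f) == 0);
        return base_offset >> 6;
}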
@@ -977,7 +1011,8 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d,
v3dx_pack(map, TEXTURE_SHADER_STATE, tex) {
if (prsc->target != PIPE_BUFFER) {
- v3d_setup_texture_shader_state(&tex, prsc,
+ v3d_setup_texture_shader_state(&v3d->screen->devinfo,
+ &tex, prsc,
cso->u.tex.first_level,
cso->u.tex.last_level,
cso->u.tex.first_layer,
@@ -990,7 +1025,13 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d,
cso->u.buf.size);
}
- tex.srgb = util_format_is_srgb(cso->format);
+ bool is_srgb = util_format_is_srgb(cso->format);
+#if V3D_VERSION <= 42
+ tex.srgb = is_srgb;
+#endif
+#if V3D_VERSION >= 71
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+#endif
#if V3D_VERSION >= 40
tex.swizzle_r = v3d_translate_pipe_swizzle(so->swizzle[0]);
@@ -1040,7 +1081,10 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d,
* shader code if we wanted to read an MSAA sRGB
* texture without sRGB decode.
*/
+#if V3D_VERSION <= 42
tex.srgb = false;
+#endif
+
} else {
tex.texture_type = v3d_get_tex_format(&screen->devinfo,
cso->format);
@@ -1404,7 +1448,8 @@ v3d_create_image_view_texture_shader_state(struct v3d_context *v3d,
v3dx_pack(map, TEXTURE_SHADER_STATE, tex) {
if (prsc->target != PIPE_BUFFER) {
- v3d_setup_texture_shader_state(&tex, prsc,
+ v3d_setup_texture_shader_state(&v3d->screen->devinfo,
+ &tex, prsc,
iview->base.u.tex.level,
iview->base.u.tex.level,
iview->base.u.tex.first_layer,
diff --git a/src/gallium/drivers/v3d/v3dx_tfu.c b/src/gallium/drivers/v3d/v3dx_tfu.c
new file mode 100644
index 00000000000..d6b51390a11
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3dx_tfu.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright © 2021 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3d_context.h"
+#include "broadcom/common/v3d_tfu.h"
+
+bool
+v3dX(tfu)(struct pipe_context *pctx,
+ struct pipe_resource *pdst,
+ struct pipe_resource *psrc,
+ unsigned int src_level,
+ unsigned int base_level,
+ unsigned int last_level,
+ unsigned int src_layer,
+ unsigned int dst_layer,
+ bool for_mipmap)
+{
+ struct v3d_context *v3d = v3d_context(pctx);
+ struct v3d_screen *screen = v3d->screen;
+ struct v3d_resource *src = v3d_resource(psrc);
+ struct v3d_resource *dst = v3d_resource(pdst);
+ struct v3d_resource_slice *src_base_slice = &src->slices[src_level];
+ struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level];
+ int msaa_scale = pdst->nr_samples > 1 ? 2 : 1;
+ int width = u_minify(pdst->width0, base_level) * msaa_scale;
+ int height = u_minify(pdst->height0, base_level) * msaa_scale;
+ enum pipe_format pformat;
+
+ if (psrc->format != pdst->format)
+ return false;
+ if (psrc->nr_samples != pdst->nr_samples)
+ return false;
+
+ if (pdst->target != PIPE_TEXTURE_2D || psrc->target != PIPE_TEXTURE_2D)
+ return false;
+
+ /* Can't write to raster. */
+ if (dst_base_slice->tiling == V3D_TILING_RASTER)
+ return false;
+
+ /* When using the TFU for a blit we are doing an exact copy (input and
+ * output formats must match, no scaling, etc.), so there are no pixel
+ * format conversions. Thus we can rewrite the format to one that is
+ * TFU-compatible based on its texel size.
+ */
+ if (for_mipmap) {
+ pformat = pdst->format;
+ } else {
+ switch (dst->cpp) {
+ case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT; break;
+ case 8: pformat = PIPE_FORMAT_R16G16B16A16_FLOAT; break;
+ case 4: pformat = PIPE_FORMAT_R32_FLOAT; break;
+ case 2: pformat = PIPE_FORMAT_R16_FLOAT; break;
+ case 1: pformat = PIPE_FORMAT_R8_UNORM; break;
+ default: unreachable("unsupported format bit-size"); break;
+ };
+ }
+
+ uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat);
+
+ if (!v3dX(tfu_supports_tex_format)(tex_format, for_mipmap)) {
+ assert(for_mipmap);
+ return false;
+ }
+
+ v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false);
+ v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false);
+
+ struct drm_v3d_submit_tfu tfu = {
+ .ios = (height << 16) | width,
+ .bo_handles = {
+ dst->bo->handle,
+ src != dst ? src->bo->handle : 0
+ },
+ .in_sync = v3d->out_sync,
+ .out_sync = v3d->out_sync,
+ };
+ uint32_t src_offset = (src->bo->offset +
+ v3d_layer_offset(psrc, src_level, src_layer));
+ tfu.iia |= src_offset;
+
+ uint32_t dst_offset = (dst->bo->offset +
+ v3d_layer_offset(pdst, base_level, dst_layer));
+ tfu.ioa |= dst_offset;
+
+ switch (src_base_slice->tiling) {
+ case V3D_TILING_UIF_NO_XOR:
+ case V3D_TILING_UIF_XOR:
+ tfu.iis |= (src_base_slice->padded_height /
+ (2 * v3d_utile_height(src->cpp)));
+ break;
+ case V3D_TILING_RASTER:
+ tfu.iis |= src_base_slice->stride / src->cpp;
+ break;
+ case V3D_TILING_LINEARTILE:
+ case V3D_TILING_UBLINEAR_1_COLUMN:
+ case V3D_TILING_UBLINEAR_2_COLUMN:
+ break;
+ }
+
+#if V3D_VERSION <= 42
+ if (src_base_slice->tiling == V3D_TILING_RASTER) {
+ tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER <<
+ V3D33_TFU_ICFG_FORMAT_SHIFT);
+ } else {
+ tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE +
+ (src_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
+ V3D33_TFU_ICFG_FORMAT_SHIFT);
+ }
+ tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT;
+
+ if (last_level != base_level)
+ tfu.ioa |= V3D33_TFU_IOA_DIMTW;
+
+ tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE +
+ (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
+ V3D33_TFU_IOA_FORMAT_SHIFT);
+
+ tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT;
+
+ /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
+ * OPAD field for the destination (how many extra UIF blocks beyond
+ * those necessary to cover the height). When filling mipmaps, the
+ * miplevel 1+ tiling state is inferred.
+ */
+ if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR ||
+ dst_base_slice->tiling == V3D_TILING_UIF_XOR) {
+ int uif_block_h = 2 * v3d_utile_height(dst->cpp);
+ int implicit_padded_height = align(height, uif_block_h);
+
+ tfu.icfg |= (((dst_base_slice->padded_height -
+ implicit_padded_height) / uif_block_h) <<
+ V3D33_TFU_ICFG_OPAD_SHIFT);
+ }
+#endif /* V3D_VERSION <= 42 */
+
+#if V3D_VERSION >= 71
+ if (src_base_slice->tiling == V3D_TILING_RASTER) {
+ tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT;
+ } else {
+ tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE +
+ (src_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
+ V3D71_TFU_ICFG_IFORMAT_SHIFT;
+ }
+ tfu.icfg |= tex_format << V3D71_TFU_ICFG_OTYPE_SHIFT;
+
+ if (last_level != base_level)
+ tfu.v71.ioc |= V3D71_TFU_IOC_DIMTW;
+
+ tfu.v71.ioc |= ((V3D71_TFU_IOC_FORMAT_LINEARTILE +
+ (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
+ V3D71_TFU_IOC_FORMAT_SHIFT);
+
+ switch (dst_base_slice->tiling) {
+ case V3D_TILING_UIF_NO_XOR:
+ case V3D_TILING_UIF_XOR:
+ tfu.v71.ioc |=
+ (dst_base_slice->padded_height / (2 * v3d_utile_height(dst->cpp))) <<
+ V3D71_TFU_IOC_STRIDE_SHIFT;
+ break;
+ case V3D_TILING_RASTER:
+ tfu.v71.ioc |= (dst_base_slice->padded_height / dst->cpp) <<
+ V3D71_TFU_IOC_STRIDE_SHIFT;
+ break;
+ default:
+ break;
+ }
+
+ tfu.v71.ioc |= (last_level - base_level) << V3D71_TFU_IOC_NUMMM_SHIFT;
+#endif /* V3D_VERSION >= 71 */
+
+ int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu);
+ if (ret != 0) {
+ fprintf(stderr, "Failed to submit TFU job: %d\n", ret);
+ return false;
+ }
+
+ dst->writes++;
+
+ return true;
+}
+
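As a usage sketch (not part of the patch; the fallback helper is hypothetical), a mipmap-generation path would try the TFU first and fall back to a rendered blit whenever v3dX(tfu) returns false, e.g. for formats or tilings the TFU rejects:

#include "v3d_context.h"

/* Hypothetical caller: try the TFU for mipmap generation, fall back to
 * a shader blit when the TFU declines the job.
 */
static void
generate_mipmaps_tfu_first(struct pipe_context *pctx,
                           struct pipe_resource *prsc)
{
        bool ok = v3dX(tfu)(pctx, prsc, prsc,
                            0,                /* src_level */
                            1,                /* base_level */
                            prsc->last_level, /* last_level */
                            0, 0,             /* src_layer, dst_layer */
                            true);            /* for_mipmap */
        if (!ok)
                fallback_blit_mipmaps(pctx, prsc); /* hypothetical fallback */
}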
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 1c3f77f6588..9bdefb55194 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -610,6 +610,10 @@ struct pipe_resource
unsigned bind; /**< bitmask of PIPE_BIND_x */
unsigned flags; /**< bitmask of PIPE_RESOURCE_FLAG_x */
+ /* Hack for avoiding sync on v3d */
+ unsigned sync_condition;
+ unsigned sync_status;
+
/**
* For planar images, ie. YUV EGLImage external, etc, pointer to the
* next plane.
diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c
index 32135770e9d..2534c817dcc 100644
--- a/src/loader/loader_dri3_helper.c
+++ b/src/loader/loader_dri3_helper.c
@@ -275,7 +275,7 @@ dri3_update_max_num_back(struct loader_dri3_drawable *draw)
if (draw->swap_interval == 0)
draw->max_num_back = 4;
else
- draw->max_num_back = 3;
+ draw->max_num_back = 2;
assert(draw->max_num_back <= LOADER_DRI3_MAX_BACK);
break;
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 77c38bf48d5..1eb2dac8018 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1058,6 +1058,9 @@ struct gl_texture_object
* the pipe_resource *pt above.
*/
bool needs_validation;
+
+ /* Hack for avoiding sync on v3d */
+ GLboolean SyncCondition;
};
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index d8fb1ed4317..048deaa02f6 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -273,6 +273,13 @@ set_tex_parameteri(struct gl_context *ctx,
}
switch (pname) {
+ case GL_SYNC_CONDITION:
+ if (!!texObj->SyncCondition == !!params[0])
+ return GL_FALSE;
+ texObj->SyncCondition = !!params[0];
+ return GL_TRUE;
+ case GL_SYNC_STATUS:
+ return GL_TRUE;
case GL_TEXTURE_MIN_FILTER:
if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target))
goto invalid_dsa;
@@ -930,6 +937,17 @@ _mesa_texture_parameter_invalidate(struct gl_context *ctx,
{
if (texparam_invalidates_sampler_views(pname))
st_texture_release_all_sampler_views(st_context(ctx), texObj);
+
+ switch (pname) {
+ case GL_SYNC_CONDITION:
+ texObj->pt->sync_condition = texObj->SyncCondition;
+ break;
+ case GL_SYNC_STATUS:
+ texObj->pt->sync_status = 1;
+ break;
+ default:
+ ; /* nothing */
+ }
}
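For illustration only: this downstream hack overloads the fence-sync enums as texture parameters, which is not standard GL and only meaningful with this patched Mesa. A client such as a compositor could tag a texture roughly like this:

#include <GLES3/gl3.h>

/* Downstream-only hack: reuse GL_SYNC_CONDITION as a texture parameter
 * so the v3d driver can skip implicit sync on this resource. Not
 * standard GL; relies on the set_tex_parameteri() change above.
 */
static void
disable_implicit_sync(GLuint tex)
{
        glBindTexture(GL_TEXTURE_2D, tex);
        glTexParameteri(GL_TEXTURE_2D, GL_SYNC_CONDITION, 1);
}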
void
diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf
index 24cc2888755..2bc2748e7fe 100644
--- a/src/util/00-mesa-defaults.conf
+++ b/src/util/00-mesa-defaults.conf
@@ -77,6 +77,7 @@ TODO: document the other workarounds.
<!-- using vulkan wsi for xservers causes deadlocks -->
<application name="Xwayland" executable="Xwayland">
<option name="disable_xcb_surface" value="true" />
+ <option name="v3d_is_xserver_process" value="true" />
</application>
<application name="Unigine Heaven (32-bit)" executable="heaven_x86">
@@ -750,6 +751,7 @@ TODO: document the other workarounds.
<application name="mutter" executable="mutter">
<option name="adaptive_sync" value="false" />
<option name="v3d_nonmsaa_texture_size_limit" value="true" />
+ <option name="v3d_maintain_ignorable_scanout" value="true" />
</application>
<application name="muffin" executable="muffin">
<option name="adaptive_sync" value="false" />
@@ -801,6 +803,7 @@ TODO: document the other workarounds.
</application>
<application name="Xorg" executable="Xorg">
<option name="v3d_nonmsaa_texture_size_limit" value="true" />
+ <option name="v3d_is_xserver_process" value="true" />
</application>
<application name="gfxbench" executable="testfw_app">
diff --git a/src/util/driconf.h b/src/util/driconf.h
index ab7aa2c6553..70fa9f7b41b 100644
--- a/src/util/driconf.h
+++ b/src/util/driconf.h
@@ -517,6 +517,14 @@
DRI_CONF_OPT_B(v3d_nonmsaa_texture_size_limit, def, \
"Report the non-MSAA-only texture size limit")
+#define DRI_CONF_V3D_IS_XSERVER_PROCESS(def) \
+ DRI_CONF_OPT_B(v3d_is_xserver_process, def, \
+ "Identifies if the application is the Xserver.")
+
+#define DRI_CONF_V3D_MAINTAIN_IGNORABLE_SCANOUT(def) \
+ DRI_CONF_OPT_B(v3d_maintain_ignorable_scanout, def, \
+ "Maintain SCANOUT usage on resource allocations when the environment allows ignoring SCANOUT usage.")
+
/**
* \brief virgl specific configuration options
*/