nixfiles/hosts/raspberry-pi5/profiles/rbp2-001-add-raspberrypi5-support.patch

diff --git a/include/drm-uapi/v3d_drm.h b/include/drm-uapi/v3d_drm.h
index 3dfc0af8756..1a7d7a689de 100644
--- a/include/drm-uapi/v3d_drm.h
+++ b/include/drm-uapi/v3d_drm.h
@@ -319,6 +319,11 @@ struct drm_v3d_submit_tfu {
/* Pointer to an array of ioctl extensions*/
__u64 extensions;
+
+ struct {
+ __u32 ioc;
+ __u32 pad;
+ } v71;
};
/* Submits a compute shader for dispatch. This job will block on any
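Note: the new v71 sub-struct above lets userspace pass the TFU IOC register value on V3D 7.x. A minimal sketch of how a driver might fill it, assuming an open V3D render-node fd and a previously composed IOC word (neither is part of this patch):

    /* Hedged sketch: submit a TFU job on V3D 7.x. 'fd' and 'ioc_word'
     * are hypothetical; DRM_IOCTL_V3D_SUBMIT_TFU is the existing TFU
     * submit ioctl from this header.
     */
    struct drm_v3d_submit_tfu tfu = { 0 };
    tfu.v71.ioc = ioc_word;
    int ret = drmIoctl(fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu);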
diff --git a/src/broadcom/cle/meson.build b/src/broadcom/cle/meson.build
index 31a0d5bfa94..8ac32b313e4 100644
--- a/src/broadcom/cle/meson.build
+++ b/src/broadcom/cle/meson.build
@@ -23,7 +23,8 @@ v3d_versions = [
[21, 21],
[33, 33],
[41, 33],
- [42, 33]
+ [42, 33],
+ [71, 33]
]
v3d_xml_files = []
diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml
index a0242b5f1c2..624353ca2bf 100644
--- a/src/broadcom/cle/v3d_packet_v33.xml
+++ b/src/broadcom/cle/v3d_packet_v33.xml
@@ -1,4 +1,4 @@
-<vcxml gen="3.3" min_ver="33" max_ver="42">
+<vcxml gen="3.3" min_ver="33" max_ver="71">
<enum name="Compare Function" prefix="V3D_COMPARE_FUNC">
<value name="NEVER" value="0"/>
@@ -167,13 +167,36 @@
<value name="depth_16" value="2"/>
</enum>
- <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41">
+ <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41" max_ver="42">
<value name="none" value="0"/> <!-- no clamping -->
<value name="norm" value="1"/> <!-- [0,1] for f16 -->
<value name="pos" value="2"/> <!-- [0, for f16 -->
<value name="int" value="3" min_ver="42"/> <!-- clamp to integer RT's range -->
</enum>
+ <enum name="Render Target Type Clamp" prefix="V3D_RENDER_TARGET_TYPE_CLAMP" min_ver="71">
+ <value name="8i" value="0"/> <!-- no clamping -->
+ <value name="16i" value="1"/> <!-- no clamping -->
+ <value name="32i" value="2"/> <!-- no clamping -->
+ <value name="8ui" value="4"/> <!-- no clamping -->
+ <value name="16ui" value="5"/> <!-- no clamping -->
+ <value name="32ui" value="6"/> <!-- no clamping -->
+ <value name="8" value="8"/> <!-- no clamping -->
+ <value name="16f" value="9"/> <!-- no clamping -->
+ <value name="32f" value="10"/> <!-- no clamping -->
+ <value name="8i_clamped" value="16"/> <!-- clamp to integer RT's range -->
+ <value name="16i_clamped" value="17"/> <!-- clamp to integer RT's range -->
+ <value name="32i_clamped" value="18"/> <!-- clamp to integer RT's range -->
+ <value name="8ui_clamped" value="20"/> <!-- clamp to integer RT's range -->
+ <value name="16ui_clamped" value="21"/> <!-- clamp to integer RT's range -->
+ <value name="32ui_clamped" value="22"/> <!-- clamp to integer RT's range -->
+ <value name="16f_clamp_norm" value="24"/> <!-- [0,1] for f16 -->
+ <value name="16f_clamp_pos" value="25"/> <!-- [0, for f16 -->
+ <value name="16f_clamp_pq" value="26"/> <!-- PQ lin range, colour to [0, 125], alpha to [0, 1] for f16 -->
+ <value name="16f_clamp_hlg" value="27"/> <!-- HLG lin range, colour to [0, 12], alpha to [0, 1] for f16 -->
+ <value name="invalid" value="32"/>
+ </enum>
+
<!---
CL cache flush commands are not fully documented and subject to a
number of hardware issues that make them unreliable. Specifically:
@@ -263,13 +286,27 @@
<value name="r8ui" value="36"/>
<value name="srgbx8" value="37" max_ver="33"/>
<value name="rgbx8" value="38" max_ver="33"/>
- <value name="bstc" value="39" min_ver="41"/>
+ <value name="bstc8" value="39" min_ver="41"/>
<value name="d32f" value="40" min_ver="41"/>
<value name="d24" value="41" min_ver="41"/>
<value name="d16" value="42" min_ver="41"/>
<value name="d24s8" value="43" min_ver="41"/>
<value name="s8" value="44" min_ver="41"/>
<value name="rgba5551" value="45" min_ver="41"/>
+ <value name="bstc8_srgb" value="46" min_ver="71"/>
+ <value name="bstc10" value="47" min_ver="71"/>
+ <value name="bstc10_srgb" value="48" min_ver="71"/>
+ <value name="bstc10_pq" value="49" min_ver="71"/>
+ <value name="rgba10x6" value="50" min_ver="71"/>
+ <value name="bstc10_hlg" value="55" min_ver="71"/>
+ <value name="rgba10x6_hlg" value="56" min_ver="71"/>
+ <value name="rgb10_a2_hlg" value="57" min_ver="71"/>
+ <value name="bstc10_pq_bt1886" value="58" min_ver="71"/>
+ <value name="rgba10x6_pq_bt1886" value="59" min_ver="71"/>
+ <value name="rgb10_a2_pq_bt1886" value="60" min_ver="71"/>
+ <value name="bstc10_hlg_bt1886" value="61" min_ver="71"/>
+ <value name="rgba10x6_hlg_bt1886" value="62" min_ver="71"/>
+ <value name="rgb10_a2_hlg_bt1886" value="63" min_ver="71"/>
</enum>
<enum name="Z/S Output Image Format" prefix="V3D_OUTPUT_IMAGE_FORMAT_ZS" max_ver="33">
@@ -314,6 +351,12 @@
<value name="perp end caps" value="1"/>
</enum>
+ <enum name="Z Clip Mode" prefix="V3D_Z_CLIP_MODE">
+ <value name="NONE" value="0"/>
+ <value name="MIN_ONE_TO_ONE" value="1"/>
+ <value name="ZERO_TO_ONE" value="2"/>
+ </enum>
+
<packet code="0" name="Halt"/>
<packet code="1" name="NOP"/>
<packet code="4" name="Flush"/>
@@ -381,11 +424,13 @@
<field name="Last Tile of Frame" size="1" start="0" type="bool"/>
</packet>
- <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41">
+ <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41" max_ver="42">
<field name="Clear Z/Stencil Buffer" size="1" start="1" type="bool"/>
<field name="Clear all Render Targets" size="1" start="0" type="bool"/>
</packet>
+ <packet code="25" shortname="clear_rt" name="Clear Render Targets" cl="R" min_ver="71"/>
+
<packet code="26" shortname="load" name="Reload Tile Color Buffer" cl="R" max_ver="33">
<field name="Disable Color Buffer load" size="8" start="8" type="uint"/>
<field name="Enable Z load" size="1" start="7" type="bool"/>
@@ -443,6 +488,10 @@
<value name="Render target 1" value="1"/>
<value name="Render target 2" value="2"/>
<value name="Render target 3" value="3"/>
+ <value name="Render target 4" value="4" min_ver="71"/>
+ <value name="Render target 5" value="5" min_ver="71"/>
+ <value name="Render target 6" value="6" min_ver="71"/>
+ <value name="Render target 7" value="7" min_ver="71"/>
<value name="None" value="8"/>
<value name="Z" value="9"/>
<value name="Stencil" value="10"/>
@@ -789,7 +838,7 @@
<field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
</packet>
- <packet code="84" name="Blend Cfg" min_ver="41">
+ <packet code="84" name="Blend Cfg" min_ver="41" max_ver="42">
<field name="Render Target Mask" size="4" start="24" type="uint"/>
<field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
<field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
@@ -799,6 +848,16 @@
<field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
</packet>
+ <packet code="84" name="Blend Cfg" min_ver="71">
+ <field name="Render Target Mask" size="8" start="24" type="uint"/>
+ <field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
+ <field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
+ <field name="Color blend mode" size="4" start="12" type="Blend Mode"/>
+ <field name="Alpha blend dst factor" size="4" start="8" type="Blend Factor"/>
+ <field name="Alpha blend src factor" size="4" start="4" type="Blend Factor"/>
+ <field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
+ </packet>
+
<packet code="86" shortname="blend_ccolor" name="Blend Constant Color">
<field name="Alpha (F16)" size="16" start="48" type="uint"/>
<field name="Blue (F16)" size="16" start="32" type="uint"/>
@@ -828,7 +887,12 @@
<field name="address" size="32" start="0" type="address"/>
</packet>
- <packet code="96" name="Cfg Bits">
+ <packet code="93" name="Depth Bounds Test Limits" min_ver="71">
+ <field name="Lower Test Limit" size="32" start="0" type="float"/>
+ <field name="Upper Test Limit" size="32" start="32" type="float"/>
+ </packet>
+
+ <packet code="96" name="Cfg Bits" max_ver="42">
<field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
<field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
<field name="Blend enable" size="1" start="19" type="bool"/>
@@ -846,6 +910,25 @@
<field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
</packet>
+ <packet code="96" name="Cfg Bits" min_ver="71">
+ <field name="Z Clipping mode" size="2" start="22" type="Z Clip Mode"/>
+ <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
+ <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
+ <field name="Blend enable" size="1" start="19" type="bool"/>
+ <field name="Stencil enable" size="1" start="18" type="bool"/>
+ <field name="Z updates enable" size="1" start="15" type="bool"/>
+ <field name="Depth-Test Function" size="3" start="12" type="Compare Function"/>
+ <field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/>
+ <field name="Z Clamp Mode" size="1" start="10" type="bool"/>
+ <field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/>
+ <field name="Depth Bounds Test Enable" size="1" start="5" type="bool"/>
+ <field name="Line Rasterization" size="1" start="4" type="uint"/>
+ <field name="Enable Depth Offset" size="1" start="3" type="bool"/>
+ <field name="Clockwise Primitives" size="1" start="2" type="bool"/>
+ <field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/>
+ <field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
+ </packet>
+
<packet code="97" shortname="zero_all_flatshade_flags" name="Zero All Flat Shade Flags"/>
<packet code="98" shortname="flatshade_flags" name="Flat Shade Flags">
@@ -907,16 +990,26 @@
<field name="Minimum Zw" size="32" start="0" type="float"/>
</packet>
- <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B">
+ <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" max_ver="42">
<field name="Viewport Half-Height in 1/256th of pixel" size="32" start="32" type="float"/>
<field name="Viewport Half-Width in 1/256th of pixel" size="32" start="0" type="float"/>
</packet>
+ <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" min_ver="71">
+ <field name="Viewport Half-Height in 1/64th of pixel" size="32" start="32" type="float"/>
+ <field name="Viewport Half-Width in 1/64th of pixel" size="32" start="0" type="float"/>
+ </packet>
+
<packet shortname="clipper_z" name="Clipper Z Scale and Offset" code="111" cl="B">
<field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
<field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
</packet>
+ <packet shortname="clipper_z_no_guardband" name="Clipper Z Scale and Offset no guardband" code="112" cl="B" min_ver="71">
+ <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
+ <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
+ </packet>
+
<packet name="Number of Layers" code="119" min_ver="41">
<field name="Number of Layers" size="8" start="0" type="uint" minus_one="true"/>
</packet>
@@ -947,7 +1040,7 @@
<field name="sub-id" size="1" start="0" type="uint" default="0"/>
</packet>
- <packet code="120" name="Tile Binning Mode Cfg" min_ver="41">
+ <packet code="120" name="Tile Binning Mode Cfg" min_ver="41" max_ver="42">
<field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/>
<field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/>
@@ -971,6 +1064,35 @@
</field>
</packet>
+ <packet code="120" name="Tile Binning Mode Cfg" min_ver="71">
+ <field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/>
+ <field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/>
+
+ <field name="Log2 Tile Height" size="3" start="11" type="uint">
+ <value name="tile height 8 pixels" value="0"/>
+ <value name="tile height 16 pixels" value="1"/>
+ <value name="tile height 32 pixels" value="2"/>
+ <value name="tile height 64 pixels" value="3"/>
+ </field>
+ <field name="Log2 Tile Width" size="3" start="8" type="uint">
+ <value name="tile width 8 pixels" value="0"/>
+ <value name="tile width 16 pixels" value="1"/>
+ <value name="tile width 32 pixels" value="2"/>
+ <value name="tile width 64 pixels" value="3"/>
+ </field>
+
+ <field name="tile allocation block size" size="2" start="4" type="uint">
+ <value name="tile allocation block size 64b" value="0"/>
+ <value name="tile allocation block size 128b" value="1"/>
+ <value name="tile allocation block size 256b" value="2"/>
+ </field>
+ <field name="tile allocation initial block size" size="2" start="2" type="uint">
+ <value name="tile allocation initial block size 64b" value="0"/>
+ <value name="tile allocation initial block size 128b" value="1"/>
+ <value name="tile allocation initial block size 256b" value="2"/>
+ </field>
+ </packet>
+
<packet code="120" name="Tile Binning Mode Cfg (Part2)" cl="B" max_ver="33">
<field name="Tile Allocation Memory Address" size="32" start="32" type="address"/>
<field name="Tile Allocation Memory Size" size="32" start="0" type="uint"/>
@@ -1002,7 +1124,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="0"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41" max_ver="42">
<field name="Pad" size="12" start="52" type="uint"/>
<field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
@@ -1018,7 +1140,11 @@
<field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
<field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
- <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/>
+ <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP">
+ <value name="Render target maximum 32bpp" value="0"/>
+ <value name="Render target maximum 64bpp" value="1"/>
+ <value name="Render target maximum 128bpp" value="2"/>
+ </field>
<field name="Image Height (pixels)" size="16" start="24" type="uint"/>
<field name="Image Width (pixels)" size="16" start="8" type="uint"/>
@@ -1027,6 +1153,43 @@
<field name="sub-id" size="4" start="0" type="uint" default="0"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="71">
+ <field name="Pad" size="6" start="58" type="uint"/>
+
+ <field name="Log2 Tile Height" size="3" start="55" type="uint">
+ <value name="tile height 8 pixels" value="0"/>
+ <value name="tile height 16 pixels" value="1"/>
+ <value name="tile height 32 pixels" value="2"/>
+ <value name="tile height 64 pixels" value="3"/>
+ </field>
+ <field name="Log2 Tile Width" size="3" start="52" type="uint">
+ <value name="tile width 8 pixels" value="0"/>
+ <value name="tile width 16 pixels" value="1"/>
+ <value name="tile width 32 pixels" value="2"/>
+ <value name="tile width 64 pixels" value="3"/>
+ </field>
+
+ <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
+ <field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/>
+
+ <field name="Early-Z disable" size="1" start="46" type="bool"/>
+
+ <field name="Early-Z Test and Update Direction" size="1" start="45" type="uint">
+ <value name="Early-Z direction LT/LE" value="0"/>
+ <value name="Early-Z direction GT/GE" value="1"/>
+ </field>
+
+ <field name="Depth-buffer disable" size="1" start="44" type="bool"/>
+ <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
+ <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
+
+ <field name="Image Height (pixels)" size="16" start="24" type="uint"/>
+ <field name="Image Width (pixels)" size="16" start="8" type="uint"/>
+ <field name="Number of Render Targets" size="4" start="4" type="uint" minus_one="true"/>
+
+ <field name="sub-id" size="3" start="0" type="uint" default="0"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" max_ver="33">
<field name="Address" size="32" start="32" type="address"/>
@@ -1048,7 +1211,8 @@
<field name="sub-id" size="4" start="0" type="uint" default="2"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41">
+ <!-- On 4.1 the real name would be "Tile Rendering Mode Cfg (Render Target Configs)" -->
+ <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41" max_ver="42">
<field name="Pad" size="28" start="36" type="uint"/>
@@ -1099,7 +1263,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="3"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41" max_ver="42">
<field name="unused" size="16" start="48" type="uint"/>
<field name="Z Clear Value" size="32" start="16" type="float"/>
@@ -1108,6 +1272,15 @@
<field name="sub-id" size="4" start="0" type="uint" default="2"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="71">
+ <field name="unused" size="16" start="48" type="uint"/>
+
+ <field name="Z Clear Value" size="32" start="16" type="float"/>
+
+ <field name="Stencil Clear Value" size="8" start="8" type="uint"/>
+ <field name="sub-id" size="4" start="0" type="uint" default="1"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" max_ver="33">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
@@ -1117,7 +1290,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="4"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41" max_ver="42">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
<field name="Clear Color low 32 bits" size="32" start="8" type="uint"/>
@@ -1126,6 +1299,19 @@
<field name="sub-id" size="4" start="0" type="uint" default="3"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part1)" cl="R" min_ver="71">
+
+ <field name="Clear Color low bits" size="32" start="32" type="uint"/>
+ <field name="Internal Type and Clamping" size="5" start="27" type="Render Target Type Clamp"/>
+ <field name="Internal BPP" size="2" start="25" type="Internal BPP"/>
+
+ <field name="Stride" size="7" start="18" type="uint" minus_one="true"/>
+ <!-- In multiples of 512 bits -->
+ <field name="Base Address" size="11" start="7" type="uint"/>
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="2"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" max_ver="33">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
@@ -1135,7 +1321,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="5"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41" max_ver="42">
<!-- Express this as a 56-bit field? -->
<field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
<field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/>
@@ -1144,6 +1330,13 @@
<field name="sub-id" size="4" start="0" type="uint" default="4"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part2)" cl="R" min_ver="71">
+ <field name="Clear Color mid bits" size="40" start="24" type="uint"/>
+
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="3"/>
+ </packet>
+
<packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" max_ver="33">
<field name="pad" size="11" start="53" type="uint"/>
<field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
@@ -1155,7 +1348,7 @@
<field name="sub-id" size="4" start="0" type="uint" default="6"/>
</packet>
- <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41">
+ <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41" max_ver="42">
<field name="pad" size="11" start="53" type="uint"/>
<field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
<!-- image height is for Y flipping -->
@@ -1166,6 +1359,13 @@
<field name="sub-id" size="4" start="0" type="uint" default="5"/>
</packet>
+ <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part3)" cl="R" min_ver="71">
+ <field name="Clear Color top bits" size="56" start="8" type="uint"/>
+
+ <field name="Render Target number" size="3" start="3" type="uint"/>
+ <field name="sub-id" size="3" start="0" type="uint" default="4"/>
+ </packet>
+
<packet code="124" shortname="tile_coords" name="Tile Coordinates">
<field name="tile row number" size="12" start="12" type="uint"/>
<field name="tile column number" size="12" start="0" type="uint"/>
@@ -1240,7 +1440,7 @@
<field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
</struct>
- <struct name="GL Shader State Record" min_ver="41">
+ <struct name="GL Shader State Record" min_ver="41" max_ver="42">
<field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
<field name="Enable clipping" size="1" start="1" type="bool"/>
@@ -1299,6 +1499,63 @@
<field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
</struct>
+ <struct name="GL Shader State Record" min_ver="71">
+ <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
+ <field name="Enable clipping" size="1" start="1" type="bool"/>
+
+ <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/>
+ <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/>
+ <field name="Base Instance ID read by coordinate shader" size="1" start="4" type="bool"/>
+ <field name="Vertex ID read by vertex shader" size="1" start="5" type="bool"/>
+ <field name="Instance ID read by vertex shader" size="1" start="6" type="bool"/>
+ <field name="Base Instance ID read by vertex shader" size="1" start="7" type="bool"/>
+
+ <field name="Fragment shader does Z writes" size="1" start="8" type="bool"/>
+ <field name="Turn off early-z test" size="1" start="9" type="bool"/>
+
+ <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="12" type="bool"/>
+ <field name="Enable Sample Rate Shading" size="1" start="13" type="bool"/>
+ <field name="Any shader reads hardware-written Primitive ID" size="1" start="14" type="bool"/>
+ <field name="Insert Primitive ID as first varying to fragment shader" size="1" start="15" type="bool"/>
+ <field name="Turn off scoreboard" size="1" start="16" type="bool"/>
+ <field name="Do scoreboard wait on first thread switch" size="1" start="17" type="bool"/>
+ <field name="Disable implicit point/line varyings" size="1" start="18" type="bool"/>
+ <field name="No prim pack" size="1" start="19" type="bool"/>
+ <field name="Never defer FEP depth writes" size="1" start="20" type="bool"/>
+
+ <field name="Number of varyings in Fragment Shader" size="8" start="3b" type="uint"/>
+
+ <field name="Coordinate Shader output VPM segment size" size="4" start="4b" type="uint"/>
+ <field name="Min Coord Shader output segments required in play in addition to VCM cache size" size="4" start="36" type="uint"/>
+
+ <field name="Coordinate Shader input VPM segment size" size="4" start="5b" type="uint"/>
+ <field name="Min Coord Shader input segments required in play" size="4" start="44" type="uint" minus_one="true"/>
+
+ <field name="Vertex Shader output VPM segment size" size="4" start="6b" type="uint"/>
+ <field name="Min Vertex Shader output segments required in play in addition to VCM cache size" size="4" start="52" type="uint"/>
+
+ <field name="Vertex Shader input VPM segment size" size="4" start="7b" type="uint"/>
+ <field name="Min Vertex Shader input segments required in play" size="4" start="60" type="uint" minus_one="true"/>
+
+ <field name="Fragment Shader Code Address" size="29" start="67" type="address"/>
+ <field name="Fragment Shader 4-way threadable" size="1" start="64" type="bool"/>
+ <field name="Fragment Shader start in final thread section" size="1" start="65" type="bool"/>
+ <field name="Fragment Shader Propagate NaNs" size="1" start="66" type="bool"/>
+ <field name="Fragment Shader Uniforms Address" size="32" start="12b" type="address"/>
+
+ <field name="Vertex Shader Code Address" size="29" start="131" type="address"/>
+ <field name="Vertex Shader 4-way threadable" size="1" start="128" type="bool"/>
+ <field name="Vertex Shader start in final thread section" size="1" start="129" type="bool"/>
+ <field name="Vertex Shader Propagate NaNs" size="1" start="130" type="bool"/>
+ <field name="Vertex Shader Uniforms Address" size="32" start="20b" type="address"/>
+
+ <field name="Coordinate Shader Code Address" size="29" start="195" type="address"/>
+ <field name="Coordinate Shader 4-way threadable" size="1" start="192" type="bool"/>
+ <field name="Coordinate Shader start in final thread section" size="1" start="193" type="bool"/>
+ <field name="Coordinate Shader Propagate NaNs" size="1" start="194" type="bool"/>
+ <field name="Coordinate Shader Uniforms Address" size="32" start="28b" type="address"/>
+ </struct>
+
<struct name="Geometry Shader State Record" min_ver="41">
<field name="Geometry Bin Mode Shader Code Address" size="29" start="3" type="address"/>
<field name="Geometry Bin Mode Shader 4-way threadable" size="1" start="0" type="bool"/>
@@ -1543,7 +1800,7 @@
<field name="Offset Format 8" size="1" start="0" type="bool"/>
</struct>
- <struct name="TMU Config Parameter 2" min_ver="42">
+ <struct name="TMU Config Parameter 2" min_ver="42" max_ver="42">
<field name="Pad" size="7" start="25" type="uint"/>
<field name="LOD Query" size="1" start="24" type="bool"/>
<field name="Op" size="4" start="20" type="TMU Op"/>
@@ -1558,6 +1815,23 @@
<field name="Offset Format 8" size="1" start="0" type="bool"/>
</struct>
+ <struct name="TMU Config Parameter 2" min_ver="71">
+ <field name="Pad" size="5" start="27" type="uint"/>
+ <field name="Write conversion" size="1" start="26" type="bool"/>
+ <field name="DIM query" size="1" start="25" type="bool"/>
+ <field name="LOD Query" size="1" start="24" type="bool"/>
+ <field name="Op" size="4" start="20" type="TMU Op"/>
+ <field name="Offset R" size="4" start="16" type="int"/>
+ <field name="Offset T" size="4" start="12" type="int"/>
+ <field name="Offset S" size="4" start="8" type="int"/>
+ <field name="Gather Mode" size="1" start="7" type="bool"/>
+ <field name="Gather Component" size="2" start="5" type="uint"/>
+ <field name="Coefficient Mode" size="1" start="4" type="bool"/>
+ <field name="Sample Number" size="2" start="2" type="uint"/>
+ <field name="Disable AutoLOD" size="1" start="1" type="bool"/>
+ <field name="Offset Format 8" size="1" start="0" type="bool"/>
+ </struct>
+
<struct name="Texture Shader State" max_ver="33">
<field name="UIF XOR disable" size="1" start="255" type="bool"/>
<field name="Level 0 is strictly UIF" size="1" start="254" type="bool"/>
@@ -1611,7 +1885,7 @@
<field name="Filter" size="4" start="0" type="TMU Filter"/>
</struct>
- <struct name="Texture Shader State" min_ver="41">
+ <struct name="Texture Shader State" min_ver="41" max_ver="42">
<field name="Pad" size="56" start="136" type="uint"/>
<field name="UIF XOR disable" size="1" start="135" type="bool"/>
<field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
@@ -1652,6 +1926,82 @@
<field name="Flip texture X Axis" size="1" start="0" type="bool"/>
</struct>
+ <struct name="Texture Shader State" min_ver="71">
+ <field name="Pad" size="2" start="190" type="uint"/>
+ <!-- When we use an address type, there is an implicit requirement
+ that the address is a 32-bit value that is encoded starting at a 32-bit
+ aligned bit offset into the packet. If the address field has less than
+ 32 bits, it is assumed that the address is aligned. For example, a
+ 26-bit address field is expected to be 64-byte aligned (6 lsb bits
+ are 0) and will be encoded into a packet starting at bit
+ offset 6 into a 32-bit dword (since bits 0..5 of the address are
+ implicitly 0 and don't need to be explicitly encoded).
+
+ Unfortunately, the CB address below doesn't match this requirement:
+ it starts at bit 138, which is 10 bits into a 32-bit dword, but it
+ represents a 64-byte aligned address (6 lsb bits are 0), so we cannot
+ encode it as an address type. To fix this we encode these addresses
+ as uint types which has two implications:
+ 1. the driver is responsible for manually adding the buffer objects
+ for these addresses to the job BO list.
+ 2. the driver needs to pass an actual 26-bit address value by manually
+ shifting the 6 lsb bits (that are implicitly 0).
+ -->
+ <field name="texture_base pointer_Cr" size="26" start="164" type="uint"/>
+ <field name="texture base pointer Cb" size="26" start="138" type="uint"/>
+ <field name="Chroma offset y" size="1" start="137" type="uint"/>
+ <field name="Chroma offset x" size="1" start="136" type="uint"/>
+
+ <field name="UIF XOR disable" size="1" start="135" type="bool"/>
+ <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
+ <field name="Level 0 XOR enable" size="1" start="132" type="bool"/>
+ <field name="Level 0 UB_PAD" size="4" start="128" type="uint"/>
+
+ <field name="Base Level" size="4" start="124" type="uint"/>
+ <field name="Max Level" size="4" start="120" type="uint"/>
+
+ <field name="Swizzle A" size="3" start="117" type="uint">
+ <value name="Swizzle Zero" value="0"/>
+ <value name="Swizzle One" value="1"/>
+ <value name="Swizzle Red" value="2"/>
+ <value name="Swizzle Green" value="3"/>
+ <value name="Swizzle Blue" value="4"/>
+ <value name="Swizzle Alpha" value="5"/>
+ </field>
+
+ <field name="Swizzle B" size="3" start="114" type="uint"/>
+ <field name="Swizzle G" size="3" start="111" type="uint"/>
+ <field name="Swizzle R" size="3" start="108" type="uint"/>
+ <field name="Extended" size="1" start="107" type="bool"/>
+
+ <field name="Texture type" size="7" start="100" type="uint"/>
+ <field name="Image Depth" size="14" start="86" type="uint"/>
+ <field name="Image Height" size="14" start="72" type="uint"/>
+ <field name="Image Width" size="14" start="58" type="uint"/>
+
+ <!-- V3D 7.1.2 doesn't have the RB swap bit and has Array Stride starting
+ at bit 32. However, 7.1.5 included the RB swap bit at bit 32 and has
+ Array Stride starting at bit 33, which is backwards incompatible.
+ We use the definition from 7.1.5.
+ -->
+ <field name="Array Stride (64-byte aligned)" size="24" start="33" type="uint"/>
+ <field name="R/B swap" size="1" start="32" type="bool"/>
+
+ <field name="Texture base pointer" size="32" start="0" type="address"/>
+
+ <field name="Reverse" size="1" start="5" type="bool"/>
+ <field name="Transfer func" size="3" start="2" type="uint">
+ <value name="Transfer Func None" value="0"/>
+ <value name="Transfer Func sRGB" value="1"/>
+ <value name="Transfer Func PQ" value="2"/>
+ <value name="Transfer Func HLG" value="3"/>
+ <value name="Transfer Func PQ BT1886" value="4"/>
+ <value name="Transfer Func HLG BT1886" value="5"/>
+ </field>
+ <field name="Flip texture Y Axis" size="1" start="1" type="bool"/>
+ <field name="Flip texture X Axis" size="1" start="0" type="bool"/>
+ </struct>
+
<struct name="Sampler State" min_ver="41">
<field name="Border color word 3" size="32" start="160" type="uint"/>
<field name="Border color word 2" size="32" start="128" type="uint"/>
diff --git a/src/broadcom/cle/v3dx_pack.h b/src/broadcom/cle/v3dx_pack.h
index 5762e5aaa70..e5a1eb26698 100644
--- a/src/broadcom/cle/v3dx_pack.h
+++ b/src/broadcom/cle/v3dx_pack.h
@@ -37,6 +37,8 @@
# include "cle/v3d_packet_v41_pack.h"
#elif (V3D_VERSION == 42)
# include "cle/v3d_packet_v42_pack.h"
+#elif (V3D_VERSION == 71)
+# include "cle/v3d_packet_v71_pack.h"
#else
# error "Need to add a pack header include for this v3d version"
#endif
diff --git a/src/broadcom/clif/clif_private.h b/src/broadcom/clif/clif_private.h
index 6ace62b0310..cda407a00bf 100644
--- a/src/broadcom/clif/clif_private.h
+++ b/src/broadcom/clif/clif_private.h
@@ -101,6 +101,8 @@ bool v3d41_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
const uint8_t *cl, uint32_t *size, bool reloc_mode);
bool v3d42_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
const uint8_t *cl, uint32_t *size, bool reloc_mode);
+bool v3d71_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
+ const uint8_t *cl, uint32_t *size, bool reloc_mode);
static inline void
out(struct clif_dump *clif, const char *fmt, ...)
diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c
index 272190eb2e5..7bc2b662cfc 100644
--- a/src/broadcom/common/v3d_device_info.c
+++ b/src/broadcom/common/v3d_device_info.c
@@ -36,6 +36,9 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
struct drm_v3d_get_param ident1 = {
.param = DRM_V3D_PARAM_V3D_CORE0_IDENT1,
};
+ struct drm_v3d_get_param hub_ident3 = {
+ .param = DRM_V3D_PARAM_V3D_HUB_IDENT3,
+ };
int ret;
ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0);
@@ -62,10 +65,13 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
int qups = (ident1.value >> 8) & 0xf;
devinfo->qpu_count = nslc * qups;
+ devinfo->has_accumulators = devinfo->ver < 71;
+
switch (devinfo->ver) {
case 33:
case 41:
case 42:
+ case 71:
break;
default:
fprintf(stderr,
@@ -75,5 +81,14 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
return false;
}
- return true;
+ ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &hub_ident3);
+ if (ret != 0) {
+ fprintf(stderr, "Couldn't get V3D core HUB IDENT3: %s\n",
+ strerror(errno));
+ return false;
+ }
+
+ devinfo->rev = (hub_ident3.value >> 8) & 0xff;
+
+ return true;
}
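For reference, HUB IDENT3 packs the IP revision in bits 15:8, so a 7.1.5 part reports ver = 71 and rev = 5. The revision matters because, as the XML comment earlier in this patch notes, 7.1.5 moved the Array Stride bit relative to 7.1.2. A hedged sketch of the kind of check this enables (the helper name is hypothetical, not from this patch):

    /* True on 7.1.5+ parts, where the R/B swap bit occupies bit 32 of
     * the Texture Shader State and Array Stride starts at bit 33.
     */
    static bool has_rb_swap_bit(const struct v3d_device_info *devinfo)
    {
            return devinfo->ver > 71 ||
                   (devinfo->ver == 71 && devinfo->rev >= 5);
    }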
diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h
index 97abd9b8d9f..8dfc7858727 100644
--- a/src/broadcom/common/v3d_device_info.h
+++ b/src/broadcom/common/v3d_device_info.h
@@ -34,11 +34,17 @@ struct v3d_device_info {
/** Simple V3D version: major * 10 + minor */
uint8_t ver;
+ /** V3D revision number */
+ uint8_t rev;
+
/** Size of the VPM, in bytes. */
int vpm_size;
/* NSLC * QUPS from the core's IDENT registers. */
int qpu_count;
+
+ /* If the hw has accumulator registers */
+ bool has_accumulators;
};
typedef int (*v3d_ioctl_fun)(int fd, unsigned long request, void *arg);
diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h
index 46f38bd7484..354c8784914 100644
--- a/src/broadcom/common/v3d_limits.h
+++ b/src/broadcom/common/v3d_limits.h
@@ -42,7 +42,8 @@
#define V3D_MAX_SAMPLES 4
-#define V3D_MAX_DRAW_BUFFERS 4
+#define V3D_MAX_DRAW_BUFFERS 8
+#define V3D_MAX_RENDER_TARGETS(ver) (ver < 71 ? 4 : 8)
#define V3D_MAX_POINT_SIZE 512.0f
#define V3D_MAX_LINE_WIDTH 32
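The distinction here: V3D_MAX_DRAW_BUFFERS is the compile-time upper bound used to size arrays, while V3D_MAX_RENDER_TARGETS(ver) is the per-generation hardware limit. A quick illustrative sanity check:

    /* Both expressions are constant, so this can be checked statically. */
    _Static_assert(V3D_MAX_RENDER_TARGETS(42) == 4, "V3D 4.x exposes 4 RTs");
    _Static_assert(V3D_MAX_RENDER_TARGETS(71) == 8, "V3D 7.x exposes 8 RTs");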
diff --git a/src/broadcom/common/v3d_macros.h b/src/broadcom/common/v3d_macros.h
index fe89398208a..b4291fb5350 100644
--- a/src/broadcom/common/v3d_macros.h
+++ b/src/broadcom/common/v3d_macros.h
@@ -41,6 +41,9 @@
#elif (V3D_VERSION == 42)
# define V3DX(x) V3D42_##x
# define v3dX(x) v3d42_##x
+#elif (V3D_VERSION == 71)
+# define V3DX(x) V3D71_##x
+# define v3dX(x) v3d71_##x
#else
# error "Need to add prefixing macros for this v3d version"
#endif
diff --git a/src/broadcom/common/v3d_performance_counters.h b/src/broadcom/common/v3d_performance_counters.h
index 08d750c2cbe..a8f0cff8784 100644
--- a/src/broadcom/common/v3d_performance_counters.h
+++ b/src/broadcom/common/v3d_performance_counters.h
@@ -28,6 +28,110 @@
#define V3D_PERFCNT_NAME 1
#define V3D_PERFCNT_DESCRIPTION 2
+#ifndef V3D_VERSION
+# error "The V3D_VERSION macro must be defined"
+#endif
+
+#if (V3D_VERSION >= 71)
+
+static const char *v3d_performance_counters[][3] = {
+ {"CORE", "cycle-count", "[CORE] Cycle counter"},
+ {"CORE", "core-active", "[CORE] Bin/Render/Compute active cycles"},
+ {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"},
+ {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"},
+ {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
+ {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
+ {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
+ {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"},
+ {"FEP", "FEP-valid-quads", "[FEP] Valid quads"},
+ {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"},
+ {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"},
+ {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"},
+ {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"},
+ {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"},
+ {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"},
+ {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"},
+ {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"},
+ {"PTB", "PTB-primitives-discarded-reversed", "[PTB] Primitives that are discarded because they are reversed"},
+ {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"},
+ {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"},
+ {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"},
+ {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"},
+ {"TMU", "TMU-active-cycles", "[TMU] Active cycles"},
+ {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"},
+ {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"},
+ {"TMU", "TMU-cache-x4-active-cycles", "[TMU] Cache active cycles for x4 access"},
+ {"TMU", "TMU-cache-x4-stalled-cycles", "[TMU] Cache stalled cycles for x4 access"},
+ {"TMU", "TMU-total-text-quads-x4-access", "[TMU] Total texture cache x4 access"},
+ {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"},
+ {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"},
+ {"L2T", "L2T-local", "[L2T] Local mode access"},
+ {"L2T", "L2T-writeback", "[L2T] Writeback"},
+ {"L2T", "L2T-zero", "[L2T] Zero"},
+ {"L2T", "L2T-merge", "[L2T] Merge"},
+ {"L2T", "L2T-fill", "[L2T] Fill"},
+ {"L2T", "L2T-stalls-no-wid", "[L2T] Stalls because no WID available"},
+ {"L2T", "L2T-stalls-no-rid", "[L2T] Stalls because no RID available"},
+ {"L2T", "L2T-stalls-queue-full", "[L2T] Stalls because internal queue full"},
+ {"L2T", "L2T-stalls-wrightback", "[L2T] Stalls because writeback in flight"},
+ {"L2T", "L2T-stalls-mem", "[L2T] Stalls because AXI blocks read"},
+ {"L2T", "L2T-stalls-fill", "[L2T] Stalls because fill pending for victim cache-line"},
+ {"L2T", "L2T-hitq", "[L2T] Sent request via hit queue"},
+ {"L2T", "L2T-hitq-full", "[L2T] Sent request via main queue because hit queue is full"},
+ {"L2T", "L2T-stalls-read-data", "[L2T] Stalls because waiting for data from SDRAM"},
+ {"L2T", "L2T-TMU-read-hits", "[L2T] TMU read hits"},
+ {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"},
+ {"L2T", "L2T-VCD-read-hits", "[L2T] VCD read hits"},
+ {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"},
+ {"L2T", "L2T-SLC-read-hits", "[L2T] SLC read hits (all slices)"},
+ {"L2T", "L2T-SLC-read-miss", "[L2T] SLC read misses (all slices)"},
+ {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"},
+ {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"},
+ {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"},
+ {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"},
+ {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"},
+ {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"},
+ {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"},
+ {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"},
+ {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"},
+ {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"},
+ {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"},
+ {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"},
+ {"CORE", "core-memory-writes", "[CORE] Total memory writes"},
+ {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"},
+ {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"},
+ {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"},
+ {"CORE", "core-memory-reads", "[CORE] Total memory reads"},
+ {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"},
+ {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"},
+ {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"},
+ {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"},
+ {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"},
+ {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"},
+ {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"},
+ {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"},
+ {"AXI", "AXI-read-trans", "[AXI] Read transaction count"},
+ {"AXI", "AXI-write-trans", "[AXI] Write transaction count"},
+ {"AXI", "AXI-read-wait-cycles", "[AXI] Read total wait cycles"},
+ {"AXI", "AXI-write-wait-cycles", "[AXI] Write total wait cycles"},
+ {"AXI", "AXI-max-outstanding-reads", "[AXI] Maximium outstanding read transactions"},
+ {"AXI", "AXI-max-outstanding-writes", "[AXI] Maximum outstanding write transactions"},
+ {"QPU", "QPU-wait-bubble", "[QPU] Pipeline bubble in qcycles due all threads waiting"},
+ {"QPU", "QPU-ic-miss-bubble", "[QPU] Pipeline bubble in qcycles due instruction-cache miss"},
+ {"QPU", "QPU-active", "[QPU] Executed shader instruction"},
+ {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"},
+ {"QPU", "QPU-stalls", "[QPU] Stalled qcycles executing shader instruction"},
+ {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"},
+ {"QPU", "QPU-stalls-TMU", "[QPU] Stalled qcycles waiting for TMU"},
+ {"QPU", "QPU-stalls-TLB", "[QPU] Stalled qcycles waiting for TLB"},
+ {"QPU", "QPU-stalls-VPM", "[QPU] Stalled qcycles waiting for VPM"},
+ {"QPU", "QPU-stalls-uniforms", "[QPU] Stalled qcycles waiting for uniforms"},
+ {"QPU", "QPU-stalls-SFU", "[QPU] Stalled qcycles waiting for SFU"},
+ {"QPU", "QPU-stalls-other", "[QPU] Stalled qcycles waiting for any other reason (vary/W/Z)"},
+};
+
+#elif (V3D_VERSION >= 41)
+
static const char *v3d_performance_counters[][3] = {
{"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
{"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
@@ -118,4 +222,8 @@ static const char *v3d_performance_counters[][3] = {
{"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
};
+#else
+static const char *v3d_performance_counters[][3] = { };
+#endif
+
#endif
diff --git a/src/broadcom/common/v3d_tfu.h b/src/broadcom/common/v3d_tfu.h
index 80da224ca2d..572d0074794 100644
--- a/src/broadcom/common/v3d_tfu.h
+++ b/src/broadcom/common/v3d_tfu.h
@@ -48,4 +48,27 @@
#define V3D33_TFU_ICFG_FORMAT_UIF_NO_XOR 14
#define V3D33_TFU_ICFG_FORMAT_UIF_XOR 15
+/* Disable level 0 write, just write following mipmaps */
+#define V3D71_TFU_IOC_DIMTW (1 << 0)
+#define V3D71_TFU_IOC_FORMAT_SHIFT 12
+#define V3D71_TFU_IOC_FORMAT_LINEARTILE 3
+#define V3D71_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4
+#define V3D71_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5
+#define V3D71_TFU_IOA_FORMAT_UIF_NO_XOR 6
+#define V3D71_TFU_IOA_FORMAT_UIF_XOR 7
+
+#define V3D71_TFU_IOC_STRIDE_SHIFT 16
+#define V3D71_TFU_IOC_NUMMM_SHIFT 4
+
+#define V3D71_TFU_ICFG_OTYPE_SHIFT 16
+#define V3D71_TFU_ICFG_IFORMAT_SHIFT 23
+#define V3D71_TFU_ICFG_FORMAT_RASTER 0
+#define V3D71_TFU_ICFG_FORMAT_SAND_128 1
+#define V3D71_TFU_ICFG_FORMAT_SAND_256 2
+#define V3D71_TFU_ICFG_FORMAT_LINEARTILE 11
+#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12
+#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13
+#define V3D71_TFU_ICFG_FORMAT_UIF_NO_XOR 14
+#define V3D71_TFU_ICFG_FORMAT_UIF_XOR 15
+
#endif
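These defines describe the layout of the 7.1 TFU IOC register. A hedged sketch of composing an IOC word from them, under the assumption that num_mipmaps and out_stride stand in for values the driver computed elsewhere (the exact field semantics beyond these shifts are not spelled out in this header):

    /* UIF (no XOR) output writing 'num_mipmaps' levels with the given
     * output stride; DIMTW skips the level-0 write so only the lower
     * mipmaps are regenerated.
     */
    uint32_t num_mipmaps = 4, out_stride = 16;
    uint32_t ioc = (V3D71_TFU_IOA_FORMAT_UIF_NO_XOR << V3D71_TFU_IOC_FORMAT_SHIFT) |
                   (num_mipmaps << V3D71_TFU_IOC_NUMMM_SHIFT) |
                   (out_stride << V3D71_TFU_IOC_STRIDE_SHIFT) |
                   V3D71_TFU_IOC_DIMTW;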
diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c
index 57872a923d3..8a50d279985 100644
--- a/src/broadcom/common/v3d_util.c
+++ b/src/broadcom/common/v3d_util.c
@@ -87,10 +87,37 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
return best_wgs_per_sg;
}
+#define V3D71_TLB_COLOR_SIZE (16 * 1024)
+#define V3D71_TLB_DEPTH_SIZE (16 * 1024)
+#define V3D71_TLB_AUX_DEPTH_SIZE (8 * 1024)
+
+static bool
+tile_size_valid(uint32_t pixel_count, uint32_t color_bpp, uint32_t depth_bpp)
+{
+ /* First, we check if we can fit this tile size allocating the depth
+ * TLB memory to color.
+ */
+ if (pixel_count * depth_bpp <= V3D71_TLB_AUX_DEPTH_SIZE &&
+ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE + V3D71_TLB_DEPTH_SIZE) {
+ return true;
+ }
+
+ /* Otherwise the tile must fit in the main TLB buffers */
+ return pixel_count * depth_bpp <= V3D71_TLB_DEPTH_SIZE &&
+ pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE;
+}
+
void
-v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
- bool msaa, bool double_buffer,
- uint32_t *width, uint32_t *height)
+v3d_choose_tile_size(const struct v3d_device_info *devinfo,
+ uint32_t color_attachment_count,
+ /* V3D 4.x max internal bpp of all RTs */
+ uint32_t max_internal_bpp,
+ /* V3D 7.x accumulated bpp for all RTs (in bytes) */
+ uint32_t total_color_bpp,
+ bool msaa,
+ bool double_buffer,
+ uint32_t *width,
+ uint32_t *height)
{
static const uint8_t tile_sizes[] = {
64, 64,
@@ -103,19 +130,65 @@ v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
};
uint32_t idx = 0;
- if (color_attachment_count > 2)
- idx += 2;
- else if (color_attachment_count > 1)
- idx += 1;
+ if (devinfo->ver >= 71) {
+ /* In V3D 7.x, we use the actual bpp used by color attachments to compute
+ * the tile size instead of the maximum bpp. This may allow us to choose a
+ * larger tile size than we would in 4.x in scenarios with multiple RTs
+ * with different bpps.
+ *
+ * Also, the TLB has an auxiliary buffer of 8KB that will be automatically
+ * used for depth instead of the main 16KB depth TLB buffer when the depth
+ * tile fits in the auxiliary buffer, allowing the hardware to allocate
+ * the 16KB from the main depth TLB to the color TLB. If we can do that,
+ * then we are effectively doubling the memory we have for color and we
+ * can also select a larger tile size. This is necessary to support
+ * the most expensive configuration: 8x128bpp RTs + MSAA.
+ *
+ * FIXME: the docs state that depth TLB memory can be used for color
+ * if depth testing is not used by setting the 'depth disable' bit in the
+ * rendering configuration. However, this comes with a requirement that
+ * occlusion queries must not be active. We need to clarify if this means
+ * active at the point at which we emit a tile rendering configuration
+ * item, meaning that we have a query spanning a full render pass
+ * (this is something we can tell before we emit the rendering
+ * configuration item) or active in the subpass for which we are enabling
+ * the bit (which we can't tell until later, when we record commands for
+ * the subpass). If it is the latter, then we cannot use this feature.
+ *
+ * FIXME: pending handling double_buffer.
+ */
+ const uint32_t color_bpp = total_color_bpp * (msaa ? 4 : 1);
+ const uint32_t depth_bpp = 4 * (msaa ? 4 : 1);
+ do {
+ const uint32_t tile_w = tile_sizes[idx * 2];
+ const uint32_t tile_h = tile_sizes[idx * 2 + 1];
+ if (tile_size_valid(tile_w * tile_h, color_bpp, depth_bpp))
+ break;
+ idx++;
+ } while (idx < ARRAY_SIZE(tile_sizes) / 2);
+
+ /* FIXME: pending handling double_buffer */
+ assert(!double_buffer);
+ } else {
+ /* On V3D 4.x tile size is selected based on the number of RTs, the
+ * maximum bpp across all of them and whether 4x MSAA is used.
+ */
+ if (color_attachment_count > 4)
+ idx += 3;
+ else if (color_attachment_count > 2)
+ idx += 2;
+ else if (color_attachment_count > 1)
+ idx += 1;
- /* MSAA and double-buffer are mutually exclusive */
- assert(!msaa || !double_buffer);
- if (msaa)
- idx += 2;
- else if (double_buffer)
- idx += 1;
+ /* MSAA and double-buffer are mutually exclusive */
+ assert(!msaa || !double_buffer);
+ if (msaa)
+ idx += 2;
+ else if (double_buffer)
+ idx += 1;
- idx += max_color_bpp;
+ idx += max_internal_bpp;
+ }
assert(idx < ARRAY_SIZE(tile_sizes) / 2);
@@ -170,3 +243,30 @@ v3d_hw_prim_type(enum mesa_prim prim_type)
unreachable("Unsupported primitive type");
}
}
+
+uint32_t
+v3d_internal_bpp_words(uint32_t internal_bpp)
+{
+ switch (internal_bpp) {
+ case 0 /* V3D_INTERNAL_BPP_32 */:
+ return 1;
+ case 1 /* V3D_INTERNAL_BPP_64 */:
+ return 2;
+ case 2 /* V3D_INTERNAL_BPP_128 */:
+ return 4;
+ default:
+ unreachable("Unsupported internal BPP");
+ }
+}
+
+uint32_t
+v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
+ uint32_t bpp)
+{
+ /* The stride is expressed in multiples of 128 bits and covers 2 rows:
+ * one row is (tile_width * bpp) 32-bit words, i.e. that count divided
+ * by 4 in 128-bit units, and covering two rows doubles it, hence the
+ * net division by 2 instead of 4.
+ */
+
+ return (tile_width * bpp) / 2;
+}
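Worked numbers for the TLB budget that tile_size_valid enforces, using the most expensive configuration mentioned in the comment above (8 RTs at 128bpp with 4x MSAA): total_color_bpp = 8 * 16 = 128 bytes per pixel, so color_bpp = 128 * 4 = 512 bytes and depth_bpp = 4 * 4 = 16 bytes. An 8x8 tile (64 pixels) then needs 64 * 16 = 1 KiB of depth, which fits the 8 KiB auxiliary buffer, and 64 * 512 = 32 KiB of color, which exactly fills the 16 KiB color TLB plus the 16 KiB depth TLB reallocated to color:

    /* Illustrative only: tile_size_valid is static to v3d_util.c. */
    assert(tile_size_valid(8 * 8, 512, 16));    /* 8x8 tile fits */
    assert(!tile_size_valid(16 * 8, 512, 16));  /* 16x8 tile does not */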
diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h
index eb802b77f67..d02d41dd089 100644
--- a/src/broadcom/common/v3d_util.h
+++ b/src/broadcom/common/v3d_util.h
@@ -24,6 +24,7 @@
#ifndef V3D_UTIL_H
#define V3D_UTIL_H
+#include "util/macros.h"
#include "common/v3d_device_info.h"
#include "pipe/p_defines.h"
@@ -36,9 +37,14 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
uint32_t wg_size);
void
-v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
- bool msaa, bool double_buffer,
- uint32_t *width, uint32_t *height);
+v3d_choose_tile_size(const struct v3d_device_info *devinfo,
+ uint32_t color_attachment_count,
+ uint32_t max_internal_bpp,
+ uint32_t total_color_bpp,
+ bool msaa,
+ bool double_buffer,
+ uint32_t *width,
+ uint32_t *height);
uint32_t
v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle);
@@ -46,4 +52,30 @@ v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle);
uint32_t
v3d_hw_prim_type(enum mesa_prim prim_type);
+uint32_t
+v3d_internal_bpp_words(uint32_t internal_bpp);
+
+/* Some configuration packets want the size as its log2, biased so
+ * that 0 means size 8.
+ */
+static inline uint8_t
+log2_tile_size(uint32_t size)
+{
+ switch (size) {
+ case 8:
+ return 0;
+ case 16:
+ return 1;
+ case 32:
+ return 2;
+ case 64:
+ return 3;
+ default:
+ unreachable("Unsupported tile width/height");
+ }
+}
+
+uint32_t
+v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
+ uint32_t bpp);
#endif
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index ad461dbe24c..4536d3bc67b 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -1005,32 +1005,36 @@ emit_fragcoord_input(struct v3d_compile *c, int attr)
static struct qreg
emit_smooth_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg w, struct qreg r5)
+ struct qreg vary, struct qreg w, struct qreg c_reg)
{
- return vir_FADD(c, vir_FMUL(c, vary, w), r5);
+ return vir_FADD(c, vir_FMUL(c, vary, w), c_reg);
}
static struct qreg
emit_noperspective_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg r5)
+ struct qreg vary, struct qreg c_reg)
{
- return vir_FADD(c, vir_MOV(c, vary), r5);
+ return vir_FADD(c, vir_MOV(c, vary), c_reg);
}
static struct qreg
emit_flat_varying(struct v3d_compile *c,
- struct qreg vary, struct qreg r5)
+ struct qreg vary, struct qreg c_reg)
{
vir_MOV_dest(c, c->undef, vary);
- return vir_MOV(c, r5);
+ return vir_MOV(c, c_reg);
}
static struct qreg
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
int8_t input_idx, uint8_t swizzle, int array_index)
{
- struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
- struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
+ struct qreg c_reg; /* C coefficient */
+
+ if (c->devinfo->has_accumulators)
+ c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
+ else
+ c_reg = vir_reg(QFILE_REG, 0);
struct qinst *ldvary = NULL;
struct qreg vary;
@@ -1041,7 +1045,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
vary = vir_emit_def(c, ldvary);
} else {
vir_NOP(c)->qpu.sig.ldvary = true;
- vary = r3;
+ vary = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
}
/* Store the input value before interpolation so we can implement
@@ -1050,7 +1054,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
if (input_idx >= 0) {
assert(var);
c->interp[input_idx].vp = vary;
- c->interp[input_idx].C = vir_MOV(c, r5);
+ c->interp[input_idx].C = vir_MOV(c, c_reg);
c->interp[input_idx].mode = var->data.interpolation;
}
@@ -1060,7 +1064,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
*/
if (!var) {
assert(input_idx < 0);
- return emit_smooth_varying(c, vary, c->payload_w, r5);
+ return emit_smooth_varying(c, vary, c->payload_w, c_reg);
}
int i = c->num_inputs++;
@@ -1075,20 +1079,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
if (var->data.centroid) {
BITSET_SET(c->centroid_flags, i);
result = emit_smooth_varying(c, vary,
- c->payload_w_centroid, r5);
+ c->payload_w_centroid, c_reg);
} else {
- result = emit_smooth_varying(c, vary, c->payload_w, r5);
+ result = emit_smooth_varying(c, vary, c->payload_w, c_reg);
}
break;
case INTERP_MODE_NOPERSPECTIVE:
BITSET_SET(c->noperspective_flags, i);
- result = emit_noperspective_varying(c, vary, r5);
+ result = emit_noperspective_varying(c, vary, c_reg);
break;
case INTERP_MODE_FLAT:
BITSET_SET(c->flat_shade_flags, i);
- result = emit_flat_varying(c, vary, r5);
+ result = emit_flat_varying(c, vary, c_reg);
break;
default:
@@ -1685,6 +1689,22 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
result = vir_VFPACK(c, src[0], src[1]);
break;
+ case nir_op_vpack_v3d:
+ result = vir_VPACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_v11fpack_v3d:
+ result = vir_V11FPACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_v10pack_v3d:
+ result = vir_V10PACK(c, src[0], src[1]);
+ break;
+
+ case nir_op_v8pack_v3d:
+ result = vir_V8PACK(c, src[0], src[1]);
+ break;
+
case nir_op_unpack_half_2x16_split_x:
result = vir_FMOV(c, src[0]);
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
@@ -1715,6 +1735,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));
break;
}
+ case nir_op_vftounorm8_v3d:
+ result = vir_VFTOUNORM8(c, src[0]);
+ break;
+
+ case nir_op_vftosnorm8_v3d:
+ result = vir_VFTOSNORM8(c, src[0]);
+ break;
+
+ case nir_op_vftounorm10lo_v3d:
+ result = vir_VFTOUNORM10LO(c, src[0]);
+ break;
+
+ case nir_op_vftounorm10hi_v3d:
+ result = vir_VFTOUNORM10HI(c, src[0]);
+ break;
+
+ case nir_op_ftounorm16_v3d:
+ result = vir_FTOUNORM16(c, src[0]);
+ break;
+
+ case nir_op_ftosnorm16_v3d:
+ result = vir_FTOSNORM16(c, src[0]);
+ break;
default:
fprintf(stderr, "unknown NIR ALU inst: ");
@@ -2440,15 +2483,17 @@ ntq_setup_outputs(struct v3d_compile *c)
switch (var->data.location) {
case FRAG_RESULT_COLOR:
- c->output_color_var[0] = var;
- c->output_color_var[1] = var;
- c->output_color_var[2] = var;
- c->output_color_var[3] = var;
+ for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
+ c->output_color_var[i] = var;
break;
case FRAG_RESULT_DATA0:
case FRAG_RESULT_DATA1:
case FRAG_RESULT_DATA2:
case FRAG_RESULT_DATA3:
+ case FRAG_RESULT_DATA4:
+ case FRAG_RESULT_DATA5:
+ case FRAG_RESULT_DATA6:
+ case FRAG_RESULT_DATA7:
c->output_color_var[var->data.location -
FRAG_RESULT_DATA0] = var;
break;
@@ -4321,7 +4366,11 @@ nir_to_vir(struct v3d_compile *c)
{
switch (c->s->info.stage) {
case MESA_SHADER_FRAGMENT:
- c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ if (c->devinfo->ver < 71)
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ else
+ c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3));
+
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
@@ -4361,8 +4410,13 @@ nir_to_vir(struct v3d_compile *c)
V3D_QPU_WADDR_SYNC));
}
- c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
- c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ if (c->devinfo->ver <= 42) {
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ } else if (c->devinfo->ver >= 71) {
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ }
/* Set up the division between gl_LocalInvocationIndex and
* wg_in_mem in the payload reg.
@@ -4541,8 +4595,8 @@ vir_check_payload_w(struct v3d_compile *c)
vir_for_each_inst_inorder(inst, c) {
for (int i = 0; i < vir_get_nsrc(inst); i++) {
- if (inst->src[i].file == QFILE_REG &&
- inst->src[i].index == 0) {
+ if (inst->src[i].file == c->payload_w.file &&
+ inst->src[i].index == c->payload_w.index) {
c->uses_center_w = true;
return;
}
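/* Illustrative sketch, not part of the upstream patch: the payload layout
 * change the hunks above encode. On V3D 4.x the fragment shader W payload
 * arrives in rf0, while on V3D 7.x it arrives in rf3, because rf0 is now
 * the landing register for the implicit ldvary/ldunif writes. This is also
 * why vir_check_payload_w compares against c->payload_w instead of a
 * hardcoded rf0.
 */
static inline uint32_t
payload_w_reg_index(const struct v3d_device_info *devinfo)
{
        return devinfo->ver < 71 ? 0 : 3;
}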
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 3b32b48f86f..4f767296860 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -155,12 +155,13 @@ static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
enum v3d_qpu_mux mux)
{
+ assert(state->devinfo->ver < 71);
switch (mux) {
case V3D_QPU_MUX_A:
add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
break;
case V3D_QPU_MUX_B:
- if (!n->inst->qpu.sig.small_imm) {
+ if (!n->inst->qpu.sig.small_imm_b) {
add_read_dep(state,
state->last_rf[n->inst->qpu.raddr_b], n);
}
@@ -171,6 +172,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
}
}
+
+static void
+process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
+ uint8_t raddr, bool is_small_imm)
+{
+ assert(state->devinfo->ver >= 71);
+
+ if (!is_small_imm)
+ add_read_dep(state, state->last_rf[raddr], n);
+}
+
static bool
tmu_write_is_sequence_terminator(uint32_t waddr)
{
@@ -285,6 +297,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* If the input and output segments are shared, then all VPM reads to
* a location need to happen before all writes. We handle this by
* serializing all VPM operations for now.
+ *
+ * FIXME: we are assuming that the segments are shared. That is
+ * correct right now because we only ever use shared segments, but
+ * technically it is configurable.
*/
bool separate_vpm_segment = false;
@@ -305,15 +321,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
/* XXX: LOAD_IMM */
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
- process_mux_deps(state, n, inst->alu.add.a);
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
- process_mux_deps(state, n, inst->alu.add.b);
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.add.a.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.add.a.raddr,
+ inst->sig.small_imm_a);
+ }
+ }
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.add.b.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.add.b.raddr,
+ inst->sig.small_imm_b);
+ }
+ }
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
- process_mux_deps(state, n, inst->alu.mul.a);
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
- process_mux_deps(state, n, inst->alu.mul.b);
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.mul.a.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.mul.a.raddr,
+ inst->sig.small_imm_c);
+ }
+ }
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
+ if (devinfo->ver < 71) {
+ process_mux_deps(state, n, inst->alu.mul.b.mux);
+ } else {
+ process_raddr_deps(state, n, inst->alu.mul.b.raddr,
+ inst->sig.small_imm_d);
+ }
+ }
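/* A minimal sketch (the helper name is ours) of the source-to-signal
 * mapping the code above relies on for V3D 7.x: small_imm_a/b mark the
 * add ALU's a/b sources as immediates and small_imm_c/d do the same for
 * the mul ALU.
 */
static bool
source_is_small_imm(const struct v3d_qpu_sig *sig, bool is_add, int src)
{
        if (is_add)
                return src == 0 ? sig->small_imm_a : sig->small_imm_b;
        return src == 0 ? sig->small_imm_c : sig->small_imm_d;
}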
switch (inst->alu.add.op) {
case V3D_QPU_A_VPMSETUP:
@@ -386,6 +426,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
add_write_dep(state, &state->last_r[4], n);
if (v3d_qpu_writes_r5(devinfo, inst))
add_write_dep(state, &state->last_r[5], n);
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst))
+ add_write_dep(state, &state->last_rf[0], n);
/* If we add any more dependencies here we should consider whether we
* also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
@@ -500,6 +542,10 @@ struct choose_scoreboard {
int ldvary_count;
int pending_ldtmu_count;
bool first_ldtmu_after_thrsw;
+
+ /* V3D 7.x */
+ int last_implicit_rf0_write_tick;
+ bool has_rf0_flops_conflict;
};
static bool
@@ -524,7 +570,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard,
}
static bool
-reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
+reads_too_soon(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst, uint8_t raddr)
+{
+ switch (raddr) {
+ case 0: /* ldvary delayed write of C coefficient to rf0 */
+ if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static bool
+reads_too_soon_after_write(const struct v3d_device_info *devinfo,
+ struct choose_scoreboard *scoreboard,
struct qinst *qinst)
{
const struct v3d_qpu_instr *inst = &qinst->qpu;
@@ -536,24 +599,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
if (inst->alu.add.op != V3D_QPU_A_NOP) {
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
- return true;
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr))
+ return true;
+ }
}
- if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
- return true;
+ if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr))
+ return true;
+ }
}
}
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
- return true;
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr))
+ return true;
+ }
}
- if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
- mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
- return true;
+ if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
+ if (devinfo->ver < 71) {
+ if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux))
+ return true;
+ } else {
+ if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
+ return true;
+ }
}
}
@@ -577,6 +660,21 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo,
v3d_qpu_writes_r4(devinfo, inst))
return true;
+ if (devinfo->ver <= 42)
+ return false;
+
+ /* Don't schedule anything that writes rf0 right after ldvary, since
+ * that would clash with the ldvary's delayed rf0 write (the exception
+ * is another ldvary, since its implicit rf0 write would also have
+ * one cycle of delay and would not clash).
+ */
+ if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick &&
+ (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
+ (v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+ !inst->sig.ldvary))) {
+ return true;
+ }
+
return false;
}
@@ -604,29 +702,36 @@ pixel_scoreboard_too_soon(struct v3d_compile *c,
}
static bool
-qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
+qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
uint32_t waddr) {
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return false;
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
- inst->raddr_a == waddr)
- return true;
+ if (devinfo->ver < 71) {
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
+ inst->raddr_a == waddr)
+ return true;
- if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
- !inst->sig.small_imm && (inst->raddr_b == waddr))
- return true;
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
+ !inst->sig.small_imm_b && (inst->raddr_b == waddr))
+ return true;
+ } else {
+ if (v3d71_qpu_reads_raddr(inst, waddr))
+ return true;
+ }
return false;
}
static bool
-mux_read_stalls(struct choose_scoreboard *scoreboard,
- const struct v3d_qpu_instr *inst)
+read_stalls(const struct v3d_device_info *devinfo,
+ struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst)
{
return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
- qpu_instruction_uses_rf(inst,
+ qpu_instruction_uses_rf(devinfo, inst,
scoreboard->last_stallable_sfu_reg);
}
@@ -692,7 +797,8 @@ enum {
V3D_PERIPHERAL_TMU_WAIT = (1 << 6),
V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7),
V3D_PERIPHERAL_TSY = (1 << 8),
- V3D_PERIPHERAL_TLB = (1 << 9),
+ V3D_PERIPHERAL_TLB_READ = (1 << 9),
+ V3D_PERIPHERAL_TLB_WRITE = (1 << 10),
};
static uint32_t
@@ -717,8 +823,10 @@ qpu_peripherals(const struct v3d_device_info *devinfo,
if (v3d_qpu_uses_sfu(inst))
result |= V3D_PERIPHERAL_SFU;
- if (v3d_qpu_uses_tlb(inst))
- result |= V3D_PERIPHERAL_TLB;
+ if (v3d_qpu_reads_tlb(inst))
+ result |= V3D_PERIPHERAL_TLB_READ;
+ if (v3d_qpu_writes_tlb(inst))
+ result |= V3D_PERIPHERAL_TLB_WRITE;
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
@@ -749,32 +857,75 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
if (devinfo->ver < 41)
return false;
- /* V3D 4.1+ allow WRTMUC signal with TMU register write (other than
- * tmuc).
+ /* V3D 4.x can't do more than one peripheral access except in a
+ * few cases:
*/
- if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
- b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
- return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
+ if (devinfo->ver <= 42) {
+ /* WRTMUC signal with TMU register write (other than tmuc). */
+ if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
+ }
+ if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ a_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+ return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
+ }
+
+ /* TMU read with VPM read/write. */
+ if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
+ (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
+ b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+ return true;
+ }
+ if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
+ (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
+ a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+ return true;
+ }
+
+ return false;
}
- if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE &&
- b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) {
- return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
+ /* V3D 7.x can't have more than one of these restricted peripherals */
+ const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE |
+ V3D_PERIPHERAL_TMU_WRTMUC_SIG |
+ V3D_PERIPHERAL_TSY |
+ V3D_PERIPHERAL_TLB_READ |
+ V3D_PERIPHERAL_SFU |
+ V3D_PERIPHERAL_VPM_READ |
+ V3D_PERIPHERAL_VPM_WRITE;
+
+ const uint32_t a_restricted = a_peripherals & restricted;
+ const uint32_t b_restricted = b_peripherals & restricted;
+ if (a_restricted && b_restricted) {
+ /* WRTMUC signal with TMU register write (other than tmuc) is
+ * allowed though.
+ */
+ if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ b_restricted == V3D_PERIPHERAL_TMU_WRITE &&
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
+ (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+ a_restricted == V3D_PERIPHERAL_TMU_WRITE &&
+ v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) {
+ return false;
+ }
}
- /* V3D 4.1+ allows TMU read with VPM read/write. */
- if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
- (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
- b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
- return true;
+ /* Only one TMU read per instruction */
+ if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) &&
+ (b_peripherals & V3D_PERIPHERAL_TMU_READ)) {
+ return false;
}
- if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
- (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
- a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
- return true;
+
+ /* Only one TLB access per instruction */
+ if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
+ V3D_PERIPHERAL_TLB_READ)) &&
+ (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
+ V3D_PERIPHERAL_TLB_READ))) {
+ return false;
}
- return false;
+ return true;
}
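/* A restatement sketch (helper name is ours) of the core V3D 7.x rule
 * above: a merge is rejected whenever each instruction uses some
 * restricted peripheral, with the WRTMUC + non-tmuc TMU register write
 * pairing as the single exception handled in the function.
 */
static bool
restricted_peripheral_conflict(uint32_t a_peripherals,
                               uint32_t b_peripherals,
                               uint32_t restricted)
{
        return (a_peripherals & restricted) && (b_peripherals & restricted);
}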
/* Compute a bitmask of which rf registers are used between
@@ -790,42 +941,67 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
uint64_t raddrs_used = 0;
if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
raddrs_used |= (1ll << a->raddr_a);
- if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
+ if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
raddrs_used |= (1ll << a->raddr_b);
if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
raddrs_used |= (1ll << b->raddr_a);
- if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
+ if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
raddrs_used |= (1ll << b->raddr_b);
return raddrs_used;
}
-/* Take two instructions and attempt to merge their raddr fields
- * into one merged instruction. Returns false if the two instructions
- * access more than two different rf registers between them, or more
- * than one rf register and one small immediate.
+/* Takes two instructions and attempts to merge their raddr fields (including
+ * small immediates) into one merged instruction. For V3D 4.x, it returns
+ * false if the two instructions access more than two different rf registers
+ * between them, or more than one rf register and one small immediate. For
+ * V3D 7.x, it returns false if both instructions use small immediates.
*/
static bool
qpu_merge_raddrs(struct v3d_qpu_instr *result,
const struct v3d_qpu_instr *add_instr,
- const struct v3d_qpu_instr *mul_instr)
+ const struct v3d_qpu_instr *mul_instr,
+ const struct v3d_device_info *devinfo)
{
+ if (devinfo->ver >= 71) {
+ assert(add_instr->sig.small_imm_a +
+ add_instr->sig.small_imm_b <= 1);
+ assert(add_instr->sig.small_imm_c +
+ add_instr->sig.small_imm_d == 0);
+ assert(mul_instr->sig.small_imm_a +
+ mul_instr->sig.small_imm_b == 0);
+ assert(mul_instr->sig.small_imm_c +
+ mul_instr->sig.small_imm_d <= 1);
+
+ result->sig.small_imm_a = add_instr->sig.small_imm_a;
+ result->sig.small_imm_b = add_instr->sig.small_imm_b;
+ result->sig.small_imm_c = mul_instr->sig.small_imm_c;
+ result->sig.small_imm_d = mul_instr->sig.small_imm_d;
+
+ return (result->sig.small_imm_a +
+ result->sig.small_imm_b +
+ result->sig.small_imm_c +
+ result->sig.small_imm_d) <= 1;
+ }
+
+ assert(devinfo->ver <= 42);
+
uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
int naddrs = util_bitcount64(raddrs_used);
if (naddrs > 2)
return false;
- if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
+ if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) {
if (naddrs > 1)
return false;
- if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
+ if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b)
if (add_instr->raddr_b != mul_instr->raddr_b)
return false;
- result->sig.small_imm = true;
- result->raddr_b = add_instr->sig.small_imm ?
+ result->sig.small_imm_b = true;
+ result->raddr_b = add_instr->sig.small_imm_b ?
add_instr->raddr_b : mul_instr->raddr_b;
}
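/* Usage sketch with hypothetical instructions: on V3D 7.x each source
 * slot has its own raddr, so the merge above can only fail when the
 * combined instruction would carry more than one small immediate.
 */
static bool
try_merge_example(const struct v3d_device_info *devinfo,
                  const struct v3d_qpu_instr *add_instr,
                  const struct v3d_qpu_instr *mul_instr)
{
        struct v3d_qpu_instr merged = *add_instr;
        return qpu_merge_raddrs(&merged, add_instr, mul_instr, devinfo);
}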
@@ -836,23 +1012,23 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
raddrs_used &= ~(1ll << raddr_a);
result->raddr_a = raddr_a;
- if (!result->sig.small_imm) {
+ if (!result->sig.small_imm_b) {
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
raddr_a == add_instr->raddr_b) {
- if (add_instr->alu.add.a == V3D_QPU_MUX_B)
- result->alu.add.a = V3D_QPU_MUX_A;
- if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B)
+ result->alu.add.a.mux = V3D_QPU_MUX_A;
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B &&
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
- result->alu.add.b = V3D_QPU_MUX_A;
+ result->alu.add.b.mux = V3D_QPU_MUX_A;
}
}
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
raddr_a == mul_instr->raddr_b) {
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
- result->alu.mul.a = V3D_QPU_MUX_A;
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B)
+ result->alu.mul.a.mux = V3D_QPU_MUX_A;
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B &&
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
- result->alu.mul.b = V3D_QPU_MUX_A;
+ result->alu.mul.b.mux = V3D_QPU_MUX_A;
}
}
}
@@ -863,20 +1039,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
result->raddr_b = raddr_b;
if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
raddr_b == add_instr->raddr_a) {
- if (add_instr->alu.add.a == V3D_QPU_MUX_A)
- result->alu.add.a = V3D_QPU_MUX_B;
- if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
+ if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A)
+ result->alu.add.a.mux = V3D_QPU_MUX_B;
+ if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A &&
v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
- result->alu.add.b = V3D_QPU_MUX_B;
+ result->alu.add.b.mux = V3D_QPU_MUX_B;
}
}
if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
raddr_b == mul_instr->raddr_a) {
- if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
- result->alu.mul.a = V3D_QPU_MUX_B;
- if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
+ if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A)
+ result->alu.mul.a.mux = V3D_QPU_MUX_B;
+ if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A &&
v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
- result->alu.mul.b = V3D_QPU_MUX_B;
+ result->alu.mul.b.mux = V3D_QPU_MUX_B;
}
}
@@ -909,7 +1085,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op)
}
static void
-qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
+qpu_convert_add_to_mul(const struct v3d_device_info *devinfo,
+ struct v3d_qpu_instr *inst)
{
STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
assert(inst->alu.add.op != V3D_QPU_A_NOP);
@@ -927,11 +1104,85 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
inst->flags.auf = V3D_QPU_UF_NONE;
inst->alu.mul.output_pack = inst->alu.add.output_pack;
- inst->alu.mul.a_unpack = inst->alu.add.a_unpack;
- inst->alu.mul.b_unpack = inst->alu.add.b_unpack;
+
+ inst->alu.mul.a.unpack = inst->alu.add.a.unpack;
+ inst->alu.mul.b.unpack = inst->alu.add.b.unpack;
inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
- inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
- inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ if (devinfo->ver >= 71) {
+ assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d);
+ assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1);
+ if (inst->sig.small_imm_a) {
+ inst->sig.small_imm_c = true;
+ inst->sig.small_imm_a = false;
+ } else if (inst->sig.small_imm_b) {
+ inst->sig.small_imm_d = true;
+ inst->sig.small_imm_b = false;
+ }
+ }
+}
+
+static bool
+can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op)
+{
+ switch (op) {
+ case V3D_QPU_M_MOV:
+ case V3D_QPU_M_FMOV:
+ return devinfo->ver >= 71;
+ default:
+ return false;
+ }
+}
+
+static enum v3d_qpu_mul_op
+mul_op_as_add_op(enum v3d_qpu_mul_op op)
+{
+ switch (op) {
+ case V3D_QPU_M_MOV:
+ return V3D_QPU_A_MOV;
+ case V3D_QPU_M_FMOV:
+ return V3D_QPU_A_FMOV;
+ default:
+ unreachable("unexpected mov opcode");
+ }
+}
+
+static void
+qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
+{
+ STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul));
+ assert(inst->alu.mul.op != V3D_QPU_M_NOP);
+ assert(inst->alu.add.op == V3D_QPU_A_NOP);
+
+ memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add));
+ inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op);
+ inst->alu.mul.op = V3D_QPU_M_NOP;
+
+ inst->flags.ac = inst->flags.mc;
+ inst->flags.apf = inst->flags.mpf;
+ inst->flags.auf = inst->flags.muf;
+ inst->flags.mc = V3D_QPU_COND_NONE;
+ inst->flags.mpf = V3D_QPU_PF_NONE;
+ inst->flags.muf = V3D_QPU_UF_NONE;
+
+ inst->alu.add.output_pack = inst->alu.mul.output_pack;
+ inst->alu.add.a.unpack = inst->alu.mul.a.unpack;
+ inst->alu.add.b.unpack = inst->alu.mul.b.unpack;
+ inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+ inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b);
+ assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1);
+ if (inst->sig.small_imm_c) {
+ inst->sig.small_imm_a = true;
+ inst->sig.small_imm_c = false;
+ } else if (inst->sig.small_imm_d) {
+ inst->sig.small_imm_b = true;
+ inst->sig.small_imm_d = false;
+ }
}
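/* Round-trip sketch (illustrative only): converting a MUL-side MOV to
 * the ADD slot and back should preserve the condition flags and keep a
 * small immediate attached to the same logical source (c<->a, d<->b).
 */
static void
convert_round_trip(const struct v3d_device_info *devinfo,
                   struct v3d_qpu_instr *inst)
{
        qpu_convert_mul_to_add(inst);           /* MUL MOV -> ADD MOV */
        qpu_convert_add_to_mul(devinfo, inst);  /* and back again */
}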
static bool
@@ -970,20 +1221,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(b->alu.add.op)) {
mul_inst = *b;
- qpu_convert_add_to_mul(&mul_inst);
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
merge.alu.mul = mul_inst.alu.mul;
- merge.flags.mc = b->flags.ac;
- merge.flags.mpf = b->flags.apf;
- merge.flags.muf = b->flags.auf;
+ merge.flags.mc = mul_inst.flags.mc;
+ merge.flags.mpf = mul_inst.flags.mpf;
+ merge.flags.muf = mul_inst.flags.muf;
add_instr = a;
mul_instr = &mul_inst;
} else if (a->alu.mul.op == V3D_QPU_M_NOP &&
can_do_add_as_mul(a->alu.add.op)) {
mul_inst = *a;
- qpu_convert_add_to_mul(&mul_inst);
+ qpu_convert_add_to_mul(devinfo, &mul_inst);
merge = mul_inst;
merge.alu.add = b->alu.add;
@@ -999,22 +1250,62 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
}
}
+ struct v3d_qpu_instr add_inst;
if (b->alu.mul.op != V3D_QPU_M_NOP) {
- if (a->alu.mul.op != V3D_QPU_M_NOP)
- return false;
- merge.alu.mul = b->alu.mul;
+ if (a->alu.mul.op == V3D_QPU_M_NOP) {
+ merge.alu.mul = b->alu.mul;
+
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
+
+ mul_instr = b;
+ add_instr = a;
+ }
+ /* If a's mul op is used but its add op is not, then see if we
+ * can convert either a's mul op or b's mul op to an add op
+ * so we can merge.
+ */
+ else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, b->alu.mul.op)) {
+ add_inst = *b;
+ qpu_convert_mul_to_add(&add_inst);
- merge.flags.mc = b->flags.mc;
- merge.flags.mpf = b->flags.mpf;
- merge.flags.muf = b->flags.muf;
+ merge.alu.add = add_inst.alu.add;
- mul_instr = b;
- add_instr = a;
+ merge.flags.ac = add_inst.flags.ac;
+ merge.flags.apf = add_inst.flags.apf;
+ merge.flags.auf = add_inst.flags.auf;
+
+ mul_instr = a;
+ add_instr = &add_inst;
+ } else if (a->alu.add.op == V3D_QPU_A_NOP &&
+ can_do_mul_as_add(devinfo, a->alu.mul.op)) {
+ add_inst = *a;
+ qpu_convert_mul_to_add(&add_inst);
+
+ merge = add_inst;
+ merge.alu.mul = b->alu.mul;
+
+ merge.flags.mc = b->flags.mc;
+ merge.flags.mpf = b->flags.mpf;
+ merge.flags.muf = b->flags.muf;
+
+ mul_instr = b;
+ add_instr = &add_inst;
+ } else {
+ return false;
+ }
}
+ /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
+ * they have restrictions on the number of raddrs that can be addressed
+ * in a single instruction. In V3D 7.x, we don't have that restriction,
+ * but we are still limited to a single small immediate per instruction.
+ */
if (add_instr && mul_instr &&
- !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
- return false;
+ !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
+ return false;
}
merge.sig.thrsw |= b->sig.thrsw;
@@ -1025,7 +1316,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
merge.sig.ldtmu |= b->sig.ldtmu;
merge.sig.ldvary |= b->sig.ldvary;
merge.sig.ldvpm |= b->sig.ldvpm;
- merge.sig.small_imm |= b->sig.small_imm;
merge.sig.ldtlb |= b->sig.ldtlb;
merge.sig.ldtlbu |= b->sig.ldtlbu;
merge.sig.ucb |= b->sig.ucb;
@@ -1108,7 +1398,7 @@ retry:
* regfile A or B that was written to by the previous
* instruction."
*/
- if (reads_too_soon_after_write(scoreboard, n->inst))
+ if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
continue;
if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
@@ -1122,10 +1412,11 @@ retry:
if (pixel_scoreboard_too_soon(c, scoreboard, inst))
continue;
- /* ldunif and ldvary both write r5, but ldunif does so a tick
- * sooner. If the ldvary's r5 wasn't used, then ldunif might
+ /* ldunif and ldvary both write the same register (r5 for v42
+ * and below, rf0 for v71), but ldunif does so a tick sooner.
+ * If the ldvary's register wasn't used, then ldunif might
* otherwise get scheduled so ldunif and ldvary try to update
- * r5 in the same tick.
+ * the register in the same tick.
*/
if ((inst->sig.ldunif || inst->sig.ldunifa) &&
scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
@@ -1204,11 +1495,20 @@ retry:
* ldvary now if the follow-up fixup would place
* it in the delay slots of a thrsw, which is not
* allowed and would prevent the fixup from being
- * successful.
+ * successful. In V3D 7.x we can allow this to happen
+ * as long as it is not the last delay slot.
*/
- if (inst->sig.ldvary &&
- scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
- continue;
+ if (inst->sig.ldvary) {
+ if (c->devinfo->ver <= 42 &&
+ scoreboard->last_thrsw_tick + 2 >=
+ scoreboard->tick - 1) {
+ continue;
+ }
+ if (c->devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 ==
+ scoreboard->tick - 1) {
+ continue;
+ }
}
/* We can emit a new tmu lookup with a previous ldtmu
@@ -1243,7 +1543,7 @@ retry:
int prio = get_instruction_priority(c->devinfo, inst);
- if (mux_read_stalls(scoreboard, inst)) {
+ if (read_stalls(c->devinfo, scoreboard, inst)) {
/* Don't merge an instruction that stalls */
if (prev_inst)
continue;
@@ -1340,6 +1640,62 @@ update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
}
}
+static void
+set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst,
+ const struct v3d_device_info *devinfo)
+{
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick &&
+ v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
+ !inst->sig_magic) {
+ scoreboard->has_rf0_flops_conflict = true;
+ }
+}
+
+static void
+update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst,
+ const struct v3d_device_info *devinfo)
+{
+ if (devinfo->ver < 71)
+ return;
+
+ /* Thread switch restrictions:
+ *
+ * At the point of a thread switch or thread end (when the actual
+ * thread switch or thread end happens, not when the signalling
+ * instruction is processed):
+ *
+ * - If the most recent write to rf0 was from a ldunif, ldunifa, or
+ * ldvary instruction in which another signal also wrote to the
+ * register file, and the final instruction of the thread section
+ * contained a signal which wrote to the register file, then the
+ * value of rf0 is undefined at the start of the new section
+ *
+ * Here we use the scoreboard to track if our last rf0 implicit write
+ * happens at the same time that another signal writes the register
+ * file (has_rf0_flops_conflict). We will use that information when
+ * scheduling thrsw instructions to avoid putting anything in their
+ * last delay slot which has a signal that writes to the register file.
+ */
+
+ /* Reset tracking if we have an explicit rf0 write or we are starting
+ * a new thread section.
+ */
+ if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
+ scoreboard->tick - scoreboard->last_thrsw_tick == 3) {
+ scoreboard->last_implicit_rf0_write_tick = -10;
+ scoreboard->has_rf0_flops_conflict = false;
+ }
+
+ if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) {
+ scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ?
+ scoreboard->tick + 1 : scoreboard->tick;
+ }
+
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+}
+
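/* A small sketch of the timing assumption encoded above: ldvary performs
 * its implicit rf0 write one tick after the instruction that carries the
 * signal, while ldunif/ldunifa write rf0 in the same tick.
 */
static int
implicit_rf0_write_tick(const struct v3d_qpu_instr *inst, int tick)
{
        return inst->sig.ldvary ? tick + 1 : tick;
}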
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
const struct qinst *qinst,
@@ -1383,6 +1739,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
if (inst->sig.ldvary)
scoreboard->last_ldvary_tick = scoreboard->tick;
+ update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo);
+
update_scoreboard_tmu_tracking(scoreboard, qinst);
}
@@ -1580,7 +1938,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
if (slot > 0 && qinst->uniform != ~0)
return false;
- if (v3d_qpu_waits_vpm(inst))
+ if (c->devinfo->ver <= 42 && v3d_qpu_waits_vpm(inst))
return false;
if (inst->sig.ldvary)
@@ -1588,35 +1946,67 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
/* GFXH-1625: TMUWT not allowed in the final instruction. */
- if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
+ if (c->devinfo->ver <= 42 && slot == 2 &&
+ inst->alu.add.op == V3D_QPU_A_TMUWT) {
return false;
+ }
- /* No writing physical registers at the end. */
- bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
- bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
- if ((!add_is_nop && !inst->alu.add.magic_write) ||
- (!mul_is_nop && !inst->alu.mul.magic_write)) {
- return false;
+ if (c->devinfo->ver <= 42) {
+ /* No writing physical registers at the end. */
+ bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
+ bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
+ if ((!add_is_nop && !inst->alu.add.magic_write) ||
+ (!mul_is_nop && !inst->alu.mul.magic_write)) {
+ return false;
+ }
+
+ if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
+ !inst->sig_magic) {
+ return false;
+ }
}
- if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
- !inst->sig_magic) {
- return false;
+ if (c->devinfo->ver >= 71) {
+ /* The thread end instruction must not write to the
+ * register file via the add/mul ALUs.
+ */
+ if (slot == 0 &&
+ (!inst->alu.add.magic_write ||
+ !inst->alu.mul.magic_write)) {
+ return false;
+ }
}
if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
return false;
- /* RF0-2 might be overwritten during the delay slots by
- * fragment shader setup.
- */
- if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
- return false;
+ if (c->devinfo->ver <= 42) {
+ /* RF0-2 might be overwritten during the delay slots by
+ * fragment shader setup.
+ */
+ if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
+ return false;
- if (inst->raddr_b < 3 &&
- !inst->sig.small_imm &&
- v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
- return false;
+ if (inst->raddr_b < 3 &&
+ !inst->sig.small_imm_b &&
+ v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
+ return false;
+ }
+ }
+
+ if (c->devinfo->ver >= 71) {
+ /* RF2-3 might be overwritten during the delay slots by
+ * fragment shader setup.
+ */
+ if (v3d71_qpu_reads_raddr(inst, 2) ||
+ v3d71_qpu_reads_raddr(inst, 3)) {
+ return false;
+ }
+
+ if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) ||
+ v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) {
+ return false;
+ }
}
}
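/* Consolidation sketch (helper name is ours) of the V3D 7.x checks
 * above: rf2/rf3 may be overwritten by fragment shader setup during the
 * thread-end delay slots, so no slot may read or explicitly write them.
 */
static bool
thrend_touches_rf2_rf3(const struct v3d_device_info *devinfo,
                       const struct v3d_qpu_instr *inst)
{
        return v3d71_qpu_reads_raddr(inst, 2) ||
               v3d71_qpu_reads_raddr(inst, 3) ||
               v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 2) ||
               v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 3);
}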
@@ -1632,6 +2022,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
*/
static bool
qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
+ struct choose_scoreboard *scoreboard,
const struct qinst *qinst,
uint32_t slot)
{
@@ -1642,8 +2033,12 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
if (slot > 0 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu))
return false;
- if (slot > 0 && qinst->qpu.sig.ldvary)
- return false;
+ if (qinst->qpu.sig.ldvary) {
+ if (c->devinfo->ver <= 42 && slot > 0)
+ return false;
+ if (c->devinfo->ver >= 71 && slot == 2)
+ return false;
+ }
/* unifa and the following 3 instructions can't overlap a
* thread switch/end. The docs further clarify that this means
@@ -1662,6 +2057,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
return false;
+ /* See comment when we set has_rf0_flops_conflict for details */
+ if (c->devinfo->ver >= 71 &&
+ slot == 2 &&
+ v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) &&
+ !qinst->qpu.sig_magic) {
+ if (scoreboard->has_rf0_flops_conflict)
+ return false;
+ if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick)
+ return false;
+ }
+
return true;
}
@@ -1694,7 +2100,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
* also apply to instructions scheduled after the thrsw that we want
* to place in its delay slots.
*/
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot))
return false;
/* TLB access is disallowed until scoreboard wait is executed, which
@@ -1767,8 +2173,10 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard
bool is_thrend)
{
for (int slot = 0; slot < instructions_in_sequence; slot++) {
- if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
+ if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard,
+ qinst, slot)) {
return false;
+ }
if (is_thrend &&
!qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
@@ -1969,10 +2377,11 @@ emit_branch(struct v3d_compile *c,
assert(scoreboard->last_branch_tick + 3 < branch_tick);
assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
- /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
+ /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after
* setmsf.
*/
bool is_safe_msf_branch =
+ c->devinfo->ver >= 71 ||
inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
@@ -2056,46 +2465,72 @@ emit_branch(struct v3d_compile *c,
}
static bool
-alu_reads_register(struct v3d_qpu_instr *inst,
+alu_reads_register(const struct v3d_device_info *devinfo,
+ struct v3d_qpu_instr *inst,
bool add, bool magic, uint32_t index)
{
uint32_t num_src;
- enum v3d_qpu_mux mux_a, mux_b;
-
- if (add) {
+ if (add)
num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
- mux_a = inst->alu.add.a;
- mux_b = inst->alu.add.b;
- } else {
+ else
num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
- mux_a = inst->alu.mul.a;
- mux_b = inst->alu.mul.b;
- }
- for (int i = 0; i < num_src; i++) {
- if (magic) {
- if (i == 0 && mux_a == index)
- return true;
- if (i == 1 && mux_b == index)
- return true;
+ if (devinfo->ver <= 42) {
+ enum v3d_qpu_mux mux_a, mux_b;
+ if (add) {
+ mux_a = inst->alu.add.a.mux;
+ mux_b = inst->alu.add.b.mux;
} else {
- if (i == 0 && mux_a == V3D_QPU_MUX_A &&
- inst->raddr_a == index) {
- return true;
- }
- if (i == 0 && mux_a == V3D_QPU_MUX_B &&
- inst->raddr_b == index) {
- return true;
- }
- if (i == 1 && mux_b == V3D_QPU_MUX_A &&
- inst->raddr_a == index) {
- return true;
- }
- if (i == 1 && mux_b == V3D_QPU_MUX_B &&
- inst->raddr_b == index) {
- return true;
+ mux_a = inst->alu.mul.a.mux;
+ mux_b = inst->alu.mul.b.mux;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (magic) {
+ if (i == 0 && mux_a == index)
+ return true;
+ if (i == 1 && mux_b == index)
+ return true;
+ } else {
+ if (i == 0 && mux_a == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 0 && mux_a == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_A &&
+ inst->raddr_a == index) {
+ return true;
+ }
+ if (i == 1 && mux_b == V3D_QPU_MUX_B &&
+ inst->raddr_b == index) {
+ return true;
+ }
}
}
+
+ return false;
+ }
+
+ assert(devinfo->ver >= 71);
+ assert(!magic);
+
+ uint32_t raddr_a, raddr_b;
+ if (add) {
+ raddr_a = inst->alu.add.a.raddr;
+ raddr_b = inst->alu.add.b.raddr;
+ } else {
+ raddr_a = inst->alu.mul.a.raddr;
+ raddr_b = inst->alu.mul.b.raddr;
+ }
+
+ for (int i = 0; i < num_src; i++) {
+ if (i == 0 && raddr_a == index)
+ return true;
+ if (i == 1 && raddr_b == index)
+ return true;
}
return false;
@@ -2130,6 +2565,8 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
struct qblock *block,
struct v3d_qpu_instr *inst)
{
+ const struct v3d_device_info *devinfo = c->devinfo;
+
/* We only call this if we have successfully merged an ldvary into a
* previous instruction.
*/
@@ -2142,9 +2579,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
* the ldvary destination, if it does, then moving the ldvary before
* it would overwrite it.
*/
- if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
+ if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
return false;
- if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
+ if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
return false;
/* The implicit ldvary destination may not be written to by a signal
@@ -2180,13 +2617,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
}
/* The previous instruction cannot have a conflicting signal */
- if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
+ if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
return false;
uint32_t sig;
struct v3d_qpu_sig new_sig = prev->qpu.sig;
new_sig.ldvary = true;
- if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig))
+ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
return false;
/* The previous instruction cannot use flags since ldvary uses the
@@ -2199,9 +2636,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
/* We can't put an ldvary in the delay slots of a thrsw. We should've
* prevented this when pairing up the ldvary with another instruction
- * and flagging it for a fixup.
+ * and flagging it for a fixup. In V3D 7.x this is limited only to the
+ * second delay slot.
*/
- assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
+ assert((devinfo->ver <= 42 &&
+ scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
+ (devinfo->ver >= 71 &&
+ scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
/* Move the ldvary to the previous instruction and remove it from the
* current one.
@@ -2215,14 +2656,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
inst->sig_magic = false;
inst->sig_addr = 0;
- /* By moving ldvary to the previous instruction we make it update
- * r5 in the current one, so nothing else in it should write r5.
- * This should've been prevented by our dependency tracking, which
+ /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
+ if (devinfo->ver >= 71) {
+ scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
+ set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
+ }
+
+ /* By moving ldvary to the previous instruction we make it update r5
+ * (rf0 for ver >= 71) in the current one, so nothing else in it
+ * should write this register.
+ *
+ * This should've been prevented by our dependency tracking, which
* would not allow ldvary to be paired up with an instruction that
- * writes r5 (since our dependency tracking doesn't know that the
- * ldvary write r5 happens in the next instruction).
+ * writes r5/rf0 (since our dependency tracking doesn't know that the
+ * ldvary write to r5/rf0 happens in the next instruction).
*/
- assert(!v3d_qpu_writes_r5(c->devinfo, inst));
+ assert(!v3d_qpu_writes_r5(devinfo, inst));
+ assert(devinfo->ver <= 42 ||
+ (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
+ !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));
return true;
}
@@ -2313,7 +2765,7 @@ schedule_instructions(struct v3d_compile *c,
}
}
}
- if (mux_read_stalls(scoreboard, inst))
+ if (read_stalls(c->devinfo, scoreboard, inst))
c->qpu_inst_stalled_count++;
}
@@ -2538,6 +2990,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
scoreboard.last_setmsf_tick = -10;
scoreboard.last_stallable_sfu_tick = -10;
scoreboard.first_ldtmu_after_thrsw = true;
+ scoreboard.last_implicit_rf0_write_tick = -10;
if (debug) {
fprintf(stderr, "Pre-schedule instructions\n");
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
index 2cc7a0eb0ae..0466ee5d0b6 100644
--- a/src/broadcom/compiler/qpu_validate.c
+++ b/src/broadcom/compiler/qpu_validate.c
@@ -41,6 +41,7 @@ struct v3d_qpu_validate_state {
int last_sfu_write;
int last_branch_ip;
int last_thrsw_ip;
+ int first_tlb_z_write;
/* Set when we've found the last-THRSW signal, or if we were started
* in single-segment mode.
@@ -110,11 +111,58 @@ static void
qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
{
const struct v3d_device_info *devinfo = state->c->devinfo;
+
+ if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write)
+ state->first_tlb_z_write = state->ip;
+
const struct v3d_qpu_instr *inst = &qinst->qpu;
+ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
+ state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write &&
+ inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
+ inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
+ fail_instr(state, "Implicit branch MSF read after TLB Z write");
+ }
+
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return;
+ if (inst->alu.add.op == V3D_QPU_A_SETMSF &&
+ state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write) {
+ fail_instr(state, "SETMSF after TLB Z write");
+ }
+
+ if (state->first_tlb_z_write >= 0 &&
+ state->ip > state->first_tlb_z_write &&
+ inst->alu.add.op == V3D_QPU_A_MSF) {
+ fail_instr(state, "MSF read after TLB Z write");
+ }
+
+ if (devinfo->ver < 71) {
+ if (inst->sig.small_imm_a || inst->sig.small_imm_c ||
+ inst->sig.small_imm_d) {
+ fail_instr(state, "small imm a/c/d added after V3D 7.1");
+ }
+ } else {
+ if ((inst->sig.small_imm_a || inst->sig.small_imm_b) &&
+ !vir_is_add(qinst)) {
+ fail_instr(state, "small imm a/b used but no ADD inst");
+ }
+ if ((inst->sig.small_imm_c || inst->sig.small_imm_d) &&
+ !vir_is_mul(qinst)) {
+ fail_instr(state, "small imm c/d used but no MUL inst");
+ }
+ if (inst->sig.small_imm_a + inst->sig.small_imm_b +
+ inst->sig.small_imm_c + inst->sig.small_imm_d > 1) {
+ fail_instr(state, "only one small immediate can be "
+ "enabled per instruction");
+ }
+ }
+
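/* Equivalent helper sketch for the sum used above: on V3D 7.1 at most
 * one of the four per-slot small-immediate signals may be set.
 */
static inline int
num_small_immediates(const struct v3d_qpu_sig *sig)
{
        return sig->small_imm_a + sig->small_imm_b +
               sig->small_imm_c + sig->small_imm_d;
}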
/* LDVARY writes r5 two instructions later and LDUNIF writes
* r5 one instruction later, which is illegal to have
* together.
@@ -194,8 +242,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
"SFU write started during THRSW delay slots ");
}
- if (inst->sig.ldvary)
- fail_instr(state, "LDVARY during THRSW delay slots");
+ if (inst->sig.ldvary) {
+ if (devinfo->ver <= 42)
+ fail_instr(state, "LDVARY during THRSW delay slots");
+ if (devinfo->ver >= 71 &&
+ state->ip - state->last_thrsw_ip == 2) {
+ fail_instr(state, "LDVARY in 2nd THRSW delay slot");
+ }
+ }
}
(void)qpu_magic_waddr_matches; /* XXX */
@@ -222,7 +276,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
vpm_writes +
tlb_writes +
tsy_writes +
- inst->sig.ldtmu +
+ (devinfo->ver <= 42 ? inst->sig.ldtmu : 0) +
inst->sig.ldtlb +
inst->sig.ldvpm +
inst->sig.ldtlbu > 1) {
@@ -262,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if ((inst->alu.add.op != V3D_QPU_A_NOP &&
!inst->alu.add.magic_write)) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver <= 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71) {
+ if (state->last_thrsw_ip - state->ip == 0) {
+ fail_instr(state,
+ "ADD RF write at THREND");
+ }
+ if (inst->alu.add.waddr == 2 ||
+ inst->alu.add.waddr == 3) {
+ fail_instr(state,
+ "RF2-3 write after THREND");
+ }
+ }
}
if ((inst->alu.mul.op != V3D_QPU_M_NOP &&
!inst->alu.mul.magic_write)) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver <= 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71) {
+ if (state->last_thrsw_ip - state->ip == 0) {
+ fail_instr(state,
+ "MUL RF write at THREND");
+ }
+
+ if (inst->alu.mul.waddr == 2 ||
+ inst->alu.mul.waddr == 3) {
+ fail_instr(state,
+ "RF2-3 write after THREND");
+ }
+ }
}
if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
!inst->sig_magic) {
- fail_instr(state, "RF write after THREND");
+ if (devinfo->ver <= 42) {
+ fail_instr(state, "RF write after THREND");
+ } else if (devinfo->ver >= 71 &&
+ (inst->sig_addr == 2 ||
+ inst->sig_addr == 3)) {
+ fail_instr(state, "RF2-3 write after THREND");
+ }
}
/* GFXH-1625: No TMUWT in the last instruction */
@@ -321,6 +406,7 @@ qpu_validate(struct v3d_compile *c)
.last_sfu_write = -10,
.last_thrsw_ip = -10,
.last_branch_ip = -10,
+ .first_tlb_z_write = INT_MAX,
.ip = 0,
.last_thrsw_found = !c->last_thrsw,
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 9f4129870e1..b437b5f5168 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -613,6 +613,11 @@ struct v3d_ra_node_info {
struct {
uint32_t priority;
uint8_t class_bits;
+ bool is_program_end;
+ bool unused;
+
+ /* V3D 7.x */
+ bool is_ldunif_dst;
} *info;
uint32_t alloc_count;
};
@@ -1150,8 +1155,8 @@ bool vir_is_raw_mov(struct qinst *inst);
bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst);
bool vir_is_add(struct qinst *inst);
bool vir_is_mul(struct qinst *inst);
-bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
-bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
+bool vir_writes_r3_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst);
+bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst);
struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
uint8_t vir_channels_written(struct qinst *inst);
struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
@@ -1184,7 +1189,7 @@ bool v3d_nir_lower_line_smooth(nir_shader *shader);
bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
bool v3d_nir_lower_scratch(nir_shader *s);
bool v3d_nir_lower_txf_ms(nir_shader *s);
-bool v3d_nir_lower_image_load_store(nir_shader *s);
+bool v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c);
bool v3d_nir_lower_load_store_bitsize(nir_shader *s);
void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components);
@@ -1425,6 +1430,20 @@ VIR_SFU(LOG)
VIR_SFU(SIN)
VIR_SFU(RSQRT2)
+VIR_A_ALU2(VPACK)
+VIR_A_ALU2(V8PACK)
+VIR_A_ALU2(V10PACK)
+VIR_A_ALU2(V11FPACK)
+
+VIR_M_ALU1(FTOUNORM16)
+VIR_M_ALU1(FTOSNORM16)
+
+VIR_M_ALU1(VFTOUNORM8)
+VIR_M_ALU1(VFTOSNORM8)
+
+VIR_M_ALU1(VFTOUNORM10LO)
+VIR_M_ALU1(VFTOUNORM10HI)
+
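/* Illustrative dispatch (the wrapper itself is hypothetical; the helpers
 * are the ones declared above) showing how ntq_emit_alu reaches the new
 * V3D 7.x pack operations from their NIR opcodes.
 */
static inline struct qreg
vir_emit_v71_pack(struct v3d_compile *c, nir_op op,
                  struct qreg a, struct qreg b)
{
        switch (op) {
        case nir_op_vpack_v3d:    return vir_VPACK(c, a, b);
        case nir_op_v8pack_v3d:   return vir_V8PACK(c, a, b);
        case nir_op_v10pack_v3d:  return vir_V10PACK(c, a, b);
        case nir_op_v11fpack_v3d: return vir_V11FPACK(c, a, b);
        default: unreachable("not a v71 pack opcode");
        }
}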
static inline struct qinst *
vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
struct qreg dest, struct qreg src)
diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
index 2900a29817f..bbb55be4a14 100644
--- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
+++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
@@ -40,6 +40,10 @@
* calculations and load/store using the TMU general memory access path.
*/
+static const unsigned bits_8[4] = {8, 8, 8, 8};
+static const unsigned bits_16[4] = {16, 16, 16, 16};
+static const unsigned bits_1010102[4] = {10, 10, 10, 2};
+
bool
v3d_gl_format_is_return_32(enum pipe_format format)
{
@@ -59,6 +63,8 @@ v3d_gl_format_is_return_32(enum pipe_format format)
/* Packs a 32-bit vector of colors in the range [0, (1 << bits[i]) - 1] to a
* 32-bit SSA value, with as many channels as necessary to store all the bits
+ *
+ * This is the generic helper, using only common nir operations.
*/
static nir_ssa_def *
pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
@@ -91,8 +97,185 @@ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
return nir_vec(b, results, DIV_ROUND_UP(offset, 32));
}
+/* Utility wrapper, as half_2x16_split is mapped to vfpack and sometimes it
+ * is just easier to read vfpack in the code, especially while using the PRM
+ * as a reference.
+ */
+static nir_ssa_def *
+nir_vfpack(nir_builder *b, nir_ssa_def *p1, nir_ssa_def *p2)
+{
+ return nir_pack_half_2x16_split(b, p1, p2);
+}
+
+static inline nir_ssa_def *
+pack_11f11f10f(nir_builder *b, nir_ssa_def *color)
+{
+ nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ /* FIXME: we noted that we could just use p2 again as the second
+ * element to pack, and the CTS tests still pass. Using undef is
+ * just slightly more correct.
+ */
+ nir_ssa_def *undef = nir_ssa_undef(b, 1, color->bit_size);
+ nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef);
+
+ return nir_v11fpack_v3d(b, p1, p2);
+}
+
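/* Layout reference for the helper above, following the standard
 * PIPE_FORMAT_R11G11B10_FLOAT packing: R occupies bits 0-10, G bits
 * 11-21 and B bits 22-31, which is what v11fpack assembles from the two
 * half-float pairs built with vfpack.
 */
static inline uint32_t
r11g11b10_layout(uint32_t r11f, uint32_t g11f, uint32_t b10f)
{
        return (b10f << 22) | (g11f << 11) | r11f;
}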
+static inline nir_ssa_def *
+pack_r10g10b10a2_uint(nir_builder *b, nir_ssa_def *color)
+{
+ nir_ssa_def *p1 = nir_vpack_v3d(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ nir_ssa_def *p2 = nir_vpack_v3d(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+
+ return nir_v10pack_v3d(b, p1, p2);
+}
+
+static inline nir_ssa_def *
+pack_r10g10b10a2_unorm(nir_builder *b, nir_ssa_def *color)
+{
+ nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, 1));
+ p1 = nir_vftounorm10lo_v3d(b, p1);
+
+ nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ p2 = nir_vftounorm10hi_v3d(b, p2);
+
+ return nir_v10pack_v3d(b, p1, p2);
+}
+
+enum hw_conversion {
+ NONE,
+ TO_SNORM,
+ TO_UNORM
+};
+
+static inline nir_ssa_def *
+pack_8bit(nir_builder *b, nir_ssa_def *color,
+ unsigned num_components,
+ enum hw_conversion conversion)
+{
+ /* Note that usually you should not use this method (that relies on
+ * custom packing) for 1 component if we are not doing any
+ * conversion. But we also support that case, and let the caller
+ * decide which method to use.
+ */
+ nir_ssa_def *p1;
+ nir_ssa_def *p2;
+
+ if (conversion == NONE) {
+ p1 = nir_vpack_v3d(b, nir_channel(b, color, 0),
+ nir_channel(b, color, num_components == 1 ? 0 : 1));
+ } else {
+ p1 = nir_vfpack(b, nir_channel(b, color, 0),
+ nir_channel(b, color, num_components == 1 ? 0 : 1));
+ p1 = (conversion == TO_UNORM) ?
+ nir_vftounorm8_v3d(b, p1) : nir_vftosnorm8_v3d(b, p1);
+ }
+ if (num_components == 4) {
+ if (conversion == NONE) {
+ p2 = nir_vpack_v3d(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ } else {
+ p2 = nir_vfpack(b, nir_channel(b, color, 2),
+ nir_channel(b, color, 3));
+ p2 = (conversion == TO_UNORM) ?
+ nir_vftounorm8_v3d(b, p2) : nir_vftosnorm8_v3d(b, p2);
+ }
+ } else {
+ /* As mentioned in the comment above, using an undef here
+ * would be more correct. But in this case we get worse
+ * values, and even a slightly worse instruction count in
+ * some CTS tests, so we just reuse the first packing.
+ */
+ p2 = p1;
+ }
+
+ return nir_v8pack_v3d(b, p1, p2);
+}
+
+static inline nir_ssa_def *
+pack_16bit(nir_builder *b, nir_ssa_def *color,
+ unsigned num_components,
+ enum hw_conversion conversion)
+{
+ nir_ssa_def *results[2];
+ nir_ssa_def *channels[4];
+
+ /* Note that usually you should not use this method (that relies on
+ * custom packing) if we are not doing any conversion. But we also
+ * support that case, and let the caller decide which method to use.
+ */
+
+ for (unsigned i = 0; i < num_components; i++) {
+ channels[i] = nir_channel(b, color, i);
+ switch (conversion) {
+ case TO_SNORM:
+ channels[i] = nir_ftosnorm16_v3d(b, channels[i]);
+ break;
+ case TO_UNORM:
+ channels[i] = nir_ftounorm16_v3d(b, channels[i]);
+ break;
+ default:
+ break;
+ }
+ }
+
+ switch (num_components) {
+ case 1:
+ results[0] = channels[0];
+ break;
+ case 4:
+ results[1] = nir_vpack_v3d(b, channels[2], channels[3]);
+ FALLTHROUGH;
+ case 2:
+ results[0] = nir_vpack_v3d(b, channels[0], channels[1]);
+ break;
+ }
+
+ return nir_vec(b, results, DIV_ROUND_UP(num_components, 2));
+}
+
+static inline nir_ssa_def *
+pack_xbit(nir_builder *b, nir_ssa_def *color,
+ unsigned num_components,
+ const struct util_format_channel_description *r_chan)
+{
+ bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED);
+ enum hw_conversion conversion = NONE;
+ if (r_chan->normalized) {
+ conversion =
+ (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM;
+ }
+
+ switch (r_chan->size) {
+ case 8:
+ if (conversion == NONE && num_components < 2)
+ return pack_bits(b, color, bits_8, num_components, pack_mask);
+ else
+ return pack_8bit(b, color, num_components, conversion);
+ break;
+ case 16:
+ /* pack_mask implies that the generic packing method would
+ * need to include extra operations to handle negative values,
+ * so in that case, even without a conversion, it is better to
+ * use the custom hw packing operations.
+ */
+ if (conversion == NONE && !pack_mask)
+ return pack_bits(b, color, bits_16, num_components, pack_mask);
+ else
+ return pack_16bit(b, color, num_components, conversion);
+ break;
+ default:
+ unreachable("unrecognized bits");
+ }
+}
+
static bool
-v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
+v3d_nir_lower_image_store_v42(nir_builder *b, nir_intrinsic_instr *instr)
{
enum pipe_format format = nir_intrinsic_format(instr);
assert(format != PIPE_FORMAT_NONE);
@@ -118,9 +301,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
*/
formatted = color;
} else {
- static const unsigned bits_8[4] = {8, 8, 8, 8};
- static const unsigned bits_16[4] = {16, 16, 16, 16};
- static const unsigned bits_1010102[4] = {10, 10, 10, 2};
const unsigned *bits;
switch (r_chan->size) {
@@ -171,6 +351,52 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
return true;
}
+
+static bool
+v3d_nir_lower_image_store_v71(nir_builder *b, nir_intrinsic_instr *instr)
+{
+ enum pipe_format format = nir_intrinsic_format(instr);
+ assert(format != PIPE_FORMAT_NONE);
+ const struct util_format_description *desc =
+ util_format_description(format);
+ const struct util_format_channel_description *r_chan = &desc->channel[0];
+ unsigned num_components = util_format_get_nr_components(format);
+ b->cursor = nir_before_instr(&instr->instr);
+
+ nir_ssa_def *color = nir_channels(b,
+ nir_ssa_for_src(b, instr->src[3], 4),
+ (1 << num_components) - 1);
+ nir_ssa_def *formatted = NULL;
+ if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+ formatted = nir_format_pack_r9g9b9e5(b, color);
+ } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
+ formatted = pack_11f11f10f(b, color);
+ } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) {
+ formatted = pack_r10g10b10a2_uint(b, color);
+ } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) {
+ formatted = pack_r10g10b10a2_unorm(b, color);
+ } else if (r_chan->size == 32) {
+ /* For 32-bit formats, we just have to move the vector
+ * across (possibly reducing the number of channels).
+ */
+ formatted = color;
+ } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) {
+ assert(r_chan->size == 16);
+ formatted = nir_format_float_to_half(b, color);
+ formatted = pack_bits(b, formatted, bits_16, num_components,
+ false);
+ } else {
+ assert(r_chan->size == 8 || r_chan->size == 16);
+ formatted = pack_xbit(b, color, num_components, r_chan);
+ }
+
+ nir_instr_rewrite_src(&instr->instr, &instr->src[3],
+ nir_src_for_ssa(formatted));
+ instr->num_components = formatted->num_components;
+
+ return true;
+}
+
static bool
v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr)
{
@@ -215,11 +441,17 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b,
nir_intrinsic_instr *intr =
nir_instr_as_intrinsic(instr);
+ struct v3d_compile *c = (struct v3d_compile *) _state;
+
switch (intr->intrinsic) {
case nir_intrinsic_image_load:
return v3d_nir_lower_image_load(b, intr);
case nir_intrinsic_image_store:
- return v3d_nir_lower_image_store(b, intr);
+ if (c->devinfo->ver >= 71)
+ return v3d_nir_lower_image_store_v71(b, intr);
+ else
+ return v3d_nir_lower_image_store_v42(b, intr);
+ break;
default:
return false;
}
@@ -228,9 +460,9 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b,
}
bool
-v3d_nir_lower_image_load_store(nir_shader *s)
+v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c)
{
return nir_shader_instructions_pass(s, v3d_nir_lower_image_load_store_cb,
nir_metadata_block_index |
- nir_metadata_dominance, NULL);
+ nir_metadata_dominance, c);
}
diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c
index 69929a145aa..8a314c8b5a9 100644
--- a/src/broadcom/compiler/v3d_nir_lower_io.c
+++ b/src/broadcom/compiler/v3d_nir_lower_io.c
@@ -600,9 +600,13 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
* The correct fix for this as recommended by Broadcom
* is to convert to .8 fixed-point with ffloor().
*/
- pos = nir_f2i32(b, nir_ffloor(b, pos));
- v3d_nir_store_output(b, state->vp_vpm_offset + i,
- offset_reg, pos);
+ if (c->devinfo->ver <= 42)
+ pos = nir_f2i32(b, nir_ffloor(b, pos));
+ else
+ pos = nir_f2i32(b, nir_fround_even(b, pos));
+
+ v3d_nir_store_output(b, state->vp_vpm_offset + i,
+ offset_reg, pos);
}
}
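Both branches above convert the coordinate to .8 fixed point; the only difference is the rounding step (ffloor up to 4.2, round-to-nearest-even from 7.1). A standalone sketch with hypothetical helper names, assuming pos has already been scaled by 256:

#include <math.h>
#include <stdint.h>

/* Only the final rounding differs between the two paths. */
static int32_t
fixed8_v42(float pos)
{
        return (int32_t)floorf(pos);        /* ffloor, per Broadcom */
}

static int32_t
fixed8_v71(float pos)
{
        /* nearbyintf rounds to nearest even in the default FP
         * environment, matching nir_fround_even.
         */
        return (int32_t)nearbyintf(pos);
}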
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 499215454c0..192872f368c 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -113,10 +113,10 @@ vir_is_raw_mov(struct qinst *inst)
return false;
}
- if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
- inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
+ if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) {
return false;
}
@@ -156,8 +156,12 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
}
bool
-vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
+vir_writes_r3_implicitly(const struct v3d_device_info *devinfo,
+ struct qinst *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
for (int i = 0; i < vir_get_nsrc(inst); i++) {
switch (inst->src[i].file) {
case QFILE_VPM:
@@ -178,8 +182,12 @@ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
}
bool
-vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
+vir_writes_r4_implicitly(const struct v3d_device_info *devinfo,
+ struct qinst *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
switch (inst->dst.file) {
case QFILE_MAGIC:
switch (inst->dst.index) {
@@ -209,15 +217,15 @@ vir_set_unpack(struct qinst *inst, int src,
if (vir_is_add(inst)) {
if (src == 0)
- inst->qpu.alu.add.a_unpack = unpack;
+ inst->qpu.alu.add.a.unpack = unpack;
else
- inst->qpu.alu.add.b_unpack = unpack;
+ inst->qpu.alu.add.b.unpack = unpack;
} else {
assert(vir_is_mul(inst));
if (src == 0)
- inst->qpu.alu.mul.a_unpack = unpack;
+ inst->qpu.alu.mul.a.unpack = unpack;
else
- inst->qpu.alu.mul.b_unpack = unpack;
+ inst->qpu.alu.mul.b.unpack = unpack;
}
}
@@ -737,6 +745,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c,
/* Set us up for shared input/output segments. This is apparently
* necessary for our VCM setup to avoid varying corruption.
+ *
+ * FIXME: initial testing on V3D 7.1 seems to work fine when using
+ * separate segments, so we could reevaluate this in the future if
+ * there is any advantage to using separate segments.
*/
prog_data->separate_segments = false;
prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
@@ -1572,7 +1584,7 @@ v3d_attempt_compile(struct v3d_compile *c)
NIR_PASS(_, c->s, v3d_nir_lower_io, c);
NIR_PASS(_, c->s, v3d_nir_lower_txf_ms);
- NIR_PASS(_, c->s, v3d_nir_lower_image_load_store);
+ NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c);
NIR_PASS(_, c->s, nir_opt_idiv_const, 8);
nir_lower_idiv_options idiv_options = {
diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c
index 5c47bbdc1b0..ab5d4043039 100644
--- a/src/broadcom/compiler/vir_dump.c
+++ b/src/broadcom/compiler/vir_dump.c
@@ -270,8 +270,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
vir_print_reg(c, inst, inst->dst);
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack));
- unpack[0] = instr->alu.add.a_unpack;
- unpack[1] = instr->alu.add.b_unpack;
+ unpack[0] = instr->alu.add.a.unpack;
+ unpack[1] = instr->alu.add.b.unpack;
} else {
fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op));
fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc));
@@ -282,8 +282,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
vir_print_reg(c, inst, inst->dst);
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack));
- unpack[0] = instr->alu.mul.a_unpack;
- unpack[1] = instr->alu.mul.b_unpack;
+ unpack[0] = instr->alu.mul.a.unpack;
+ unpack[1] = instr->alu.mul.b.unpack;
}
for (int i = 0; i < nsrc; i++) {
diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c
index 2fd6430a0f4..2907de9049f 100644
--- a/src/broadcom/compiler/vir_live_variables.c
+++ b/src/broadcom/compiler/vir_live_variables.c
@@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c)
flags_inst = NULL;
}
- /* Payload registers: r0/1/2 contain W, centroid W,
- * and Z at program start. Register allocation will
- * force their nodes to R0/1/2.
+ /* Payload registers: for fragment shaders, W,
+ * centroid W, and Z are initialized in r0/r1/r2
+ * up to v42, or in r1/r2/r3 from v71.
+ *
+ * For compute shaders, the payload is in r0/r2 up
+ * to v42, and in r3/r2 from v71.
+ *
+ * Register allocation will force their nodes to those
+ * registers.
*/
if (inst->src[0].file == QFILE_REG) {
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
+ uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0;
+ uint32_t max_payload_r = c->devinfo->ver >= 71 ? 3 : 2;
+ if (inst->src[0].index >= min_payload_r &&
+ inst->src[0].index <= max_payload_r) {
c->temp_start[inst->dst.index] = 0;
- break;
}
}
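With the range test fixed to use && (with ||, the unsigned comparison is always true), the payload check is a closed interval that shifts up by one register on 7.x. A minimal sketch of the same predicate as a hypothetical standalone helper:

#include <stdbool.h>
#include <stdint.h>

/* Fragment-shader payload lives in rf0..rf2 up to V3D 4.2 and in
 * rf1..rf3 from V3D 7.1.
 */
static bool
is_payload_reg(int ver, uint32_t index)
{
        uint32_t min_payload_r = ver >= 71 ? 1 : 0;
        uint32_t max_payload_r = ver >= 71 ? 3 : 2;

        return index >= min_payload_r && index <= max_payload_r;
}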
diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c
index da121c2a5bd..1260838ca05 100644
--- a/src/broadcom/compiler/vir_opt_copy_propagate.c
+++ b/src/broadcom/compiler/vir_opt_copy_propagate.c
@@ -35,7 +35,7 @@
#include "v3d_compiler.h"
static bool
-is_copy_mov(struct qinst *inst)
+is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst)
{
if (!inst)
return false;
@@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst)
return false;
}
- switch (inst->src[0].file) {
- case QFILE_MAGIC:
- /* No copy propagating from R3/R4/R5 -- the MOVs from those
- * are there to register allocate values produced into R3/4/5
- * to other regs (though hopefully r3/4/5).
- */
- switch (inst->src[0].index) {
- case V3D_QPU_WADDR_R3:
- case V3D_QPU_WADDR_R4:
- case V3D_QPU_WADDR_R5:
- return false;
+ if (devinfo->ver <= 42) {
+ switch (inst->src[0].file) {
+ case QFILE_MAGIC:
+ /* No copy propagating from R3/R4/R5 -- the MOVs from
+ * those are there to register allocate values produced
+ * into R3/4/5 to other regs (though hopefully r3/4/5).
+ */
+ switch (inst->src[0].index) {
+ case V3D_QPU_WADDR_R3:
+ case V3D_QPU_WADDR_R4:
+ case V3D_QPU_WADDR_R5:
+ return false;
+ default:
+ break;
+ }
+ break;
+
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ case 0:
+ case 1:
+ case 2:
+ /* MOVs from rf0/1/2 are only to track the live
+ * intervals for W/centroid W/Z.
+ */
+ return false;
+ }
+ break;
+
default:
break;
}
- break;
-
- case QFILE_REG:
- switch (inst->src[0].index) {
- case 0:
- case 1:
- case 2:
- /* MOVs from rf0/1/2 are only to track the live
+ } else {
+ assert(devinfo->ver >= 71);
+ switch (inst->src[0].file) {
+ case QFILE_REG:
+ switch (inst->src[0].index) {
+ /* MOVs from rf1/2/3 are only to track the live
* intervals for W/centroid W/Z.
+ *
+ * Note: rf0 can be implicitly written by ldvary
+ * (no temp involved), so it is not an SSA value and
+ * could clash with writes to other temps that are
+ * also allocated to rf0. In theory, that would mean
+ * that we can't copy propagate from it, but we handle
+ * this at register allocation time, preventing temps
+ * from being allocated to rf0 while the rf0 value from
+ * ldvary is still live.
*/
- return false;
- }
- break;
+ case 1:
+ case 2:
+ case 3:
+ return false;
+ }
+ break;
- default:
- break;
+ default:
+ break;
+ }
}
return true;
@@ -104,14 +133,14 @@ vir_has_unpack(struct qinst *inst, int chan)
if (vir_is_add(inst)) {
if (chan == 0)
- return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE;
else
- return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE;
} else {
if (chan == 0)
- return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE;
else
- return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE;
+ return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE;
}
}
@@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
*/
struct qinst *mov = movs[inst->src[i].index];
if (!mov) {
- if (!is_copy_mov(c->defs[inst->src[i].index]))
+ if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index]))
continue;
mov = c->defs[inst->src[i].index];
@@ -161,7 +190,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
continue;
/* these ops can't represent abs. */
- if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) {
+ if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) {
switch (inst->qpu.alu.add.op) {
case V3D_QPU_A_VFPACK:
case V3D_QPU_A_FROUND:
@@ -189,7 +218,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
inst->src[i] = mov->src[0];
if (vir_has_unpack(mov, 0)) {
- enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack;
+ enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack;
vir_set_unpack(inst, i, unpack);
}
@@ -245,7 +274,7 @@ vir_opt_copy_propagate(struct v3d_compile *c)
apply_kills(c, movs, inst);
- if (is_copy_mov(inst))
+ if (is_copy_mov(c->devinfo, inst))
movs[inst->dst.index] = inst;
}
}
diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c
index c7896d57f2b..6b61ed6a39a 100644
--- a/src/broadcom/compiler/vir_opt_redundant_flags.c
+++ b/src/broadcom/compiler/vir_opt_redundant_flags.c
@@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b)
a->qpu.flags.mpf != b->qpu.flags.mpf ||
a->qpu.alu.add.op != b->qpu.alu.add.op ||
a->qpu.alu.mul.op != b->qpu.alu.mul.op ||
- a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack ||
- a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack ||
+ a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack ||
+ a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack ||
a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack ||
- a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack ||
- a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack ||
+ a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack ||
+ a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack ||
a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) {
return false;
}
diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c
index 47d7722968d..ed5bc011964 100644
--- a/src/broadcom/compiler/vir_opt_small_immediates.c
+++ b/src/broadcom/compiler/vir_opt_small_immediates.c
@@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c)
/* The small immediate value sits in the raddr B field, so we
* can't have 2 small immediates in one instruction (unless
* they're the same value, but that should be optimized away
- * elsewhere).
+ * elsewhere). Since 7.x we can encode small immediates in
+ * any raddr field, but each instruction can still only use
+ * one.
*/
bool uses_small_imm = false;
for (int i = 0; i < vir_get_nsrc(inst); i++) {
@@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c)
*/
struct v3d_qpu_sig new_sig = inst->qpu.sig;
uint32_t sig_packed;
- new_sig.small_imm = true;
+ if (c->devinfo->ver <= 42) {
+ new_sig.small_imm_b = true;
+ } else {
+ if (vir_is_add(inst)) {
+ if (i == 0)
+ new_sig.small_imm_a = true;
+ else
+ new_sig.small_imm_b = true;
+ } else {
+ if (i == 0)
+ new_sig.small_imm_c = true;
+ else
+ new_sig.small_imm_d = true;
+ }
+ }
+
if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed))
continue;
@@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c)
vir_dump_inst(c, inst);
fprintf(stderr, "\n");
}
- inst->qpu.sig.small_imm = true;
+ inst->qpu.sig.small_imm_a = new_sig.small_imm_a;
+ inst->qpu.sig.small_imm_b = new_sig.small_imm_b;
+ inst->qpu.sig.small_imm_c = new_sig.small_imm_c;
+ inst->qpu.sig.small_imm_d = new_sig.small_imm_d;
inst->qpu.raddr_b = packed;
inst->src[i].file = QFILE_SMALL_IMM;
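On 7.x each raddr slot carries its own small-immediate signal bit, so the flag is a pure function of which ALU half and which operand the immediate feeds. A sketch mirroring the selection above, with plain bools standing in for the sig bitfields:

#include <stdbool.h>

/* a = add src 0, b = add src 1 (the only slot before 7.x),
 * c = mul src 0, d = mul src 1.
 */
struct small_imm_sig {
        bool small_imm_a, small_imm_b, small_imm_c, small_imm_d;
};

static struct small_imm_sig
small_imm_flag(int ver, bool is_add, int src)
{
        struct small_imm_sig sig = {0};

        if (ver <= 42)
                sig.small_imm_b = true; /* raddr_b is the only option */
        else if (is_add)
                *(src == 0 ? &sig.small_imm_a : &sig.small_imm_b) = true;
        else
                *(src == 0 ? &sig.small_imm_c : &sig.small_imm_d) = true;

        return sig;
}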
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index b22f915d1df..8eac2b75bd7 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -28,41 +28,73 @@
#define ACC_INDEX 0
#define ACC_COUNT 6
-#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
-#define PHYS_COUNT 64
+/* RA nodes used to track RF registers with implicit writes */
+#define IMPLICIT_RF_COUNT 1
+
+#define PHYS_COUNT 64
+
+static uint8_t
+get_phys_index(const struct v3d_device_info *devinfo)
+{
+ if (devinfo->has_accumulators)
+ return ACC_INDEX + ACC_COUNT;
+ else
+ return 0;
+}
+
+/* ACC as accumulator */
#define CLASS_BITS_PHYS (1 << 0)
#define CLASS_BITS_ACC (1 << 1)
#define CLASS_BITS_R5 (1 << 4)
-#define CLASS_BITS_ANY (CLASS_BITS_PHYS | \
- CLASS_BITS_ACC | \
- CLASS_BITS_R5)
+
+static uint8_t
+get_class_bit_any(const struct v3d_device_info *devinfo)
+{
+ if (devinfo->has_accumulators)
+ return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5);
+ else
+ return CLASS_BITS_PHYS;
+}
+
+static uint8_t
+filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
+{
+ if (!devinfo->has_accumulators) {
+ assert(class_bits & CLASS_BITS_PHYS);
+ class_bits = CLASS_BITS_PHYS;
+ }
+ return class_bits;
+}
static inline uint32_t
-temp_to_node(uint32_t temp)
+temp_to_node(struct v3d_compile *c, uint32_t temp)
{
- return temp + ACC_COUNT;
+ return temp + (c->devinfo->has_accumulators ? ACC_COUNT :
+ IMPLICIT_RF_COUNT);
}
static inline uint32_t
-node_to_temp(uint32_t node)
+node_to_temp(struct v3d_compile *c, uint32_t node)
{
- assert(node >= ACC_COUNT);
- return node - ACC_COUNT;
+ assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
+ (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT));
+ return node - (c->devinfo->has_accumulators ? ACC_COUNT :
+ IMPLICIT_RF_COUNT);
}
static inline uint8_t
-get_temp_class_bits(struct v3d_ra_node_info *nodes,
+get_temp_class_bits(struct v3d_compile *c,
uint32_t temp)
{
- return nodes->info[temp_to_node(temp)].class_bits;
+ return c->nodes.info[temp_to_node(c, temp)].class_bits;
}
static inline void
-set_temp_class_bits(struct v3d_ra_node_info *nodes,
+set_temp_class_bits(struct v3d_compile *c,
uint32_t temp, uint8_t class_bits)
{
- nodes->info[temp_to_node(temp)].class_bits = class_bits;
+ c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits;
}
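The fixed nodes (accumulators up to 4.2, the single implicit-rf0 node on 7.x) sit at the front of the interference graph, so the temp/node mapping is just an offset. A standalone sketch of that round trip, using the constants above (demo_ names are hypothetical):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define ACC_COUNT         6
#define IMPLICIT_RF_COUNT 1

static uint32_t
fixed_node_count(bool has_accumulators)
{
        return has_accumulators ? ACC_COUNT : IMPLICIT_RF_COUNT;
}

/* temp <-> node is a pure offset past the fixed nodes. */
static uint32_t
demo_temp_to_node(bool has_acc, uint32_t temp)
{
        return temp + fixed_node_count(has_acc);
}

static uint32_t
demo_node_to_temp(bool has_acc, uint32_t node)
{
        assert(node >= fixed_node_count(has_acc));
        return node - fixed_node_count(has_acc);
}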
static struct ra_class *
@@ -71,11 +103,13 @@ choose_reg_class(struct v3d_compile *c, uint8_t class_bits)
if (class_bits == CLASS_BITS_PHYS) {
return c->compiler->reg_class_phys[c->thread_index];
} else if (class_bits == (CLASS_BITS_R5)) {
+ assert(c->devinfo->has_accumulators);
return c->compiler->reg_class_r5[c->thread_index];
} else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) {
+ assert(c->devinfo->has_accumulators);
return c->compiler->reg_class_phys_or_acc[c->thread_index];
} else {
- assert(class_bits == CLASS_BITS_ANY);
+ assert(class_bits == get_class_bit_any(c->devinfo));
return c->compiler->reg_class_any[c->thread_index];
}
}
@@ -84,7 +118,7 @@ static inline struct ra_class *
choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp)
{
assert(temp < c->num_temps && temp < c->nodes.alloc_count);
- return choose_reg_class(c, get_temp_class_bits(&c->nodes, temp));
+ return choose_reg_class(c, get_temp_class_bits(c, temp));
}
static inline bool
@@ -313,7 +347,7 @@ v3d_choose_spill_node(struct v3d_compile *c)
for (unsigned i = 0; i < c->num_temps; i++) {
if (BITSET_TEST(c->spillable, i)) {
- ra_set_node_spill_cost(c->g, temp_to_node(i),
+ ra_set_node_spill_cost(c->g, temp_to_node(c, i),
spill_costs[i]);
}
}
@@ -331,7 +365,8 @@ ensure_nodes(struct v3d_compile *c)
c->nodes.info = reralloc_array_size(c,
c->nodes.info,
sizeof(c->nodes.info[0]),
- c->nodes.alloc_count + ACC_COUNT);
+ c->nodes.alloc_count +
+ MAX2(ACC_COUNT, IMPLICIT_RF_COUNT));
}
/* Creates the interference node for a new temp. We use this to keep the node
@@ -343,11 +378,15 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
ensure_nodes(c);
int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
- assert(node == temp + ACC_COUNT);
+ assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT :
+ node == temp + IMPLICIT_RF_COUNT);
/* We fill the node priority after we are done inserting spills */
c->nodes.info[node].class_bits = class_bits;
c->nodes.info[node].priority = 0;
+ c->nodes.info[node].is_ldunif_dst = false;
+ c->nodes.info[node].is_program_end = false;
+ c->nodes.info[node].unused = false;
}
/* The spill offset for this thread takes a bit of setup, so do it once at
@@ -395,8 +434,10 @@ v3d_setup_spill_base(struct v3d_compile *c)
*/
if (c->spilling) {
int temp_class = CLASS_BITS_PHYS;
- if (i != c->spill_base.index)
+ if (c->devinfo->has_accumulators &&
+ i != c->spill_base.index) {
temp_class |= CLASS_BITS_ACC;
+ }
add_node(c, i, temp_class);
}
}
@@ -436,7 +477,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
*/
assert(c->disable_ldunif_opt);
struct qreg offset = vir_uniform_ui(c, spill_offset);
- add_node(c, offset.index, CLASS_BITS_ANY);
+ add_node(c, offset.index, get_class_bit_any(c->devinfo));
/* We always enable per-quad on spills/fills to ensure we spill
* any channels involved with helper invocations.
@@ -455,14 +496,16 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
* temp will be used immediately so just like the uniform above we
* can allow accumulators.
*/
+ int temp_class =
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC);
if (!fill_dst) {
struct qreg dst = vir_TMUWT(c);
assert(dst.file == QFILE_TEMP);
- add_node(c, dst.index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
+ add_node(c, dst.index, temp_class);
} else {
*fill_dst = vir_LDTMU(c);
assert(fill_dst->file == QFILE_TEMP);
- add_node(c, fill_dst->index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
+ add_node(c, fill_dst->index, temp_class);
}
/* Temps across the thread switch we injected can't be assigned to
@@ -482,7 +525,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
c->temp_start[i] < ip && c->temp_end[i] >= ip :
c->temp_start[i] <= ip && c->temp_end[i] > ip;
if (thrsw_cross) {
- ra_set_node_class(c->g, temp_to_node(i),
+ ra_set_node_class(c->g, temp_to_node(c, i),
choose_reg_class(c, CLASS_BITS_PHYS));
}
}
@@ -509,8 +552,7 @@ v3d_emit_tmu_spill(struct v3d_compile *c,
* same register class bits as the original.
*/
if (inst == position) {
- uint8_t class_bits = get_temp_class_bits(&c->nodes,
- inst->dst.index);
+ uint8_t class_bits = get_temp_class_bits(c, inst->dst.index);
inst->dst = vir_get_temp(c);
add_node(c, inst->dst.index, class_bits);
} else {
@@ -542,7 +584,8 @@ interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end)
}
static void
-v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
+v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes,
+ int spill_temp)
{
c->spill_start_num_temps = c->num_temps;
c->spilling = true;
@@ -554,8 +597,20 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
spill_offset = c->spill_size;
c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
- if (spill_offset == 0)
+ if (spill_offset == 0) {
v3d_setup_spill_base(c);
+
+ /* Don't allocate our spill base to rf0 to avoid
+ * conflicts with instructions doing implicit writes
+ * to that register.
+ */
+ if (!c->devinfo->has_accumulators) {
+ ra_add_node_interference(
+ c->g,
+ temp_to_node(c, c->spill_base.index),
+ implicit_rf_nodes[0]);
+ }
+ }
}
struct qinst *last_thrsw = c->last_thrsw;
@@ -574,7 +629,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
reconstruct_op = orig_def->qpu.alu.add.op;
}
- uint32_t spill_node = temp_to_node(spill_temp);
+ uint32_t spill_node = temp_to_node(c, spill_temp);
/* We must disable the ldunif optimization if we are spilling uniforms */
bool had_disable_ldunif_opt = c->disable_ldunif_opt;
@@ -635,7 +690,8 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
* instruction immediately after, so
* we can use any register class for it.
*/
- add_node(c, unif.index, CLASS_BITS_ANY);
+ add_node(c, unif.index,
+ get_class_bit_any(c->devinfo));
} else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
struct qreg temp =
reconstruct_temp(c, reconstruct_op);
@@ -644,8 +700,10 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
* instruction immediately after so we
* can use ACC.
*/
- add_node(c, temp.index, CLASS_BITS_PHYS |
- CLASS_BITS_ACC);
+ int temp_class =
+ filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
+ CLASS_BITS_ACC);
+ add_node(c, temp.index, temp_class);
} else {
/* If we have a postponed spill, we
* don't need a fill as the temp would
@@ -739,12 +797,12 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
* update node priorities based one new liveness data.
*/
uint32_t sb_temp = c->spill_base.index;
- uint32_t sb_node = temp_to_node(sb_temp);
+ uint32_t sb_node = temp_to_node(c, sb_temp);
for (uint32_t i = 0; i < c->num_temps; i++) {
if (c->temp_end[i] == -1)
continue;
- uint32_t node_i = temp_to_node(i);
+ uint32_t node_i = temp_to_node(c, i);
c->nodes.info[node_i].priority =
c->temp_end[i] - c->temp_start[i];
@@ -752,7 +810,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
j < c->num_temps; j++) {
if (interferes(c->temp_start[i], c->temp_end[i],
c->temp_start[j], c->temp_end[j])) {
- uint32_t node_j = temp_to_node(j);
+ uint32_t node_j = temp_to_node(c, j);
ra_add_node_interference(c->g, node_i, node_j);
}
}
@@ -771,9 +829,11 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
}
struct v3d_ra_select_callback_data {
+ uint32_t phys_index;
uint32_t next_acc;
uint32_t next_phys;
struct v3d_ra_node_info *nodes;
+ const struct v3d_device_info *devinfo;
};
/* Choosing accumulators improves chances of merging QPU instructions
@@ -785,6 +845,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
BITSET_WORD *regs,
int priority)
{
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
/* Favor accumulators if we have less that this number of physical
* registers. Accumulators have more restrictions (like being
* invalidated through thrsw), so running out of physical registers
@@ -794,7 +857,7 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
static const int available_rf_threshold = 5;
int available_rf = 0;
for (int i = 0; i < PHYS_COUNT; i++) {
- if (BITSET_TEST(regs, PHYS_INDEX + i))
+ if (BITSET_TEST(regs, v3d_ra->phys_index + i))
available_rf++;
if (available_rf >= available_rf_threshold)
break;
@@ -820,6 +883,9 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
BITSET_WORD *regs,
unsigned int *out)
{
+ if (!v3d_ra->devinfo->has_accumulators)
+ return false;
+
/* Choose r5 for our ldunifs if possible (nobody else can load to that
* reg, and it keeps the QPU cond field free from being occupied by
* ldunifrf).
@@ -849,12 +915,47 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
static bool
v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
+ unsigned int node,
BITSET_WORD *regs,
unsigned int *out)
{
+ /* If this node is for an unused temp, ignore. */
+ if (v3d_ra->nodes->info[node].unused) {
+ *out = 0;
+ return true;
+ }
+
+ /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
+ * so we can avoid turning them into ldunifrf (which uses the
+ * cond field to encode the dst and would prevent merging with
+ * instructions that use cond flags).
+ */
+ if (v3d_ra->nodes->info[node].is_ldunif_dst &&
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
+ assert(v3d_ra->devinfo->ver >= 71);
+ *out = v3d_ra->phys_index;
+ return true;
+ }
+
+ /* The last 3 instructions in a shader can't use some specific
+ * registers (usually early rf registers, depending on the v3d
+ * version), so avoid allocating those registers to temps used
+ * by the last instructions in the shader.
+ */
+ const uint32_t safe_rf_start = v3d_ra->devinfo->ver <= 42 ? 3 : 4;
+ if (v3d_ra->nodes->info[node].is_program_end &&
+ v3d_ra->next_phys < safe_rf_start) {
+ v3d_ra->next_phys = safe_rf_start;
+ }
+
for (int i = 0; i < PHYS_COUNT; i++) {
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
- int phys = PHYS_INDEX + phys_off;
+
+ /* Try to keep rf0 available for ldunif in 7.x (see above). */
+ if (v3d_ra->devinfo->ver >= 71 && phys_off == 0)
+ continue;
+
+ int phys = v3d_ra->phys_index + phys_off;
if (BITSET_TEST(regs, phys)) {
v3d_ra->next_phys = phys_off + 1;
@@ -863,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
}
}
+ /* If we couldn't allocate, do try to assign rf0 if it is available. */
+ if (v3d_ra->devinfo->ver >= 71 &&
+ BITSET_TEST(regs, v3d_ra->phys_index)) {
+ v3d_ra->next_phys = 1;
+ *out = v3d_ra->phys_index;
+ return true;
+ }
+
return false;
}
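Summing up the selection policy above: on 7.x the scan starts past rf0 (kept free for ldunif destinations and the implicit ldvary write), skips rf0 during the round robin, and only falls back to it when nothing else is free. A condensed sketch that ignores the unused-temp and program-end cases, with a hypothetical regs_free array standing in for the RA bitset:

#include <stdbool.h>
#include <stdint.h>

#define PHYS_COUNT 64

/* regs_free[i] says whether physical register i is available. */
static bool
select_rf_v71(const bool *regs_free, uint32_t *next_phys, uint32_t *out)
{
        for (uint32_t i = 0; i < PHYS_COUNT; i++) {
                uint32_t phys = (*next_phys + i) % PHYS_COUNT;

                if (phys == 0)
                        continue; /* keep rf0 free for ldunif/ldvary */

                if (regs_free[phys]) {
                        *next_phys = phys + 1;
                        *out = phys;
                        return true;
                }
        }

        /* Last resort: hand out rf0 itself. */
        if (regs_free[0]) {
                *next_phys = 1;
                *out = 0;
                return true;
        }

        return false;
}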
@@ -877,7 +986,7 @@ v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
return reg;
}
- if (v3d_ra_select_rf(v3d_ra, regs, &reg))
+ if (v3d_ra_select_rf(v3d_ra, n, regs, &reg))
return reg;
/* If we ran out of physical registers try to assign an accumulator
@@ -896,8 +1005,9 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
* register file can be divided up for fragment shader threading.
*/
int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);
+ uint8_t phys_index = get_phys_index(compiler->devinfo);
- compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
+ compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT,
false);
if (!compiler->regs)
return false;
@@ -905,31 +1015,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
for (int threads = 0; threads < max_thread_index; threads++) {
compiler->reg_class_any[threads] =
ra_alloc_contig_reg_class(compiler->regs, 1);
- compiler->reg_class_r5[threads] =
- ra_alloc_contig_reg_class(compiler->regs, 1);
- compiler->reg_class_phys_or_acc[threads] =
- ra_alloc_contig_reg_class(compiler->regs, 1);
+ if (compiler->devinfo->has_accumulators) {
+ compiler->reg_class_r5[threads] =
+ ra_alloc_contig_reg_class(compiler->regs, 1);
+ compiler->reg_class_phys_or_acc[threads] =
+ ra_alloc_contig_reg_class(compiler->regs, 1);
+ }
compiler->reg_class_phys[threads] =
ra_alloc_contig_reg_class(compiler->regs, 1);
- for (int i = PHYS_INDEX;
- i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+ /* Init physical regs */
+ for (int i = phys_index;
+ i < phys_index + (PHYS_COUNT >> threads); i++) {
+ if (compiler->devinfo->has_accumulators)
+ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
ra_class_add_reg(compiler->reg_class_phys[threads], i);
ra_class_add_reg(compiler->reg_class_any[threads], i);
}
- for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
- ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
- ra_class_add_reg(compiler->reg_class_any[threads], i);
+ /* Init accumulator regs */
+ if (compiler->devinfo->has_accumulators) {
+ for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
+ ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+ ra_class_add_reg(compiler->reg_class_any[threads], i);
+ }
+ /* r5 can only store a single 32-bit value, so not much can
+ * use it.
+ */
+ ra_class_add_reg(compiler->reg_class_r5[threads],
+ ACC_INDEX + 5);
+ ra_class_add_reg(compiler->reg_class_any[threads],
+ ACC_INDEX + 5);
}
- /* r5 can only store a single 32-bit value, so not much can
- * use it.
- */
- ra_class_add_reg(compiler->reg_class_r5[threads],
- ACC_INDEX + 5);
- ra_class_add_reg(compiler->reg_class_any[threads],
- ACC_INDEX + 5);
}
ra_set_finalize(compiler->regs, NULL);
@@ -944,7 +1061,10 @@ tmu_spilling_allowed(struct v3d_compile *c)
}
static void
-update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
+ int *acc_nodes,
+ int *implicit_rf_nodes,
+ int last_ldvary_ip,
struct qinst *inst)
{
int32_t ip = inst->ip;
@@ -954,26 +1074,39 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* result to a temp), nothing else can be stored in r3/r4 across
* it.
*/
- if (vir_writes_r3(c->devinfo, inst)) {
+ if (vir_writes_r3_implicitly(c->devinfo, inst)) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
ra_add_node_interference(c->g,
- temp_to_node(i),
+ temp_to_node(c, i),
acc_nodes[3]);
}
}
}
- if (vir_writes_r4(c->devinfo, inst)) {
+ if (vir_writes_r4_implicitly(c->devinfo, inst)) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
ra_add_node_interference(c->g,
- temp_to_node(i),
+ temp_to_node(c, i),
acc_nodes[4]);
}
}
}
+ /* If any instruction writes to a physical register implicitly
+ * nothing else can write the same register across it.
+ */
+ if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ implicit_rf_nodes[0]);
+ }
+ }
+ }
+
if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
switch (inst->qpu.alu.add.op) {
case V3D_QPU_A_LDVPMV_IN:
@@ -987,7 +1120,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* decides whether the LDVPM is in or out)
*/
assert(inst->dst.file == QFILE_TEMP);
- set_temp_class_bits(&c->nodes, inst->dst.index,
+ set_temp_class_bits(c, inst->dst.index,
CLASS_BITS_PHYS);
break;
}
@@ -1002,7 +1135,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* phys regfile.
*/
assert(inst->dst.file == QFILE_TEMP);
- set_temp_class_bits(&c->nodes, inst->dst.index,
+ set_temp_class_bits(c, inst->dst.index,
CLASS_BITS_PHYS);
break;
}
@@ -1015,6 +1148,11 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
if (inst->src[0].file == QFILE_REG) {
switch (inst->src[0].index) {
case 0:
+ /* V3D 7.x doesn't use rf0 for thread payload */
+ if (c->devinfo->ver >= 71)
+ break;
+ else
+ FALLTHROUGH;
case 1:
case 2:
case 3: {
@@ -1024,14 +1162,34 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
*/
assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
assert(inst->dst.file == QFILE_TEMP);
- uint32_t node = temp_to_node(inst->dst.index);
+ uint32_t node = temp_to_node(c, inst->dst.index);
ra_set_node_reg(c->g, node,
- PHYS_INDEX + inst->src[0].index);
+ get_phys_index(c->devinfo) +
+ inst->src[0].index);
break;
}
}
}
+ /* Don't allocate rf0 to temps that cross ranges where we have
+ * live implicit rf0 writes from ldvary. We can identify these
+ * by tracking the last ldvary instruction and explicit reads
+ * of rf0.
+ */
+ if (c->devinfo->ver >= 71 &&
+ ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) ||
+ (vir_get_nsrc(inst) > 1 &&
+ inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip &&
+ c->temp_end[i] > last_ldvary_ip) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, i),
+ implicit_rf_nodes[0]);
+ }
+ }
+ }
+
if (inst->dst.file == QFILE_TEMP) {
/* Only a ldunif gets to write to R5, which only has a
* single 32-bit channel of storage.
@@ -1041,36 +1199,95 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
* because ldunif has usually a shorter lifespan, allowing for
* more accumulator reuse and QPU merges.
*/
- if (!inst->qpu.sig.ldunif) {
- uint8_t class_bits =
- get_temp_class_bits(&c->nodes, inst->dst.index) &
- ~CLASS_BITS_R5;
- set_temp_class_bits(&c->nodes, inst->dst.index,
- class_bits);
-
+ if (c->devinfo->has_accumulators) {
+ if (!inst->qpu.sig.ldunif) {
+ uint8_t class_bits =
+ get_temp_class_bits(c, inst->dst.index) &
+ ~CLASS_BITS_R5;
+ set_temp_class_bits(c, inst->dst.index,
+ class_bits);
+
+ } else {
+ /* Until V3D 4.x, we could only load a uniform
+ * to r5, so we'll need to spill if uniform
+ * loads interfere with each other.
+ */
+ if (c->devinfo->ver < 40) {
+ set_temp_class_bits(c, inst->dst.index,
+ CLASS_BITS_R5);
+ }
+ }
} else {
- /* Until V3D 4.x, we could only load a uniform
- * to r5, so we'll need to spill if uniform
- * loads interfere with each other.
+ /* Make sure we don't allocate the ldvary's
+ * destination to rf0, since it would clash
+ * with its implicit write to that register.
+ */
+ if (inst->qpu.sig.ldvary) {
+ ra_add_node_interference(c->g,
+ temp_to_node(c, inst->dst.index),
+ implicit_rf_nodes[0]);
+ }
+ /* Flag dst temps from ldunif(a) instructions
+ * so we can try to assign rf0 to them and avoid
+ * converting these to ldunif(a)rf.
*/
- if (c->devinfo->ver < 40) {
- set_temp_class_bits(&c->nodes, inst->dst.index,
- CLASS_BITS_R5);
+ if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) {
+ const uint32_t dst_n =
+ temp_to_node(c, inst->dst.index);
+ c->nodes.info[dst_n].is_ldunif_dst = true;
}
}
}
/* All accumulators are invalidated across a thread switch. */
- if (inst->qpu.sig.thrsw) {
+ if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
- set_temp_class_bits(&c->nodes, i,
+ set_temp_class_bits(c, i,
CLASS_BITS_PHYS);
}
}
}
}
+static void
+flag_program_end_nodes(struct v3d_compile *c)
+{
+ /* Only look for registers used in this many instructions */
+ uint32_t last_set_count = 6;
+
+ struct qblock *last_block = vir_exit_block(c);
+ list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) {
+ if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+ continue;
+
+ int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
+ for (int i = 0; i < num_src; i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->src[i].index);
+ c->nodes.info[node].is_program_end = true;
+ }
+ }
+
+ num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
+ for (int i = 0; i < num_src; i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->src[i].index);
+ c->nodes.info[node].is_program_end = true;
+ }
+ }
+
+ if (inst->dst.file == QFILE_TEMP) {
+ int node = temp_to_node(c, inst->dst.index);
+ c->nodes.info[node].is_program_end = true;
+ }
+
+ if (--last_set_count == 0)
+ break;
+ }
+}
+
/**
* Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
*
@@ -1080,19 +1297,32 @@ struct qpu_reg *
v3d_register_allocate(struct v3d_compile *c)
{
int acc_nodes[ACC_COUNT];
+ int implicit_rf_nodes[IMPLICIT_RF_COUNT];
+
+ unsigned num_ra_nodes = c->num_temps;
+ if (c->devinfo->has_accumulators)
+ num_ra_nodes += ARRAY_SIZE(acc_nodes);
+ else
+ num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes);
+
c->nodes = (struct v3d_ra_node_info) {
.alloc_count = c->num_temps,
.info = ralloc_array_size(c, sizeof(c->nodes.info[0]),
- c->num_temps + ACC_COUNT),
+ num_ra_nodes),
};
+ uint32_t phys_index = get_phys_index(c->devinfo);
+
struct v3d_ra_select_callback_data callback_data = {
+ .phys_index = phys_index,
.next_acc = 0,
/* Start at RF3, to try to keep the TLB writes from using
- * RF0-2.
+ * RF0-2. Start at RF4 in 7.x to prevent TLB writes from
+ * using RF2-3.
*/
- .next_phys = 3,
+ .next_phys = c->devinfo->ver <= 42 ? 3 : 4,
.nodes = &c->nodes,
+ .devinfo = c->devinfo,
};
vir_calculate_live_intervals(c);
@@ -1108,27 +1338,35 @@ v3d_register_allocate(struct v3d_compile *c)
c->thread_index--;
}
- c->g = ra_alloc_interference_graph(c->compiler->regs,
- c->num_temps + ARRAY_SIZE(acc_nodes));
+ c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
/* Make some fixed nodes for the accumulators, which we will need to
* interfere with when ops have implied r3/r4 writes or for the thread
* switches. We could represent these as classes for the nodes to
* live in, but the classes take up a lot of memory to set up, so we
- * don't want to make too many.
+ * don't want to make too many. We use the same mechanism on platforms
+ * without accumulators that can have implicit writes to phys regs.
*/
- for (uint32_t i = 0; i < ACC_COUNT + c->num_temps; i++) {
- if (i < ACC_COUNT) {
+ for (uint32_t i = 0; i < num_ra_nodes; i++) {
+ c->nodes.info[i].is_ldunif_dst = false;
+ c->nodes.info[i].is_program_end = false;
+ c->nodes.info[i].unused = false;
+ c->nodes.info[i].priority = 0;
+ c->nodes.info[i].class_bits = 0;
+ if (c->devinfo->has_accumulators && i < ACC_COUNT) {
acc_nodes[i] = i;
ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
- c->nodes.info[i].priority = 0;
- c->nodes.info[i].class_bits = 0;
+ } else if (!c->devinfo->has_accumulators &&
+ i < ARRAY_SIZE(implicit_rf_nodes)) {
+ implicit_rf_nodes[i] = i;
+ ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
} else {
- uint32_t t = node_to_temp(i);
+ uint32_t t = node_to_temp(c, i);
c->nodes.info[i].priority =
c->temp_end[t] - c->temp_start[t];
- c->nodes.info[i].class_bits = CLASS_BITS_ANY;
+ c->nodes.info[i].class_bits =
+ get_class_bit_any(c->devinfo);
}
}
@@ -1136,25 +1374,61 @@ v3d_register_allocate(struct v3d_compile *c)
* interferences.
*/
int ip = 0;
+ int last_ldvary_ip = -1;
vir_for_each_inst_inorder(inst, c) {
inst->ip = ip++;
- update_graph_and_reg_classes_for_inst(c, acc_nodes, inst);
+
+ /* ldunif(a) always writes to a temporary, so we have
+ * liveness info available to decide if rf0 is
+ * available for them, however, ldvary is different:
+ * it always writes to rf0 directly so we don't have
+ * liveness information for its implicit rf0 write.
+ *
+ * That means the allocator may assign rf0 to a temp
+ * that is defined while an implicit rf0 write from
+ * ldvary is still live. We fix that by manually
+ * tracking rf0 live ranges from ldvary instructions.
+ */
+ if (inst->qpu.sig.ldvary)
+ last_ldvary_ip = ip;
+
+ update_graph_and_reg_classes_for_inst(c, acc_nodes,
+ implicit_rf_nodes,
+ last_ldvary_ip, inst);
}
+ /* Flag the nodes that are used in the last instructions of the program
+ * (there are some registers that cannot be used in the last 3
+ * instructions). We only do this for fragment shaders, because the idea
+ * is that by avoiding this conflict we may be able to emit the last
+ * thread switch earlier in some cases. In non-fragment shaders this
+ * won't happen, because their last instructions are always VPM stores
+ * with a small immediate, which conflicts with other signals and
+ * prevents us from ever moving the thrsw earlier.
+ */
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT)
+ flag_program_end_nodes(c);
+
/* Set the register classes for all our temporaries in the graph */
for (uint32_t i = 0; i < c->num_temps; i++) {
- ra_set_node_class(c->g, temp_to_node(i),
+ ra_set_node_class(c->g, temp_to_node(c, i),
choose_reg_class_for_temp(c, i));
}
/* Add register interferences based on liveness data */
for (uint32_t i = 0; i < c->num_temps; i++) {
+ /* And while we are here, let's also flag nodes for
+ * unused temps.
+ */
+ if (c->temp_start[i] > c->temp_end[i])
+ c->nodes.info[temp_to_node(c, i)].unused = true;
+
for (uint32_t j = i + 1; j < c->num_temps; j++) {
if (interferes(c->temp_start[i], c->temp_end[i],
c->temp_start[j], c->temp_end[j])) {
ra_add_node_interference(c->g,
- temp_to_node(i),
- temp_to_node(j));
+ temp_to_node(c, i),
+ temp_to_node(c, j));
}
}
}
@@ -1171,9 +1445,9 @@ v3d_register_allocate(struct v3d_compile *c)
if (c->spill_size <
V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
int node = v3d_choose_spill_node(c);
- uint32_t temp = node_to_temp(node);
+ uint32_t temp = node_to_temp(c, node);
if (node != -1) {
- v3d_spill_reg(c, acc_nodes, temp);
+ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
continue;
}
}
@@ -1186,11 +1460,11 @@ v3d_register_allocate(struct v3d_compile *c)
if (node == -1)
goto spill_fail;
- uint32_t temp = node_to_temp(node);
+ uint32_t temp = node_to_temp(c, node);
enum temp_spill_type spill_type =
get_spill_type_for_temp(c, temp);
if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
- v3d_spill_reg(c, acc_nodes, temp);
+ v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
if (c->spills + c->fills > c->max_tmu_spills)
goto spill_fail;
} else {
@@ -1201,14 +1475,14 @@ v3d_register_allocate(struct v3d_compile *c)
/* Allocation was successful, build the 'temp -> reg' map */
temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
for (uint32_t i = 0; i < c->num_temps; i++) {
- int ra_reg = ra_get_node_reg(c->g, temp_to_node(i));
- if (ra_reg < PHYS_INDEX) {
+ int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
+ if (ra_reg < phys_index) {
temp_registers[i].magic = true;
temp_registers[i].index = (V3D_QPU_WADDR_R0 +
ra_reg - ACC_INDEX);
} else {
temp_registers[i].magic = false;
- temp_registers[i].index = ra_reg - PHYS_INDEX;
+ temp_registers[i].index = ra_reg - phys_index;
}
}
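After allocation succeeds, a ra register decodes back to a QPU register by position alone: values below phys_index are accumulators (magic waddrs), values at or above it are rf(n - phys_index); on 7.x phys_index is 0, so everything decodes to the rf file. A sketch of the decode with a hypothetical stand-in for struct qpu_reg:

#include <stdbool.h>
#include <stdint.h>

struct qpu_reg_demo {
        bool magic;     /* true: accumulator r0..r5; false: rf file */
        uint32_t index;
};

static struct qpu_reg_demo
decode_ra_reg(uint32_t ra_reg, uint32_t phys_index)
{
        struct qpu_reg_demo reg;

        if (ra_reg < phys_index) {
                /* Only reachable when the device has accumulators. */
                reg.magic = true;
                reg.index = ra_reg; /* + V3D_QPU_WADDR_R0 in the driver */
        } else {
                reg.magic = false;
                reg.index = ra_reg - phys_index;
        }
        return reg;
}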
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
index 45e6bfa1470..4ed184cbbcb 100644
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -86,15 +86,32 @@ new_qpu_nop_before(struct qinst *inst)
return q;
}
+static void
+v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src)
+{
+ /* If we have a small immediate, move it from instr->raddr_b to the
+ * corresponding raddr.
+ */
+ if (src.smimm) {
+ assert(instr->sig.small_imm_a || instr->sig.small_imm_b ||
+ instr->sig.small_imm_c || instr->sig.small_imm_d);
+ *raddr = instr->raddr_b;
+ return;
+ }
+
+ assert(!src.magic);
+ *raddr = src.index;
+}
+
/**
* Allocates the src register (accumulator or register file) into the RADDR
* fields of the instruction.
*/
static void
-set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
+v3d33_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
{
if (src.smimm) {
- assert(instr->sig.small_imm);
+ assert(instr->sig.small_imm_b);
*mux = V3D_QPU_MUX_B;
return;
}
@@ -106,20 +123,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
return;
}
- if (instr->alu.add.a != V3D_QPU_MUX_A &&
- instr->alu.add.b != V3D_QPU_MUX_A &&
- instr->alu.mul.a != V3D_QPU_MUX_A &&
- instr->alu.mul.b != V3D_QPU_MUX_A) {
+ if (instr->alu.add.a.mux != V3D_QPU_MUX_A &&
+ instr->alu.add.b.mux != V3D_QPU_MUX_A &&
+ instr->alu.mul.a.mux != V3D_QPU_MUX_A &&
+ instr->alu.mul.b.mux != V3D_QPU_MUX_A) {
instr->raddr_a = src.index;
*mux = V3D_QPU_MUX_A;
} else {
if (instr->raddr_a == src.index) {
*mux = V3D_QPU_MUX_A;
} else {
- assert(!(instr->alu.add.a == V3D_QPU_MUX_B &&
- instr->alu.add.b == V3D_QPU_MUX_B &&
- instr->alu.mul.a == V3D_QPU_MUX_B &&
- instr->alu.mul.b == V3D_QPU_MUX_B) ||
+ assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B &&
+ instr->alu.add.b.mux == V3D_QPU_MUX_B &&
+ instr->alu.mul.a.mux == V3D_QPU_MUX_B &&
+ instr->alu.mul.b.mux == V3D_QPU_MUX_B) ||
src.index == instr->raddr_b);
instr->raddr_b = src.index;
@@ -128,33 +145,40 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
}
}
-static bool
-is_no_op_mov(struct qinst *qinst)
+/*
+ * The main purpose of the following wrapper is to make calling set_src
+ * cleaner. It receives both mux and raddr pointers; only the one
+ * relevant to the device version is filled in.
+ */
+static void
+set_src(struct v3d_qpu_instr *instr,
+ enum v3d_qpu_mux *mux,
+ uint8_t *raddr,
+ struct qpu_reg src,
+ const struct v3d_device_info *devinfo)
{
- static const struct v3d_qpu_sig no_sig = {0};
-
- /* Make sure it's just a lone MOV. */
- if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
- qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
- qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
- memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
- return false;
- }
+ if (devinfo->ver < 71)
+ return v3d33_set_src(instr, mux, src);
+ else
+ return v3d71_set_src(instr, raddr, src);
+}
- /* Check if it's a MOV from a register to itself. */
+static bool
+v3d33_mov_src_and_dst_equal(struct qinst *qinst)
+{
enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
if (qinst->qpu.alu.mul.magic_write) {
if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
return false;
- if (qinst->qpu.alu.mul.a !=
+ if (qinst->qpu.alu.mul.a.mux !=
V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) {
return false;
}
} else {
int raddr;
- switch (qinst->qpu.alu.mul.a) {
+ switch (qinst->qpu.alu.mul.a.mux) {
case V3D_QPU_MUX_A:
raddr = qinst->qpu.raddr_a;
break;
@@ -168,10 +192,61 @@ is_no_op_mov(struct qinst *qinst)
return false;
}
+ return true;
+}
+
+static bool
+v3d71_mov_src_and_dst_equal(struct qinst *qinst)
+{
+ if (qinst->qpu.alu.mul.magic_write)
+ return false;
+
+ enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
+ int raddr;
+
+ raddr = qinst->qpu.alu.mul.a.raddr;
+ if (raddr != waddr)
+ return false;
+
+ return true;
+}
+
+static bool
+mov_src_and_dst_equal(struct qinst *qinst,
+ const struct v3d_device_info *devinfo)
+{
+ if (devinfo->ver < 71)
+ return v3d33_mov_src_and_dst_equal(qinst);
+ else
+ return v3d71_mov_src_and_dst_equal(qinst);
+}
+
+
+static bool
+is_no_op_mov(struct qinst *qinst,
+ const struct v3d_device_info *devinfo)
+{
+ static const struct v3d_qpu_sig no_sig = {0};
+
+ /* Make sure it's just a lone MOV. We only check for M_MOV: although
+ * V3D 7.x also has A_MOV, we don't need to check for it, as we always
+ * emit using M_MOV. We could start using A_MOV later in the schedule
+ * to improve performance.
+ */
+ if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
+ qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
+ qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
+ memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
+ return false;
+ }
+
+ if (!mov_src_and_dst_equal(qinst, devinfo))
+ return false;
+
/* No packing or flags updates, or we need to execute the
* instruction.
*/
- if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
+ if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE ||
qinst->qpu.flags.mc != V3D_QPU_COND_NONE ||
qinst->qpu.flags.mpf != V3D_QPU_PF_NONE ||
@@ -277,8 +352,15 @@ v3d_generate_code_block(struct v3d_compile *c,
assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
- if (!dst.magic ||
- dst.index != V3D_QPU_WADDR_R5) {
+ bool use_rf;
+ if (c->devinfo->has_accumulators) {
+ use_rf = !dst.magic ||
+ dst.index != V3D_QPU_WADDR_R5;
+ } else {
+ use_rf = dst.magic || dst.index != 0;
+ }
+
+ if (use_rf) {
assert(c->devinfo->ver >= 40);
if (qinst->qpu.sig.ldunif) {
@@ -300,13 +382,18 @@ v3d_generate_code_block(struct v3d_compile *c,
qinst->qpu.sig_magic = dst.magic;
} else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) {
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
+
if (nsrc >= 1) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.add.a, src[0]);
+ &qinst->qpu.alu.add.a.mux,
+ &qinst->qpu.alu.add.a.raddr,
+ src[0], c->devinfo);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.add.b, src[1]);
+ &qinst->qpu.alu.add.b.mux,
+ &qinst->qpu.alu.add.b.raddr,
+ src[1], c->devinfo);
}
qinst->qpu.alu.add.waddr = dst.index;
@@ -314,17 +401,21 @@ v3d_generate_code_block(struct v3d_compile *c,
} else {
if (nsrc >= 1) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.mul.a, src[0]);
+ &qinst->qpu.alu.mul.a.mux,
+ &qinst->qpu.alu.mul.a.raddr,
+ src[0], c->devinfo);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
- &qinst->qpu.alu.mul.b, src[1]);
+ &qinst->qpu.alu.mul.b.mux,
+ &qinst->qpu.alu.mul.b.raddr,
+ src[1], c->devinfo);
}
qinst->qpu.alu.mul.waddr = dst.index;
qinst->qpu.alu.mul.magic_write = dst.magic;
- if (is_no_op_mov(qinst)) {
+ if (is_no_op_mov(qinst, c->devinfo)) {
vir_remove_instruction(c, qinst);
continue;
}
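The 7.x variant of the no-op-MOV test is simpler than the mux-based 3.3/4.x one: with plain raddrs, a mul MOV is removable when the write is not magic and it reads the register it writes. A one-line sketch of that core check (the packing, flags and condition checks from is_no_op_mov still apply on top of it):

#include <stdbool.h>
#include <stdint.h>

/* v71 core of the no-op-MOV test: a mul MOV copies rf[raddr] to
 * rf[waddr], so it is removable when the write is not magic and the
 * two indices match.
 */
static bool
v71_mov_src_dst_equal(bool magic_write, uint8_t waddr, uint8_t raddr)
{
        return !magic_write && raddr == waddr;
}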
diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build
index 2c10e46b188..73cb7aa0575 100644
--- a/src/broadcom/meson.build
+++ b/src/broadcom/meson.build
@@ -22,7 +22,7 @@ inc_broadcom = include_directories('.', 'cle')
subdir('cle')
-v3d_versions = ['33', '41', '42']
+v3d_versions = ['33', '41', '42', '71']
v3d_libs = []
if with_gallium_v3d or with_broadcom_vk
diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
index 28fb2357b97..c1590a760de 100644
--- a/src/broadcom/qpu/qpu_disasm.c
+++ b/src/broadcom/qpu/qpu_disasm.c
@@ -56,13 +56,14 @@ pad_to(struct disasm_state *disasm, int n)
static void
-v3d_qpu_disasm_raddr(struct disasm_state *disasm,
- const struct v3d_qpu_instr *instr, uint8_t mux)
+v3d33_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ enum v3d_qpu_mux mux)
{
if (mux == V3D_QPU_MUX_A) {
append(disasm, "rf%d", instr->raddr_a);
} else if (mux == V3D_QPU_MUX_B) {
- if (instr->sig.small_imm) {
+ if (instr->sig.small_imm_b) {
uint32_t val;
ASSERTED bool ok =
v3d_qpu_small_imm_unpack(disasm->devinfo,
@@ -82,6 +83,64 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm,
}
}
+enum v3d_qpu_input_class {
+ V3D_QPU_ADD_A,
+ V3D_QPU_ADD_B,
+ V3D_QPU_MUL_A,
+ V3D_QPU_MUL_B
+};
+
+static void
+v3d71_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ uint8_t raddr,
+ enum v3d_qpu_input_class input_class)
+{
+ bool is_small_imm = false;
+ switch (input_class) {
+ case V3D_QPU_ADD_A:
+ is_small_imm = instr->sig.small_imm_a;
+ break;
+ case V3D_QPU_ADD_B:
+ is_small_imm = instr->sig.small_imm_b;
+ break;
+ case V3D_QPU_MUL_A:
+ is_small_imm = instr->sig.small_imm_c;
+ break;
+ case V3D_QPU_MUL_B:
+ is_small_imm = instr->sig.small_imm_d;
+ break;
+ }
+
+ if (is_small_imm) {
+ uint32_t val;
+ ASSERTED bool ok =
+ v3d_qpu_small_imm_unpack(disasm->devinfo,
+ raddr,
+ &val);
+
+ if ((int)val >= -16 && (int)val <= 15)
+ append(disasm, "%d", val);
+ else
+ append(disasm, "0x%08x", val);
+ assert(ok);
+ } else {
+ append(disasm, "rf%d", raddr);
+ }
+}
+
+static void
+v3d_qpu_disasm_raddr(struct disasm_state *disasm,
+ const struct v3d_qpu_instr *instr,
+ const struct v3d_qpu_input *input,
+ enum v3d_qpu_input_class input_class)
+{
+ if (disasm->devinfo->ver < 71)
+ v3d33_qpu_disasm_raddr(disasm, instr, input->mux);
+ else
+ v3d71_qpu_disasm_raddr(disasm, instr, input->raddr, input_class);
+}
+
static void
v3d_qpu_disasm_waddr(struct disasm_state *disasm, uint32_t waddr, bool magic)
{
@@ -121,16 +180,16 @@ v3d_qpu_disasm_add(struct disasm_state *disasm,
if (num_src >= 1) {
if (has_dst)
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.a, V3D_QPU_ADD_A);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.add.a_unpack));
+ v3d_qpu_unpack_name(instr->alu.add.a.unpack));
}
if (num_src >= 2) {
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.b, V3D_QPU_ADD_B);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.add.b_unpack));
+ v3d_qpu_unpack_name(instr->alu.add.b.unpack));
}
}
@@ -164,16 +223,16 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm,
if (num_src >= 1) {
if (has_dst)
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.a, V3D_QPU_MUL_A);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.mul.a_unpack));
+ v3d_qpu_unpack_name(instr->alu.mul.a.unpack));
}
if (num_src >= 2) {
append(disasm, ", ");
- v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b);
+ v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.b, V3D_QPU_MUL_B);
append(disasm, "%s",
- v3d_qpu_unpack_name(instr->alu.mul.b_unpack));
+ v3d_qpu_unpack_name(instr->alu.mul.b.unpack));
}
}
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index 60dabf74e8e..44f20618a5a 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -35,6 +35,14 @@ v3d_qpu_magic_waddr_name(const struct v3d_device_info *devinfo,
if (devinfo->ver < 40 && waddr == V3D_QPU_WADDR_TMU)
return "tmu";
+ /* On V3D 7.x, QUAD and REP alias R5 and R5REP in the
+ * table below.
+ */
+ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_QUAD)
+ return "quad";
+
+ if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_REP)
+ return "rep";
+
static const char *waddr_magic[] = {
[V3D_QPU_WADDR_R0] = "r0",
[V3D_QPU_WADDR_R1] = "r1",
@@ -169,6 +177,12 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op)
[V3D_QPU_A_ITOF] = "itof",
[V3D_QPU_A_CLZ] = "clz",
[V3D_QPU_A_UTOF] = "utof",
+ [V3D_QPU_A_MOV] = "mov",
+ [V3D_QPU_A_FMOV] = "fmov",
+ [V3D_QPU_A_VPACK] = "vpack",
+ [V3D_QPU_A_V8PACK] = "v8pack",
+ [V3D_QPU_A_V10PACK] = "v10pack",
+ [V3D_QPU_A_V11FPACK] = "v11fpack",
};
if (op >= ARRAY_SIZE(op_names))
@@ -191,6 +205,12 @@ v3d_qpu_mul_op_name(enum v3d_qpu_mul_op op)
[V3D_QPU_M_MOV] = "mov",
[V3D_QPU_M_NOP] = "nop",
[V3D_QPU_M_FMUL] = "fmul",
+ [V3D_QPU_M_FTOUNORM16] = "ftounorm16",
+ [V3D_QPU_M_FTOSNORM16] = "ftosnorm16",
+ [V3D_QPU_M_VFTOUNORM8] = "vftounorm8",
+ [V3D_QPU_M_VFTOSNORM8] = "vftosnorm8",
+ [V3D_QPU_M_VFTOUNORM10LO] = "vftounorm10lo",
+ [V3D_QPU_M_VFTOUNORM10HI] = "vftounorm10hi",
};
if (op >= ARRAY_SIZE(op_names))
@@ -450,6 +470,13 @@ static const uint8_t add_op_args[] = {
[V3D_QPU_A_ITOF] = D | A,
[V3D_QPU_A_CLZ] = D | A,
[V3D_QPU_A_UTOF] = D | A,
+
+ [V3D_QPU_A_MOV] = D | A,
+ [V3D_QPU_A_FMOV] = D | A,
+ [V3D_QPU_A_VPACK] = D | A | B,
+ [V3D_QPU_A_V8PACK] = D | A | B,
+ [V3D_QPU_A_V10PACK] = D | A | B,
+ [V3D_QPU_A_V11FPACK] = D | A | B,
};
static const uint8_t mul_op_args[] = {
@@ -463,6 +490,12 @@ static const uint8_t mul_op_args[] = {
[V3D_QPU_M_NOP] = 0,
[V3D_QPU_M_MOV] = D | A,
[V3D_QPU_M_FMUL] = D | A | B,
+ [V3D_QPU_M_FTOUNORM16] = D | A,
+ [V3D_QPU_M_FTOSNORM16] = D | A,
+ [V3D_QPU_M_VFTOUNORM8] = D | A,
+ [V3D_QPU_M_VFTOSNORM8] = D | A,
+ [V3D_QPU_M_VFTOUNORM10LO] = D | A,
+ [V3D_QPU_M_VFTOUNORM10HI] = D | A,
};
bool
@@ -636,12 +669,14 @@ v3d_qpu_add_op_writes_vpm(enum v3d_qpu_add_op op)
}
bool
-v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
+v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst)
{
- if (inst->sig.ldtlb ||
- inst->sig.ldtlbu)
- return true;
+ return inst->sig.ldtlb || inst->sig.ldtlbu;
+}
+bool
+v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst)
+{
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
inst->alu.add.magic_write &&
@@ -659,6 +694,12 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
return false;
}
+bool
+v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
+{
+ return v3d_qpu_writes_tlb(inst) || v3d_qpu_reads_tlb(inst);
+}
+
bool
v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
{
@@ -846,6 +887,9 @@ bool
v3d_qpu_writes_r3(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R3))
return true;
@@ -856,6 +900,9 @@ bool
v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.op != V3D_QPU_A_NOP &&
inst->alu.add.magic_write &&
@@ -886,6 +933,9 @@ bool
v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R5))
return true;
@@ -896,6 +946,9 @@ bool
v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
+ if (!devinfo->has_accumulators)
+ return false;
+
if (v3d_qpu_writes_r5(devinfo, inst))
return true;
if (v3d_qpu_writes_r4(devinfo, inst))
@@ -912,16 +965,68 @@ v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
return false;
}
+bool
+v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst)
+{
+ if (devinfo->ver >= 71 &&
+ (inst->sig.ldvary || inst->sig.ldunif || inst->sig.ldunifa)) {
+ return true;
+ }
+
+ return false;
+}
+
bool
v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
{
int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
- return ((add_nsrc > 0 && inst->alu.add.a == mux) ||
- (add_nsrc > 1 && inst->alu.add.b == mux) ||
- (mul_nsrc > 0 && inst->alu.mul.a == mux) ||
- (mul_nsrc > 1 && inst->alu.mul.b == mux));
+ return ((add_nsrc > 0 && inst->alu.add.a.mux == mux) ||
+ (add_nsrc > 1 && inst->alu.add.b.mux == mux) ||
+ (mul_nsrc > 0 && inst->alu.mul.a.mux == mux) ||
+ (mul_nsrc > 1 && inst->alu.mul.b.mux == mux));
+}
+
+bool
+v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr)
+{
+ int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
+ int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
+
+ return (add_nsrc > 0 && !inst->sig.small_imm_a && inst->alu.add.a.raddr == raddr) ||
+ (add_nsrc > 1 && !inst->sig.small_imm_b && inst->alu.add.b.raddr == raddr) ||
+ (mul_nsrc > 0 && !inst->sig.small_imm_c && inst->alu.mul.a.raddr == raddr) ||
+ (mul_nsrc > 1 && !inst->sig.small_imm_d && inst->alu.mul.b.raddr == raddr);
+}
+
+bool
+v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
+ uint8_t waddr)
+{
+ if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+
+ if (v3d_qpu_add_op_has_dst(inst->alu.add.op) &&
+ !inst->alu.add.magic_write &&
+ inst->alu.add.waddr == waddr) {
+ return true;
+ }
+
+ if (v3d_qpu_mul_op_has_dst(inst->alu.mul.op) &&
+ !inst->alu.mul.magic_write &&
+ inst->alu.mul.waddr == waddr) {
+ return true;
+ }
+
+ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
+ !inst->sig_magic && inst->sig_addr == waddr) {
+ return true;
+ }
+
+ return false;
}
bool
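[Editor's note] The hunks above split v3d_qpu_uses_tlb() into separate
read/write queries and add raddr-based helpers for V3D 7.x, where ALU
sources are plain register-file addresses instead of mux selectors. A
minimal sketch of how a caller might dispatch between the two models; the
wrapper name is hypothetical, only the two helpers come from the patch:

    /* Hypothetical wrapper, not part of the patch. */
    static bool
    qpu_instr_reads_source(const struct v3d_device_info *devinfo,
                           const struct v3d_qpu_instr *inst,
                           enum v3d_qpu_mux mux, uint8_t raddr)
    {
            if (devinfo->ver < 71)
                    return v3d_qpu_uses_mux(inst, mux);        /* 4.x: mux selectors */
            else
                    return v3d71_qpu_reads_raddr(inst, raddr); /* 7.x: register file */
    }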
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index 2e133472698..56eee9f9cac 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -50,10 +50,13 @@ struct v3d_qpu_sig {
bool ldvpm:1;
bool ldtlb:1;
bool ldtlbu:1;
- bool small_imm:1;
bool ucb:1;
bool rotate:1;
bool wrtmuc:1;
+ bool small_imm_a:1; /* raddr_a (add a), since V3D 7.x */
+ bool small_imm_b:1; /* raddr_b (add b) */
+ bool small_imm_c:1; /* raddr_c (mul a), since V3D 7.x */
+ bool small_imm_d:1; /* raddr_d (mul b), since V3D 7.x */
};
enum v3d_qpu_cond {
@@ -88,12 +91,13 @@ enum v3d_qpu_uf {
};
enum v3d_qpu_waddr {
- V3D_QPU_WADDR_R0 = 0,
- V3D_QPU_WADDR_R1 = 1,
- V3D_QPU_WADDR_R2 = 2,
- V3D_QPU_WADDR_R3 = 3,
- V3D_QPU_WADDR_R4 = 4,
- V3D_QPU_WADDR_R5 = 5,
+ V3D_QPU_WADDR_R0 = 0, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R1 = 1, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R2 = 2, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R3 = 3, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R4 = 4, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_R5 = 5, /* V3D 4.x */
+ V3D_QPU_WADDR_QUAD = 5, /* V3D 7.x */
V3D_QPU_WADDR_NOP = 6,
V3D_QPU_WADDR_TLB = 7,
V3D_QPU_WADDR_TLBU = 8,
@@ -108,12 +112,12 @@ enum v3d_qpu_waddr {
V3D_QPU_WADDR_SYNC = 16,
V3D_QPU_WADDR_SYNCU = 17,
V3D_QPU_WADDR_SYNCB = 18,
- V3D_QPU_WADDR_RECIP = 19,
- V3D_QPU_WADDR_RSQRT = 20,
- V3D_QPU_WADDR_EXP = 21,
- V3D_QPU_WADDR_LOG = 22,
- V3D_QPU_WADDR_SIN = 23,
- V3D_QPU_WADDR_RSQRT2 = 24,
+ V3D_QPU_WADDR_RECIP = 19, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_RSQRT = 20, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_EXP = 21, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_LOG = 22, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_SIN = 23, /* Reserved on V3D 7.x */
+ V3D_QPU_WADDR_RSQRT2 = 24, /* Reserved on V3D 7.x */
V3D_QPU_WADDR_TMUC = 32,
V3D_QPU_WADDR_TMUS = 33,
V3D_QPU_WADDR_TMUT = 34,
@@ -129,7 +133,8 @@ enum v3d_qpu_waddr {
V3D_QPU_WADDR_TMUHSCM = 44,
V3D_QPU_WADDR_TMUHSF = 45,
V3D_QPU_WADDR_TMUHSLOD = 46,
- V3D_QPU_WADDR_R5REP = 55,
+ V3D_QPU_WADDR_R5REP = 55, /* V3D 4.x */
+ V3D_QPU_WADDR_REP = 55, /* V3D 7.x */
};
struct v3d_qpu_flags {
@@ -222,6 +227,14 @@ enum v3d_qpu_add_op {
V3D_QPU_A_ITOF,
V3D_QPU_A_CLZ,
V3D_QPU_A_UTOF,
+
+ /* V3D 7.x */
+ V3D_QPU_A_FMOV,
+ V3D_QPU_A_MOV,
+ V3D_QPU_A_VPACK,
+ V3D_QPU_A_V8PACK,
+ V3D_QPU_A_V10PACK,
+ V3D_QPU_A_V11FPACK,
};
enum v3d_qpu_mul_op {
@@ -235,6 +248,14 @@ enum v3d_qpu_mul_op {
V3D_QPU_M_MOV,
V3D_QPU_M_NOP,
V3D_QPU_M_FMUL,
+
+ /* V3D 7.x */
+ V3D_QPU_M_FTOUNORM16,
+ V3D_QPU_M_FTOSNORM16,
+ V3D_QPU_M_VFTOUNORM8,
+ V3D_QPU_M_VFTOSNORM8,
+ V3D_QPU_M_VFTOUNORM10LO,
+ V3D_QPU_M_VFTOUNORM10HI,
};
enum v3d_qpu_output_pack {
@@ -276,6 +297,15 @@ enum v3d_qpu_input_unpack {
/** Swap high and low 16 bits */
V3D_QPU_UNPACK_SWAP_16,
+
+ /** Convert low 16 bits from 16-bit integer to unsigned 32-bit int */
+ V3D_QPU_UNPACK_UL,
+ /** Convert high 16 bits from 16-bit integer to unsigned 32-bit int */
+ V3D_QPU_UNPACK_UH,
+ /** Convert low 16 bits from 16-bit integer to signed 32-bit int */
+ V3D_QPU_UNPACK_IL,
+ /** Convert high 16 bits from 16-bit integer to signed 32-bit int */
+ V3D_QPU_UNPACK_IH,
};
enum v3d_qpu_mux {
@@ -289,25 +319,29 @@ enum v3d_qpu_mux {
V3D_QPU_MUX_B,
};
+struct v3d_qpu_input {
+ union {
+ enum v3d_qpu_mux mux; /* V3D 4.x */
+ uint8_t raddr; /* V3D 7.x */
+ };
+ enum v3d_qpu_input_unpack unpack;
+};
+
struct v3d_qpu_alu_instr {
struct {
enum v3d_qpu_add_op op;
- enum v3d_qpu_mux a, b;
+ struct v3d_qpu_input a, b;
uint8_t waddr;
bool magic_write;
enum v3d_qpu_output_pack output_pack;
- enum v3d_qpu_input_unpack a_unpack;
- enum v3d_qpu_input_unpack b_unpack;
} add;
struct {
enum v3d_qpu_mul_op op;
- enum v3d_qpu_mux a, b;
+ struct v3d_qpu_input a, b;
uint8_t waddr;
bool magic_write;
enum v3d_qpu_output_pack output_pack;
- enum v3d_qpu_input_unpack a_unpack;
- enum v3d_qpu_input_unpack b_unpack;
} mul;
};
@@ -379,8 +413,8 @@ struct v3d_qpu_instr {
struct v3d_qpu_sig sig;
uint8_t sig_addr;
bool sig_magic; /* If the signal writes to a magic address */
- uint8_t raddr_a;
- uint8_t raddr_b;
+ uint8_t raddr_a; /* V3D 4.x */
+ uint8_t raddr_b; /* V3D 4.x (holds packed small immediate in 7.x too) */
struct v3d_qpu_flags flags;
union {
@@ -450,6 +484,8 @@ bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
+bool v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
@@ -464,6 +500,8 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
+bool v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
@@ -483,4 +521,9 @@ bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+
+bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr);
+bool v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *inst,
+ uint8_t waddr);
#endif
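[Editor's note] With struct v3d_qpu_input, the 4.x mux selector and the
7.x register-file address overlay each other in a union, so consumers
must check devinfo->ver before reading either member. A minimal sketch,
assuming qpu_instr.h and its v3d_device_info dependency are available;
the helper itself is illustrative only:

    #include <stdio.h>
    #include "qpu_instr.h"

    static void
    print_add_src_a(const struct v3d_device_info *devinfo,
                    const struct v3d_qpu_instr *inst)
    {
            const struct v3d_qpu_input *a = &inst->alu.add.a;
            if (devinfo->ver < 71)
                    printf("add.a: mux %d\n", a->mux);   /* V3D 4.x encoding */
            else
                    printf("add.a: rf%d\n", a->raddr);   /* V3D 7.x encoding */
    }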
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index 94629aff4fc..4e3c3da8866 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -84,6 +84,9 @@
#define V3D_QPU_MUL_A_SHIFT 18
#define V3D_QPU_MUL_A_MASK QPU_MASK(20, 18)
+#define V3D_QPU_RADDR_C_SHIFT 18
+#define V3D_QPU_RADDR_C_MASK QPU_MASK(23, 18)
+
#define V3D_QPU_ADD_B_SHIFT 15
#define V3D_QPU_ADD_B_MASK QPU_MASK(17, 15)
@@ -98,6 +101,9 @@
#define V3D_QPU_BRANCH_BDI_SHIFT 12
#define V3D_QPU_BRANCH_BDI_MASK QPU_MASK(13, 12)
+#define V3D_QPU_RADDR_D_SHIFT 12
+#define V3D_QPU_RADDR_D_MASK QPU_MASK(17, 12)
+
#define V3D_QPU_RADDR_A_SHIFT 6
#define V3D_QPU_RADDR_A_MASK QPU_MASK(11, 6)
@@ -112,12 +118,15 @@
#define LDTMU .ldtmu = true
#define LDVARY .ldvary = true
#define LDVPM .ldvpm = true
-#define SMIMM .small_imm = true
#define LDTLB .ldtlb = true
#define LDTLBU .ldtlbu = true
#define UCB .ucb = true
#define ROT .rotate = true
#define WRTMUC .wrtmuc = true
+#define SMIMM_A .small_imm_a = true
+#define SMIMM_B .small_imm_b = true
+#define SMIMM_C .small_imm_c = true
+#define SMIMM_D .small_imm_d = true
static const struct v3d_qpu_sig v33_sig_map[] = {
/* MISC R3 R4 R5 */
@@ -135,8 +144,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = {
[11] = { THRSW, LDVARY, LDUNIF },
[12] = { LDVARY, LDTMU, },
[13] = { THRSW, LDVARY, LDTMU, },
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY, },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
/* 18-21 reserved */
@@ -148,8 +157,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = {
[27] = { THRSW, LDVPM, LDUNIF },
[28] = { LDVPM, LDTMU, },
[29] = { THRSW, LDVPM, LDTMU, },
- [30] = { SMIMM, LDVPM, },
- [31] = { SMIMM, },
+ [30] = { SMIMM_B, LDVPM, },
+ [31] = { SMIMM_B, },
};
static const struct v3d_qpu_sig v40_sig_map[] = {
@@ -167,8 +176,8 @@ static const struct v3d_qpu_sig v40_sig_map[] = {
[10] = { LDVARY, LDUNIF },
[11] = { THRSW, LDVARY, LDUNIF },
/* 12-13 reserved */
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY, },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
[18] = { WRTMUC },
@@ -178,7 +187,7 @@ static const struct v3d_qpu_sig v40_sig_map[] = {
[22] = { UCB, },
[23] = { ROT, },
/* 24-30 reserved */
- [31] = { SMIMM, LDTMU, },
+ [31] = { SMIMM_B, LDTMU, },
};
static const struct v3d_qpu_sig v41_sig_map[] = {
@@ -197,8 +206,8 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
[11] = { THRSW, LDVARY, LDUNIF },
[12] = { LDUNIFRF },
[13] = { THRSW, LDUNIFRF },
- [14] = { SMIMM, LDVARY, },
- [15] = { SMIMM, },
+ [14] = { SMIMM_B, LDVARY },
+ [15] = { SMIMM_B, },
[16] = { LDTLB, },
[17] = { LDTLBU, },
[18] = { WRTMUC },
@@ -210,7 +219,41 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
[24] = { LDUNIFA},
[25] = { LDUNIFARF },
/* 26-30 reserved */
- [31] = { SMIMM, LDTMU, },
+ [31] = { SMIMM_B, LDTMU, },
+};
+
+static const struct v3d_qpu_sig v71_sig_map[] = {
+ /* MISC phys RF0 */
+ [0] = { },
+ [1] = { THRSW, },
+ [2] = { LDUNIF },
+ [3] = { THRSW, LDUNIF },
+ [4] = { LDTMU, },
+ [5] = { THRSW, LDTMU, },
+ [6] = { LDTMU, LDUNIF },
+ [7] = { THRSW, LDTMU, LDUNIF },
+ [8] = { LDVARY, },
+ [9] = { THRSW, LDVARY, },
+ [10] = { LDVARY, LDUNIF },
+ [11] = { THRSW, LDVARY, LDUNIF },
+ [12] = { LDUNIFRF },
+ [13] = { THRSW, LDUNIFRF },
+ [14] = { SMIMM_A, },
+ [15] = { SMIMM_B, },
+ [16] = { LDTLB, },
+ [17] = { LDTLBU, },
+ [18] = { WRTMUC },
+ [19] = { THRSW, WRTMUC },
+ [20] = { LDVARY, WRTMUC },
+ [21] = { THRSW, LDVARY, WRTMUC },
+ [22] = { UCB, },
+ /* 23 reserved */
+ [24] = { LDUNIFA},
+ [25] = { LDUNIFARF },
+ /* 26-29 reserved */
+ [30] = { SMIMM_C, },
+ [31] = { SMIMM_D, },
};
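[Editor's note] Worked example of the new map: on 7.x, packed signal 14
selects a small immediate in raddr_a and 30 one in raddr_c; on 4.x the
same slots meant SMIMM_B+LDVARY and "reserved". A sketch using the unpack
helper from this file, assuming a devinfo with ver = 71:

    #include <assert.h>

    static void
    check_v71_smimm_signals(void)
    {
            struct v3d_device_info devinfo = { .ver = 71 };
            struct v3d_qpu_sig sig;

            if (v3d_qpu_sig_unpack(&devinfo, 14, &sig))
                    assert(sig.small_imm_a);    /* [14] = { SMIMM_A, } */
            if (v3d_qpu_sig_unpack(&devinfo, 30, &sig))
                    assert(sig.small_imm_c);    /* [30] = { SMIMM_C, } */
    }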
bool
@@ -221,7 +264,9 @@ v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo,
if (packed_sig >= ARRAY_SIZE(v33_sig_map))
return false;
- if (devinfo->ver >= 41)
+ if (devinfo->ver >= 71)
+ *sig = v71_sig_map[packed_sig];
+ else if (devinfo->ver >= 41)
*sig = v41_sig_map[packed_sig];
else if (devinfo->ver == 40)
*sig = v40_sig_map[packed_sig];
@@ -240,7 +285,9 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo,
{
static const struct v3d_qpu_sig *map;
- if (devinfo->ver >= 41)
+ if (devinfo->ver >= 71)
+ map = v71_sig_map;
+ else if (devinfo->ver >= 41)
map = v41_sig_map;
else if (devinfo->ver == 40)
map = v40_sig_map;
@@ -443,16 +490,26 @@ v3d_qpu_flags_pack(const struct v3d_device_info *devinfo,
/* Make a mapping of the table of opcodes in the spec. The opcode is
* determined by a combination of the opcode field, and in the case of 0 or
- * 1-arg opcodes, the mux_b field as well.
+ * 1-arg opcodes, the mux (version <= 42) or raddr (version >= 71) field as
+ * well.
*/
-#define MUX_MASK(bot, top) (((1 << (top + 1)) - 1) - ((1 << (bot)) - 1))
-#define ANYMUX MUX_MASK(0, 7)
+#define OP_MASK(val) BITFIELD64_BIT(val)
+#define OP_RANGE(bot, top) BITFIELD64_RANGE(bot, top - bot + 1)
+#define ANYMUX OP_RANGE(0, 7)
+#define ANYOPMASK OP_RANGE(0, 63)
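[Editor's note] The 64-bit opcode masks generalise the old 8-bit mux
masks. Restated standalone (an editorial expansion, assuming the
BITFIELD64_* helpers behave as in Mesa's util/macros.h):

    #include <assert.h>
    #include <stdint.h>

    #define BITFIELD64_MASK(b)  ((b) == 64 ? ~UINT64_C(0) : (UINT64_C(1) << (b)) - 1)
    #define OP_MASK(val)        (UINT64_C(1) << (val))
    #define OP_RANGE(bot, top)  (BITFIELD64_MASK((top) + 1) & ~BITFIELD64_MASK(bot))

    static void
    check_op_masks(void)
    {
            assert(OP_MASK(3) == 0x8);               /* bit 3 */
            assert(OP_RANGE(4, 6) == 0x70);          /* bits 4..6 */
            assert(OP_RANGE(0, 7) == 0xff);          /* ANYMUX */
            assert(OP_RANGE(0, 63) == ~UINT64_C(0)); /* ANYOPMASK */
    }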
struct opcode_desc {
uint8_t opcode_first;
uint8_t opcode_last;
- uint8_t mux_b_mask;
- uint8_t mux_a_mask;
+
+ union {
+ struct {
+ uint8_t b_mask;
+ uint8_t a_mask;
+ } mux;
+ uint64_t raddr_mask;
+ };
+
uint8_t op;
/* first_ver == 0 if it's the same across all V3D versions.
@@ -465,122 +522,321 @@ struct opcode_desc {
uint8_t last_ver;
};
-static const struct opcode_desc add_ops[] = {
+static const struct opcode_desc add_ops_v33[] = {
/* FADD is FADDNF depending on the order of the mux_a/mux_b. */
- { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADD },
- { 0, 47, ANYMUX, ANYMUX, V3D_QPU_A_FADDNF },
- { 53, 55, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
- { 56, 56, ANYMUX, ANYMUX, V3D_QPU_A_ADD },
- { 57, 59, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
- { 60, 60, ANYMUX, ANYMUX, V3D_QPU_A_SUB },
- { 61, 63, ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
- { 64, 111, ANYMUX, ANYMUX, V3D_QPU_A_FSUB },
- { 120, 120, ANYMUX, ANYMUX, V3D_QPU_A_MIN },
- { 121, 121, ANYMUX, ANYMUX, V3D_QPU_A_MAX },
- { 122, 122, ANYMUX, ANYMUX, V3D_QPU_A_UMIN },
- { 123, 123, ANYMUX, ANYMUX, V3D_QPU_A_UMAX },
- { 124, 124, ANYMUX, ANYMUX, V3D_QPU_A_SHL },
- { 125, 125, ANYMUX, ANYMUX, V3D_QPU_A_SHR },
- { 126, 126, ANYMUX, ANYMUX, V3D_QPU_A_ASR },
- { 127, 127, ANYMUX, ANYMUX, V3D_QPU_A_ROR },
+ { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADD },
+ { 0, 47, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADDNF },
+ { 53, 55, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
+ { 56, 56, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ADD },
+ { 57, 59, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
+ { 60, 60, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SUB },
+ { 61, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
+ { 64, 111, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FSUB },
+ { 120, 120, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MIN },
+ { 121, 121, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MAX },
+ { 122, 122, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMIN },
+ { 123, 123, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMAX },
+ { 124, 124, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHL },
+ { 125, 125, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHR },
+ { 126, 126, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ASR },
+ { 127, 127, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ROR },
/* FMIN is instead FMAX depending on the order of the mux_a/mux_b. */
- { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMIN },
- { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMAX },
- { 176, 180, ANYMUX, ANYMUX, V3D_QPU_A_VFMIN },
-
- { 181, 181, ANYMUX, ANYMUX, V3D_QPU_A_AND },
- { 182, 182, ANYMUX, ANYMUX, V3D_QPU_A_OR },
- { 183, 183, ANYMUX, ANYMUX, V3D_QPU_A_XOR },
-
- { 184, 184, ANYMUX, ANYMUX, V3D_QPU_A_VADD },
- { 185, 185, ANYMUX, ANYMUX, V3D_QPU_A_VSUB },
- { 186, 186, 1 << 0, ANYMUX, V3D_QPU_A_NOT },
- { 186, 186, 1 << 1, ANYMUX, V3D_QPU_A_NEG },
- { 186, 186, 1 << 2, ANYMUX, V3D_QPU_A_FLAPUSH },
- { 186, 186, 1 << 3, ANYMUX, V3D_QPU_A_FLBPUSH },
- { 186, 186, 1 << 4, ANYMUX, V3D_QPU_A_FLPOP },
- { 186, 186, 1 << 5, ANYMUX, V3D_QPU_A_RECIP },
- { 186, 186, 1 << 6, ANYMUX, V3D_QPU_A_SETMSF },
- { 186, 186, 1 << 7, ANYMUX, V3D_QPU_A_SETREVF },
- { 187, 187, 1 << 0, 1 << 0, V3D_QPU_A_NOP, 0 },
- { 187, 187, 1 << 0, 1 << 1, V3D_QPU_A_TIDX },
- { 187, 187, 1 << 0, 1 << 2, V3D_QPU_A_EIDX },
- { 187, 187, 1 << 0, 1 << 3, V3D_QPU_A_LR },
- { 187, 187, 1 << 0, 1 << 4, V3D_QPU_A_VFLA },
- { 187, 187, 1 << 0, 1 << 5, V3D_QPU_A_VFLNA },
- { 187, 187, 1 << 0, 1 << 6, V3D_QPU_A_VFLB },
- { 187, 187, 1 << 0, 1 << 7, V3D_QPU_A_VFLNB },
-
- { 187, 187, 1 << 1, MUX_MASK(0, 2), V3D_QPU_A_FXCD },
- { 187, 187, 1 << 1, 1 << 3, V3D_QPU_A_XCD },
- { 187, 187, 1 << 1, MUX_MASK(4, 6), V3D_QPU_A_FYCD },
- { 187, 187, 1 << 1, 1 << 7, V3D_QPU_A_YCD },
-
- { 187, 187, 1 << 2, 1 << 0, V3D_QPU_A_MSF },
- { 187, 187, 1 << 2, 1 << 1, V3D_QPU_A_REVF },
- { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_VDWWT, 33 },
- { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_IID, 40 },
- { 187, 187, 1 << 2, 1 << 3, V3D_QPU_A_SAMPID, 40 },
- { 187, 187, 1 << 2, 1 << 4, V3D_QPU_A_BARRIERID, 40 },
- { 187, 187, 1 << 2, 1 << 5, V3D_QPU_A_TMUWT },
- { 187, 187, 1 << 2, 1 << 6, V3D_QPU_A_VPMWT },
- { 187, 187, 1 << 2, 1 << 7, V3D_QPU_A_FLAFIRST, 41 },
- { 187, 187, 1 << 3, 1 << 0, V3D_QPU_A_FLNAFIRST, 41 },
- { 187, 187, 1 << 3, ANYMUX, V3D_QPU_A_VPMSETUP, 33 },
-
- { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 },
- { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 },
- { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 },
- { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 },
- { 188, 188, 1 << 2, ANYMUX, V3D_QPU_A_LDVPMP, 40 },
- { 188, 188, 1 << 3, ANYMUX, V3D_QPU_A_RSQRT, 41 },
- { 188, 188, 1 << 4, ANYMUX, V3D_QPU_A_EXP, 41 },
- { 188, 188, 1 << 5, ANYMUX, V3D_QPU_A_LOG, 41 },
- { 188, 188, 1 << 6, ANYMUX, V3D_QPU_A_SIN, 41 },
- { 188, 188, 1 << 7, ANYMUX, V3D_QPU_A_RSQRT2, 41 },
- { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 },
- { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 },
+ { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMIN },
+ { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMAX },
+ { 176, 180, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMIN },
+
+ { 181, 181, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_AND },
+ { 182, 182, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_OR },
+ { 183, 183, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_XOR },
+
+ { 184, 184, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VADD },
+ { 185, 185, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VSUB },
+ { 186, 186, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_NOT },
+ { 186, 186, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_NEG },
+ { 186, 186, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_FLAPUSH },
+ { 186, 186, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FLBPUSH },
+ { 186, 186, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_FLPOP },
+ { 186, 186, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_RECIP },
+ { 186, 186, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SETMSF },
+ { 186, 186, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_SETREVF },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(1), V3D_QPU_A_TIDX },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(2), V3D_QPU_A_EIDX },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(3), V3D_QPU_A_LR },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(4), V3D_QPU_A_VFLA },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(5), V3D_QPU_A_VFLNA },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VFLB },
+ { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(7), V3D_QPU_A_VFLNB },
+
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(0, 2), V3D_QPU_A_FXCD },
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(3), V3D_QPU_A_XCD },
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(4, 6), V3D_QPU_A_FYCD },
+ { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(7), V3D_QPU_A_YCD },
+
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(0), V3D_QPU_A_MSF },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(1), V3D_QPU_A_REVF },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_VDWWT, 33 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_IID, 40 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(3), V3D_QPU_A_SAMPID, 40 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(4), V3D_QPU_A_BARRIERID, 40 },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(5), V3D_QPU_A_TMUWT },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VPMWT },
+ { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(7), V3D_QPU_A_FLAFIRST, 41 },
+ { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = OP_MASK(0), V3D_QPU_A_FLNAFIRST, 41 },
+ { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_VPMSETUP, 33 },
+
+ { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMP, 40 },
+ { 188, 188, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_EXP, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_LOG, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SIN, 41 },
+ { 188, 188, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT2, 41 },
+ { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 },
+ { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 },
/* FIXME: MORE COMPLICATED */
- /* { 190, 191, ANYMUX, ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */
+ /* { 190, 191, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */
- { 192, 239, ANYMUX, ANYMUX, V3D_QPU_A_FCMP },
- { 240, 244, ANYMUX, ANYMUX, V3D_QPU_A_VFMAX },
+ { 192, 239, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FCMP },
+ { 240, 244, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMAX },
- { 245, 245, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FROUND },
- { 245, 245, 1 << 3, ANYMUX, V3D_QPU_A_FTOIN },
- { 245, 245, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FTRUNC },
- { 245, 245, 1 << 7, ANYMUX, V3D_QPU_A_FTOIZ },
- { 246, 246, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FFLOOR },
- { 246, 246, 1 << 3, ANYMUX, V3D_QPU_A_FTOUZ },
- { 246, 246, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FCEIL },
- { 246, 246, 1 << 7, ANYMUX, V3D_QPU_A_FTOC },
+ { 245, 245, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FROUND },
+ { 245, 245, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIN },
+ { 245, 245, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FTRUNC },
+ { 245, 245, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIZ },
+ { 246, 246, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FFLOOR },
+ { 246, 246, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOUZ },
+ { 246, 246, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FCEIL },
+ { 246, 246, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOC },
- { 247, 247, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FDX },
- { 247, 247, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FDY },
+ { 247, 247, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FDX },
+ { 247, 247, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FDY },
/* The stvpms are distinguished by the waddr field. */
- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMV },
- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMD },
- { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMP },
+ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMV },
+ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMD },
+ { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMP },
+
+ { 252, 252, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_ITOF },
+ { 252, 252, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_CLZ },
+ { 252, 252, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_UTOF },
+};
+
+static const struct opcode_desc mul_ops_v33[] = {
+ { 1, 1, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_ADD },
+ { 2, 2, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SUB },
+ { 3, 3, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_UMUL24 },
+ { 4, 8, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_VFMUL },
+ { 9, 9, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SMUL24 },
+ { 10, 10, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_MULTOP },
+ { 14, 14, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 },
+ { 15, 15, .mux.b_mask = OP_RANGE(0, 3), .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 },
+ { 15, 15, .mux.b_mask = OP_MASK(4), .mux.a_mask = OP_MASK(0), V3D_QPU_M_NOP, 33, 42 },
+ { 15, 15, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_M_MOV, 33, 42 },
+
+ { 16, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMUL },
+};
+
+/* Note that it would have been possible to define all the add/mul opcodes
+ * in a single table, using first_ver/last_ver. But given how much changed
+ * for v71, separate tables are tidier. They are also smaller, which
+ * matters because right now we do a linear search over them.
+ *
+ * In case the tables are ever merged, first_ver is set to 71 for the
+ * opcodes that changed in v71.
+ */
+static const struct opcode_desc add_ops_v71[] = {
+ /* FADD is FADDNF depending on the order of the raddr_a/raddr_b. */
+ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD },
+ { 0, 47, .raddr_mask = ANYOPMASK, V3D_QPU_A_FADDNF },
+ { 53, 55, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
+ { 56, 56, .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD },
+ { 57, 59, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
+ { 60, 60, .raddr_mask = ANYOPMASK, V3D_QPU_A_SUB },
+ { 61, 63, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
+ { 64, 111, .raddr_mask = ANYOPMASK, V3D_QPU_A_FSUB },
+ { 120, 120, .raddr_mask = ANYOPMASK, V3D_QPU_A_MIN },
+ { 121, 121, .raddr_mask = ANYOPMASK, V3D_QPU_A_MAX },
+ { 122, 122, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMIN },
+ { 123, 123, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMAX },
+ { 124, 124, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHL },
+ { 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR },
+ { 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR },
+ { 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR },
+ /* FMIN is instead FMAX depending on the raddr_a/b order. */
+ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMIN },
+ { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMAX },
+ { 176, 180, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFMIN },
+
+ { 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND },
+ { 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR },
+ { 183, 183, .raddr_mask = ANYOPMASK, V3D_QPU_A_XOR },
+ { 184, 184, .raddr_mask = ANYOPMASK, V3D_QPU_A_VADD },
+ { 185, 185, .raddr_mask = ANYOPMASK, V3D_QPU_A_VSUB },
+
+ { 186, 186, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOT },
+ { 186, 186, .raddr_mask = OP_MASK(1), V3D_QPU_A_NEG },
+ { 186, 186, .raddr_mask = OP_MASK(2), V3D_QPU_A_FLAPUSH },
+ { 186, 186, .raddr_mask = OP_MASK(3), V3D_QPU_A_FLBPUSH },
+ { 186, 186, .raddr_mask = OP_MASK(4), V3D_QPU_A_FLPOP },
+ { 186, 186, .raddr_mask = OP_MASK(5), V3D_QPU_A_CLZ },
+ { 186, 186, .raddr_mask = OP_MASK(6), V3D_QPU_A_SETMSF },
+ { 186, 186, .raddr_mask = OP_MASK(7), V3D_QPU_A_SETREVF },
+
+ { 187, 187, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 },
+ { 187, 187, .raddr_mask = OP_MASK(1), V3D_QPU_A_TIDX },
+ { 187, 187, .raddr_mask = OP_MASK(2), V3D_QPU_A_EIDX },
+ { 187, 187, .raddr_mask = OP_MASK(3), V3D_QPU_A_LR },
+ { 187, 187, .raddr_mask = OP_MASK(4), V3D_QPU_A_VFLA },
+ { 187, 187, .raddr_mask = OP_MASK(5), V3D_QPU_A_VFLNA },
+ { 187, 187, .raddr_mask = OP_MASK(6), V3D_QPU_A_VFLB },
+ { 187, 187, .raddr_mask = OP_MASK(7), V3D_QPU_A_VFLNB },
+ { 187, 187, .raddr_mask = OP_MASK(8), V3D_QPU_A_XCD },
+ { 187, 187, .raddr_mask = OP_MASK(9), V3D_QPU_A_YCD },
+ { 187, 187, .raddr_mask = OP_MASK(10), V3D_QPU_A_MSF },
+ { 187, 187, .raddr_mask = OP_MASK(11), V3D_QPU_A_REVF },
+ { 187, 187, .raddr_mask = OP_MASK(12), V3D_QPU_A_IID },
+ { 187, 187, .raddr_mask = OP_MASK(13), V3D_QPU_A_SAMPID },
+ { 187, 187, .raddr_mask = OP_MASK(14), V3D_QPU_A_BARRIERID },
+ { 187, 187, .raddr_mask = OP_MASK(15), V3D_QPU_A_TMUWT },
+ { 187, 187, .raddr_mask = OP_MASK(16), V3D_QPU_A_VPMWT },
+ { 187, 187, .raddr_mask = OP_MASK(17), V3D_QPU_A_FLAFIRST },
+ { 187, 187, .raddr_mask = OP_MASK(18), V3D_QPU_A_FLNAFIRST },
+
+ { 187, 187, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FXCD },
+ { 187, 187, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FYCD },
+
+ { 188, 188, .raddr_mask = OP_MASK(0), V3D_QPU_A_LDVPMV_IN, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(1), V3D_QPU_A_LDVPMD_IN, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(2), V3D_QPU_A_LDVPMP, 71 },
+
+ { 188, 188, .raddr_mask = OP_MASK(32), V3D_QPU_A_RECIP, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(33), V3D_QPU_A_RSQRT, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(34), V3D_QPU_A_EXP, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(35), V3D_QPU_A_LOG, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(36), V3D_QPU_A_SIN, 71 },
+ { 188, 188, .raddr_mask = OP_MASK(37), V3D_QPU_A_RSQRT2, 71 },
+
+ { 189, 189, .raddr_mask = ANYOPMASK, V3D_QPU_A_LDVPMG_IN, 71 },
- { 252, 252, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_ITOF },
- { 252, 252, 1 << 3, ANYMUX, V3D_QPU_A_CLZ },
- { 252, 252, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_UTOF },
+ /* The stvpms are distinguished by the waddr field. */
+ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMV, 71},
+ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMD, 71},
+ { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMP, 71},
+
+ { 192, 207, .raddr_mask = ANYOPMASK, V3D_QPU_A_FCMP, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FROUND, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FROUND, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FROUND, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FROUND, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(3), V3D_QPU_A_FTOIN, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(7), V3D_QPU_A_FTOIN, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(11), V3D_QPU_A_FTOIN, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(15), V3D_QPU_A_FTOIN, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FTRUNC, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FTRUNC, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FTRUNC, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FTRUNC, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(19), V3D_QPU_A_FTOIZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(23), V3D_QPU_A_FTOIZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(27), V3D_QPU_A_FTOIZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(31), V3D_QPU_A_FTOIZ, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FFLOOR, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FFLOOR, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(40, 42), V3D_QPU_A_FFLOOR, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(44, 46), V3D_QPU_A_FFLOOR, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(35), V3D_QPU_A_FTOUZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(39), V3D_QPU_A_FTOUZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(43), V3D_QPU_A_FTOUZ, 71 },
+ { 245, 245, .raddr_mask = OP_MASK(47), V3D_QPU_A_FTOUZ, 71 },
+
+ { 245, 245, .raddr_mask = OP_RANGE(48, 50), V3D_QPU_A_FCEIL, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(52, 54), V3D_QPU_A_FCEIL, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(56, 58), V3D_QPU_A_FCEIL, 71 },
+ { 245, 245, .raddr_mask = OP_RANGE(60, 62), V3D_QPU_A_FCEIL, 71 },
+
+ { 245, 245, .raddr_mask = OP_MASK(51), V3D_QPU_A_FTOC },
+ { 245, 245, .raddr_mask = OP_MASK(55), V3D_QPU_A_FTOC },
+ { 245, 245, .raddr_mask = OP_MASK(59), V3D_QPU_A_FTOC },
+ { 245, 245, .raddr_mask = OP_MASK(63), V3D_QPU_A_FTOC },
+
+ { 246, 246, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FDX, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FDY, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FDY, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FDY, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FDY, 71 },
+
+ { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 },
+ { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 },
+
+ { 247, 247, .raddr_mask = ANYOPMASK, V3D_QPU_A_VPACK, 71 },
+ { 248, 248, .raddr_mask = ANYOPMASK, V3D_QPU_A_V8PACK, 71 },
+
+ { 249, 249, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FMOV, 71 },
+ { 249, 249, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FMOV, 71 },
+
+ { 249, 249, .raddr_mask = OP_MASK(3), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(7), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(11), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 },
+ { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 },
+
+ { 250, 250, .raddr_mask = ANYOPMASK, V3D_QPU_A_V10PACK, 71 },
+ { 251, 251, .raddr_mask = ANYOPMASK, V3D_QPU_A_V11FPACK, 71 },
};
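[Editor's note] In this table, one-source ops share an opcode byte and
are disambiguated purely by raddr_b: opcode 186 with raddr 5 now selects
CLZ, where the same slot held RECIP on 3.3/4.x (the legacy SFU ops moved
to opcode 188, raddrs 32-37). A simplified matcher with the same shape as
lookup_opcode_from_packed() below (editorial sketch, not part of the
patch):

    static const struct opcode_desc *
    find_add_op_v71(uint32_t opcode, uint32_t raddr)
    {
            /* e.g. find_add_op_v71(186, 5)->op == V3D_QPU_A_CLZ */
            for (unsigned i = 0; i < ARRAY_SIZE(add_ops_v71); i++) {
                    const struct opcode_desc *d = &add_ops_v71[i];
                    if (opcode >= d->opcode_first && opcode <= d->opcode_last &&
                        (d->raddr_mask & ((uint64_t)1 << raddr)))
                            return d;
            }
            return NULL;
    }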
-static const struct opcode_desc mul_ops[] = {
- { 1, 1, ANYMUX, ANYMUX, V3D_QPU_M_ADD },
- { 2, 2, ANYMUX, ANYMUX, V3D_QPU_M_SUB },
- { 3, 3, ANYMUX, ANYMUX, V3D_QPU_M_UMUL24 },
- { 4, 8, ANYMUX, ANYMUX, V3D_QPU_M_VFMUL },
- { 9, 9, ANYMUX, ANYMUX, V3D_QPU_M_SMUL24 },
- { 10, 10, ANYMUX, ANYMUX, V3D_QPU_M_MULTOP },
- { 14, 14, ANYMUX, ANYMUX, V3D_QPU_M_FMOV },
- { 15, 15, MUX_MASK(0, 3), ANYMUX, V3D_QPU_M_FMOV },
- { 15, 15, 1 << 4, 1 << 0, V3D_QPU_M_NOP, 0 },
- { 15, 15, 1 << 7, ANYMUX, V3D_QPU_M_MOV },
- { 16, 63, ANYMUX, ANYMUX, V3D_QPU_M_FMUL },
+static const struct opcode_desc mul_ops_v71[] = {
+ /* For V3D 7.1, the second mask field is ignored */
+ { 1, 1, .raddr_mask = ANYOPMASK, V3D_QPU_M_ADD, 71 },
+ { 2, 2, .raddr_mask = ANYOPMASK, V3D_QPU_M_SUB, 71 },
+ { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 },
+ { 4, 8, .raddr_mask = ANYOPMASK, V3D_QPU_M_VFMUL, 71 },
+ { 9, 9, .raddr_mask = ANYOPMASK, V3D_QPU_M_SMUL24, 71 },
+ { 10, 10, .raddr_mask = ANYOPMASK, V3D_QPU_M_MULTOP, 71 },
+
+ { 14, 14, .raddr_mask = OP_RANGE(0, 2), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(4, 6), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(8, 10), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_M_FMOV, 71 },
+ { 14, 14, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_M_FMOV, 71 },
+
+ { 14, 14, .raddr_mask = OP_MASK(3), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(7), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(11), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 },
+
+ { 14, 14, .raddr_mask = OP_MASK(32), V3D_QPU_M_FTOUNORM16, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(33), V3D_QPU_M_FTOSNORM16, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(34), V3D_QPU_M_VFTOUNORM8, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(35), V3D_QPU_M_VFTOSNORM8, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(48), V3D_QPU_M_VFTOUNORM10LO, 71 },
+ { 14, 14, .raddr_mask = OP_MASK(49), V3D_QPU_M_VFTOUNORM10HI, 71 },
+
+ { 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 },
+
+ { 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL },
};
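[Editor's note] The v71 mul table multiplexes nearly all unary ops under
opcode 14 via raddr_d. Read straight from the table above:

    /* opcode 14, raddr_d 0-22 (low 2 bits != 3) -> V3D_QPU_M_FMOV (pack/unpack in raddr_d)
     * opcode 14, raddr_d 3/7/11/15/19           -> V3D_QPU_M_MOV
     * opcode 14, raddr_d 32-35, 48-49           -> the new norm conversions
     * opcode 14, raddr_d 63                     -> V3D_QPU_M_NOP
     */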
/* Returns true if op_desc should be filtered out based on devinfo->ver
@@ -589,17 +845,23 @@ static const struct opcode_desc mul_ops[] = {
*/
static bool
opcode_invalid_in_version(const struct v3d_device_info *devinfo,
- const struct opcode_desc *op_desc)
+ const uint8_t first_ver,
+ const uint8_t last_ver)
{
- return (op_desc->first_ver != 0 && devinfo->ver < op_desc->first_ver) ||
- (op_desc->last_ver != 0 && devinfo->ver > op_desc->last_ver);
+ return (first_ver != 0 && devinfo->ver < first_ver) ||
+ (last_ver != 0 && devinfo->ver > last_ver);
}
+/* Note that we pass mux_a, mux_b and raddr as parameters even though,
+ * depending on devinfo->ver, some of them are ignored. We do it this way
+ * to avoid having two nearly identical lookup_opcode methods.
+ */
static const struct opcode_desc *
lookup_opcode_from_packed(const struct v3d_device_info *devinfo,
const struct opcode_desc *opcodes,
size_t num_opcodes, uint32_t opcode,
- uint32_t mux_a, uint32_t mux_b)
+ uint32_t mux_a, uint32_t mux_b,
+ uint32_t raddr)
{
for (int i = 0; i < num_opcodes; i++) {
const struct opcode_desc *op_desc = &opcodes[i];
@@ -608,14 +870,19 @@ lookup_opcode_from_packed(const struct v3d_device_info *devinfo,
opcode > op_desc->opcode_last)
continue;
- if (opcode_invalid_in_version(devinfo, op_desc))
+ if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver))
continue;
- if (!(op_desc->mux_b_mask & (1 << mux_b)))
- continue;
+ if (devinfo->ver < 71) {
+ if (!(op_desc->mux.b_mask & (1 << mux_b)))
+ continue;
- if (!(op_desc->mux_a_mask & (1 << mux_a)))
- continue;
+ if (!(op_desc->mux.a_mask & (1 << mux_a)))
+ continue;
+ } else {
+ if (!(op_desc->raddr_mask & ((uint64_t) 1 << raddr)))
+ continue;
+ }
return op_desc;
}
@@ -667,6 +934,56 @@ v3d_qpu_float32_unpack_pack(enum v3d_qpu_input_unpack unpacked,
}
}
+static bool
+v3d_qpu_int32_unpack_unpack(uint32_t packed,
+ enum v3d_qpu_input_unpack *unpacked)
+{
+ switch (packed) {
+ case 0:
+ *unpacked = V3D_QPU_UNPACK_NONE;
+ return true;
+ case 1:
+ *unpacked = V3D_QPU_UNPACK_UL;
+ return true;
+ case 2:
+ *unpacked = V3D_QPU_UNPACK_UH;
+ return true;
+ case 3:
+ *unpacked = V3D_QPU_UNPACK_IL;
+ return true;
+ case 4:
+ *unpacked = V3D_QPU_UNPACK_IH;
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool
+v3d_qpu_int32_unpack_pack(enum v3d_qpu_input_unpack unpacked,
+ uint32_t *packed)
+{
+ switch (unpacked) {
+ case V3D_QPU_UNPACK_NONE:
+ *packed = 0;
+ return true;
+ case V3D_QPU_UNPACK_UL:
+ *packed = 1;
+ return true;
+ case V3D_QPU_UNPACK_UH:
+ *packed = 2;
+ return true;
+ case V3D_QPU_UNPACK_IL:
+ *packed = 3;
+ return true;
+ case V3D_QPU_UNPACK_IH:
+ *packed = 4;
+ return true;
+ default:
+ return false;
+ }
+}
+
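[Editor's note] The two new helpers are exact inverses; packed values
0..4 select NONE/UL/UH/IL/IH. A round-trip sketch (editorial, not part of
the patch):

    #include <assert.h>
    #include <stdint.h>

    static void
    check_int32_unpack_roundtrip(void)
    {
            for (uint32_t packed = 0; packed <= 4; packed++) {
                    enum v3d_qpu_input_unpack unpack;
                    uint32_t repacked;

                    assert(v3d_qpu_int32_unpack_unpack(packed, &unpack));
                    assert(v3d_qpu_int32_unpack_pack(unpack, &repacked));
                    assert(repacked == packed);
            }
    }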
static bool
v3d_qpu_float16_unpack_unpack(uint32_t packed,
enum v3d_qpu_input_unpack *unpacked)
@@ -737,8 +1054,8 @@ v3d_qpu_float32_pack_pack(enum v3d_qpu_input_unpack unpacked,
}
static bool
-v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
- struct v3d_qpu_instr *instr)
+v3d33_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
{
uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD);
uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_ADD_A);
@@ -755,8 +1072,9 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
map_op = (map_op - 253 + 245);
const struct opcode_desc *desc =
- lookup_opcode_from_packed(devinfo, add_ops, ARRAY_SIZE(add_ops),
- map_op, mux_a, mux_b);
+ lookup_opcode_from_packed(devinfo, add_ops_v33,
+ ARRAY_SIZE(add_ops_v33),
+ map_op, mux_a, mux_b, 0);
if (!desc)
return false;
@@ -812,12 +1130,12 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
- &instr->alu.add.b_unpack)) {
+ &instr->alu.add.b.unpack)) {
return false;
}
break;
@@ -831,7 +1149,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = mux_b & 0x3;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
break;
@@ -843,7 +1161,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
break;
@@ -851,23 +1169,23 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
case V3D_QPU_A_VFMIN:
case V3D_QPU_A_VFMAX:
if (!v3d_qpu_float16_unpack_unpack(op & 0x7,
- &instr->alu.add.a_unpack)) {
+ &instr->alu.add.a.unpack)) {
return false;
}
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
break;
default:
instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
- instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
break;
}
- instr->alu.add.a = mux_a;
- instr->alu.add.b = mux_b;
+ instr->alu.add.a.mux = mux_a;
+ instr->alu.add.b.mux = mux_b;
instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
instr->alu.add.magic_write = false;
@@ -892,8 +1210,194 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
}
static bool
-v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
+{
+ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD);
+ uint32_t raddr_a = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_A);
+ uint32_t raddr_b = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_B);
+ uint32_t waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
+ uint32_t map_op = op;
+
+ const struct opcode_desc *desc =
+ lookup_opcode_from_packed(devinfo,
+ add_ops_v71,
+ ARRAY_SIZE(add_ops_v71),
+ map_op, 0, 0,
+ raddr_b);
+ if (!desc)
+ return false;
+
+ instr->alu.add.op = desc->op;
+
+ /* FADD/FADDNF and FMIN/FMAX are determined by the order of the
+ * operands.
+ */
+ if (instr->sig.small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a >
+ instr->sig.small_imm_b * 256 + (op & 3) * 64 + raddr_b) {
+ if (instr->alu.add.op == V3D_QPU_A_FMIN)
+ instr->alu.add.op = V3D_QPU_A_FMAX;
+ if (instr->alu.add.op == V3D_QPU_A_FADD)
+ instr->alu.add.op = V3D_QPU_A_FADDNF;
+ }
+
+ /* Some QPU ops require a bit more than just basic opcode and mux a/b
+ * comparisons to distinguish them.
+ */
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_STVPMV:
+ case V3D_QPU_A_STVPMD:
+ case V3D_QPU_A_STVPMP:
+ switch (waddr) {
+ case 0:
+ instr->alu.add.op = V3D_QPU_A_STVPMV;
+ break;
+ case 1:
+ instr->alu.add.op = V3D_QPU_A_STVPMD;
+ break;
+ case 2:
+ instr->alu.add.op = V3D_QPU_A_STVPMP;
+ break;
+ default:
+ return false;
+ }
+ break;
+ default:
+ break;
+ }
+
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_FADD:
+ case V3D_QPU_A_FADDNF:
+ case V3D_QPU_A_FSUB:
+ case V3D_QPU_A_FMIN:
+ case V3D_QPU_A_FMAX:
+ case V3D_QPU_A_FCMP:
+ case V3D_QPU_A_VFPACK:
+ if (instr->alu.add.op != V3D_QPU_A_VFPACK &&
+ instr->alu.add.op != V3D_QPU_A_FCMP) {
+ instr->alu.add.output_pack = (op >> 4) & 0x3;
+ } else {
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ }
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
+ &instr->alu.add.b.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_FFLOOR:
+ case V3D_QPU_A_FROUND:
+ case V3D_QPU_A_FTRUNC:
+ case V3D_QPU_A_FCEIL:
+ case V3D_QPU_A_FDX:
+ case V3D_QPU_A_FDY:
+ instr->alu.add.output_pack = raddr_b & 0x3;
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_FTOIN:
+ case V3D_QPU_A_FTOIZ:
+ case V3D_QPU_A_FTOUZ:
+ case V3D_QPU_A_FTOC:
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_float32_unpack_unpack((raddr_b >> 2) & 0x3,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_VFMIN:
+ case V3D_QPU_A_VFMAX:
+ unreachable("pending v71 update");
+ if (!v3d_qpu_float16_unpack_unpack(op & 0x7,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+ break;
+
+ case V3D_QPU_A_MOV:
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_int32_unpack_unpack((raddr_b >> 2) & 0x7,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ case V3D_QPU_A_FMOV:
+ instr->alu.add.output_pack = raddr_b & 0x3;
+
+ /* The mul ALU FMOV has one additional unpack variant */
+ int32_t unpack = (raddr_b >> 2) & 0x7;
+ if (unpack == 7)
+ return false;
+
+ if (!v3d_qpu_float32_unpack_unpack(unpack,
+ &instr->alu.add.a.unpack)) {
+ return false;
+ }
+ break;
+
+ default:
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+ break;
+ }
+
+ instr->alu.add.a.raddr = raddr_a;
+ instr->alu.add.b.raddr = raddr_b;
+ instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
+
+ instr->alu.add.magic_write = false;
+ if (packed_inst & V3D_QPU_MA) {
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_LDVPMV_IN:
+ instr->alu.add.op = V3D_QPU_A_LDVPMV_OUT;
+ break;
+ case V3D_QPU_A_LDVPMD_IN:
+ instr->alu.add.op = V3D_QPU_A_LDVPMD_OUT;
+ break;
+ case V3D_QPU_A_LDVPMG_IN:
+ instr->alu.add.op = V3D_QPU_A_LDVPMG_OUT;
+ break;
+ default:
+ instr->alu.add.magic_write = true;
+ break;
+ }
+ }
+
+ return true;
+}
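[Editor's note] The operand-order trick above merits a worked example.
Both unpacker and packer rank each source with the same key, and if the A
key is larger, an FMIN-range encoding means FMAX and an FADD-range
encoding means FADDNF. Illustrative helper (hypothetical name):

    static inline unsigned
    v71_src_order_key(bool small_imm, uint32_t unpack_bits, uint32_t raddr)
    {
            return small_imm * 256 + unpack_bits * 64 + raddr;
    }

    /* E.g. raddr_a = 7, raddr_b = 3, no small immediates, no unpack bits:
     * key_a = 7 > key_b = 3, so the FMIN slot decodes as FMAX and the FADD
     * slot as FADDNF. The packer applies the inverse swap (including
     * small_imm_a/b), keeping pack(unpack(x)) stable. */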
+
+static bool
+v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
struct v3d_qpu_instr *instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_add_unpack(devinfo, packed_inst, instr);
+ else
+ return v3d71_qpu_add_unpack(devinfo, packed_inst, instr);
+}
+
+static bool
+v3d33_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
{
uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL);
uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_A);
@@ -901,9 +1405,10 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
{
const struct opcode_desc *desc =
- lookup_opcode_from_packed(devinfo, mul_ops,
- ARRAY_SIZE(mul_ops),
- op, mux_a, mux_b);
+ lookup_opcode_from_packed(devinfo,
+ mul_ops_v33,
+ ARRAY_SIZE(mul_ops_v33),
+ op, mux_a, mux_b, 0);
if (!desc)
return false;
@@ -915,12 +1420,12 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
- &instr->alu.mul.b_unpack)) {
+ &instr->alu.mul.b.unpack)) {
return false;
}
@@ -931,7 +1436,7 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
((mux_b >> 2) & 1));
if (!v3d_qpu_float32_unpack_unpack(mux_b & 0x3,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
@@ -941,74 +1446,169 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7,
- &instr->alu.mul.a_unpack)) {
+ &instr->alu.mul.a.unpack)) {
return false;
}
- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
break;
default:
instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
- instr->alu.mul.a_unpack = V3D_QPU_UNPACK_NONE;
- instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
break;
}
- instr->alu.mul.a = mux_a;
- instr->alu.mul.b = mux_b;
+ instr->alu.mul.a.mux = mux_a;
+ instr->alu.mul.b.mux = mux_b;
instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M);
instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM;
return true;
}
-static const struct opcode_desc *
-lookup_opcode_from_instr(const struct v3d_device_info *devinfo,
- const struct opcode_desc *opcodes, size_t num_opcodes,
- uint8_t op)
+static bool
+v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
{
- for (int i = 0; i < num_opcodes; i++) {
- const struct opcode_desc *op_desc = &opcodes[i];
-
- if (op_desc->op != op)
- continue;
+ uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL);
+ uint32_t raddr_c = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_C);
+ uint32_t raddr_d = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_D);
- if (opcode_invalid_in_version(devinfo, op_desc))
- continue;
+ {
+ const struct opcode_desc *desc =
+ lookup_opcode_from_packed(devinfo,
+ mul_ops_v71,
+ ARRAY_SIZE(mul_ops_v71),
+ op, 0, 0,
+ raddr_d);
+ if (!desc)
+ return false;
- return op_desc;
+ instr->alu.mul.op = desc->op;
}
- return NULL;
-}
-
+ switch (instr->alu.mul.op) {
+ case V3D_QPU_M_FMUL:
+ instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1;
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
+ &instr->alu.mul.b.unpack)) {
+ return false;
+ }
+
+ break;
+
+ case V3D_QPU_M_FMOV:
+ instr->alu.mul.output_pack = raddr_d & 0x3;
+
+ if (!v3d_qpu_float32_unpack_unpack((raddr_d >> 2) & 0x7,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+
+ break;
+
+ case V3D_QPU_M_VFMUL:
+ unreachable("pending v71 update");
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+
+ break;
+
+ case V3D_QPU_M_MOV:
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+
+ if (!v3d_qpu_int32_unpack_unpack((raddr_d >> 2) & 0x7,
+ &instr->alu.mul.a.unpack)) {
+ return false;
+ }
+ break;
+
+ default:
+ instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+ instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+ instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+ break;
+ }
+
+ instr->alu.mul.a.raddr = raddr_c;
+ instr->alu.mul.b.raddr = raddr_d;
+ instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M);
+ instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM;
+
+ return true;
+}
+
static bool
-v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
- const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ struct v3d_qpu_instr *instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_mul_unpack(devinfo, packed_inst, instr);
+ else
+ return v3d71_qpu_mul_unpack(devinfo, packed_inst, instr);
+}
+
+static const struct opcode_desc *
+lookup_opcode_from_instr(const struct v3d_device_info *devinfo,
+ const struct opcode_desc *opcodes, size_t num_opcodes,
+ uint8_t op)
+{
+ for (int i = 0; i < num_opcodes; i++) {
+ const struct opcode_desc *op_desc = &opcodes[i];
+
+ if (op_desc->op != op)
+ continue;
+
+ if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver))
+ continue;
+
+ return op_desc;
+ }
+
+ return NULL;
+}
+
+static bool
+v3d33_qpu_add_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
{
uint32_t waddr = instr->alu.add.waddr;
- uint32_t mux_a = instr->alu.add.a;
- uint32_t mux_b = instr->alu.add.b;
+ uint32_t mux_a = instr->alu.add.a.mux;
+ uint32_t mux_b = instr->alu.add.b.mux;
int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op);
const struct opcode_desc *desc =
- lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops),
+ lookup_opcode_from_instr(devinfo, add_ops_v33,
+ ARRAY_SIZE(add_ops_v33),
instr->alu.add.op);
if (!desc)
return false;
- uint32_t opcode = desc->opcode_first;
+ uint32_t opcode = desc->opcode_first;
/* If an operation doesn't use an arg, its mux values may be used to
* identify the operation type.
*/
if (nsrc < 2)
- mux_b = ffs(desc->mux_b_mask) - 1;
+ mux_b = ffs(desc->mux.b_mask) - 1;
if (nsrc < 1)
- mux_a = ffs(desc->mux_a_mask) - 1;
+ mux_a = ffs(desc->mux.a_mask) - 1;
bool no_magic_write = false;
@@ -1061,12 +1661,12 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
opcode |= output_pack << 4;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&a_unpack)) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
&b_unpack)) {
return false;
}
@@ -1100,23 +1700,23 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
uint32_t a_unpack;
uint32_t b_unpack;
- if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS ||
- instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) {
+ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS ||
+ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&a_unpack)) {
return false;
}
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
&b_unpack)) {
return false;
}
- opcode = (opcode & ~(1 << 2)) | (a_unpack << 2);
- opcode = (opcode & ~(1 << 0)) | (b_unpack << 0);
+ opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2);
+ opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0);
break;
}
@@ -1135,13 +1735,13 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
mux_b |= packed;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
if (packed == 0)
return false;
- opcode = (opcode & ~(1 << 2)) | packed << 2;
+ opcode = (opcode & ~(0x3 << 2)) | packed << 2;
break;
}
@@ -1153,7 +1753,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
return false;
uint32_t packed;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
@@ -1166,11 +1766,11 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
case V3D_QPU_A_VFMIN:
case V3D_QPU_A_VFMAX:
if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE) {
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) {
return false;
}
- if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a_unpack,
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack,
&packed)) {
return false;
}
@@ -1180,8 +1780,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
default:
if (instr->alu.add.op != V3D_QPU_A_NOP &&
(instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
- instr->alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
- instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE)) {
+ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) {
return false;
}
break;
@@ -1198,15 +1798,280 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
static bool
-v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
- const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
{
- uint32_t mux_a = instr->alu.mul.a;
- uint32_t mux_b = instr->alu.mul.b;
+ uint32_t waddr = instr->alu.add.waddr;
+ uint32_t raddr_a = instr->alu.add.a.raddr;
+ uint32_t raddr_b = instr->alu.add.b.raddr;
+
+ int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op);
+ const struct opcode_desc *desc =
+ lookup_opcode_from_instr(devinfo, add_ops_v71,
+ ARRAY_SIZE(add_ops_v71),
+ instr->alu.add.op);
+ if (!desc)
+ return false;
+
+ uint32_t opcode = desc->opcode_first;
+
+ /* If an operation doesn't use an arg, its raddr values may be used to
+ * identify the operation type.
+ */
+ if (nsrc < 2)
+ raddr_b = ffsll(desc->raddr_mask) - 1;
+
+ bool no_magic_write = false;
+
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_STVPMV:
+ waddr = 0;
+ no_magic_write = true;
+ break;
+ case V3D_QPU_A_STVPMD:
+ waddr = 1;
+ no_magic_write = true;
+ break;
+ case V3D_QPU_A_STVPMP:
+ waddr = 2;
+ no_magic_write = true;
+ break;
+
+ case V3D_QPU_A_LDVPMV_IN:
+ case V3D_QPU_A_LDVPMD_IN:
+ case V3D_QPU_A_LDVPMP:
+ case V3D_QPU_A_LDVPMG_IN:
+ assert(!instr->alu.add.magic_write);
+ break;
+
+ case V3D_QPU_A_LDVPMV_OUT:
+ case V3D_QPU_A_LDVPMD_OUT:
+ case V3D_QPU_A_LDVPMG_OUT:
+ assert(!instr->alu.add.magic_write);
+ *packed_instr |= V3D_QPU_MA;
+ break;
+
+ default:
+ break;
+ }
+
+ switch (instr->alu.add.op) {
+ case V3D_QPU_A_FADD:
+ case V3D_QPU_A_FADDNF:
+ case V3D_QPU_A_FSUB:
+ case V3D_QPU_A_FMIN:
+ case V3D_QPU_A_FMAX:
+ case V3D_QPU_A_FCMP: {
+ uint32_t output_pack;
+ uint32_t a_unpack;
+ uint32_t b_unpack;
+
+ if (instr->alu.add.op != V3D_QPU_A_FCMP) {
+ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+ &output_pack)) {
+ return false;
+ }
+ opcode |= output_pack << 4;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &a_unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
+ &b_unpack)) {
+ return false;
+ }
+
+ /* These operations with commutative operands are
+ * distinguished by which order their operands come in.
+ */
+ bool ordering =
+ instr->sig.small_imm_a * 256 + a_unpack * 64 + raddr_a >
+ instr->sig.small_imm_b * 256 + b_unpack * 64 + raddr_b;
+ if (((instr->alu.add.op == V3D_QPU_A_FMIN ||
+ instr->alu.add.op == V3D_QPU_A_FADD) && ordering) ||
+ ((instr->alu.add.op == V3D_QPU_A_FMAX ||
+ instr->alu.add.op == V3D_QPU_A_FADDNF) && !ordering)) {
+ uint32_t temp;
+
+ temp = a_unpack;
+ a_unpack = b_unpack;
+ b_unpack = temp;
+
+ temp = raddr_a;
+ raddr_a = raddr_b;
+ raddr_b = temp;
+
+ /* If we are swapping raddr_a/b we also need to swap
+ * small_imm_a/b.
+ */
+ if (instr->sig.small_imm_a || instr->sig.small_imm_b) {
+ assert(instr->sig.small_imm_a !=
+ instr->sig.small_imm_b);
+ struct v3d_qpu_sig new_sig = instr->sig;
+ new_sig.small_imm_a = !instr->sig.small_imm_a;
+ new_sig.small_imm_b = !instr->sig.small_imm_b;
+ uint32_t sig;
+ if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
+ return false;
+ *packed_instr &= ~V3D_QPU_SIG_MASK;
+ *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG);
+ }
+ }
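+ /* e.g. with no small immediates and no unpack modifiers, FMIN with
+ * raddr_a = 5 and raddr_b = 3 has ordering keys 5 > 3, so its operands
+ * are stored swapped: FMIN ends up with ascending keys and FMAX with
+ * descending ones, which is how the decoder tells the two apart.
+ */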
+
+ opcode |= a_unpack << 2;
+ opcode |= b_unpack << 0;
+
+ break;
+ }
+
+ case V3D_QPU_A_VFPACK: {
+ uint32_t a_unpack;
+ uint32_t b_unpack;
+
+ if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS ||
+ instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &a_unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
+ &b_unpack)) {
+ return false;
+ }
+
+ opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2);
+ opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0);
+
+ break;
+ }
+
+ case V3D_QPU_A_FFLOOR:
+ case V3D_QPU_A_FROUND:
+ case V3D_QPU_A_FTRUNC:
+ case V3D_QPU_A_FCEIL:
+ case V3D_QPU_A_FDX:
+ case V3D_QPU_A_FDY: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+ &packed)) {
+ return false;
+ }
+ raddr_b |= packed;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ if (packed == 0)
+ return false;
+ raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2;
+ break;
+ }
+
+ case V3D_QPU_A_FTOIN:
+ case V3D_QPU_A_FTOIZ:
+ case V3D_QPU_A_FTOUZ:
+ case V3D_QPU_A_FTOC:
+ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ uint32_t packed;
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ if (packed == 0)
+ return false;
+
+ raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2;
+
+ break;
+
+ case V3D_QPU_A_VFMIN:
+ case V3D_QPU_A_VFMAX:
+ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) {
+ return false;
+ }
+
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ opcode |= packed;
+ break;
+
+ case V3D_QPU_A_MOV: {
+ uint32_t packed;
+
+ if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ if (!v3d_qpu_int32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+
+ raddr_b |= packed << 2;
+ break;
+ }
+
+ case V3D_QPU_A_FMOV: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+ &packed)) {
+ return false;
+ }
+ raddr_b = packed;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+ &packed)) {
+ return false;
+ }
+ raddr_b |= packed << 2;
+ break;
+ }
+
+ default:
+ if (instr->alu.add.op != V3D_QPU_A_NOP &&
+ (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) {
+ return false;
+ }
+ break;
+ }
+
+ *packed_instr |= QPU_SET_FIELD(raddr_a, V3D_QPU_RADDR_A);
+ *packed_instr |= QPU_SET_FIELD(raddr_b, V3D_QPU_RADDR_B);
+ *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_ADD);
+ *packed_instr |= QPU_SET_FIELD(waddr, V3D_QPU_WADDR_A);
+ if (instr->alu.add.magic_write && !no_magic_write)
+ *packed_instr |= V3D_QPU_MA;
+
+ return true;
+}
+
+static bool
+v3d33_qpu_mul_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ uint32_t mux_a = instr->alu.mul.a.mux;
+ uint32_t mux_b = instr->alu.mul.b.mux;
int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op);
const struct opcode_desc *desc =
- lookup_opcode_from_instr(devinfo, mul_ops, ARRAY_SIZE(mul_ops),
+ lookup_opcode_from_instr(devinfo, mul_ops_v33,
+ ARRAY_SIZE(mul_ops_v33),
instr->alu.mul.op);
if (!desc)
@@ -1218,10 +2083,10 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
* that here. If mux a/b determine packing, it will be set below.
*/
if (nsrc < 2)
- mux_b = ffs(desc->mux_b_mask) - 1;
+ mux_b = ffs(desc->mux.b_mask) - 1;
if (nsrc < 1)
- mux_a = ffs(desc->mux_a_mask) - 1;
+ mux_a = ffs(desc->mux.a_mask) - 1;
switch (instr->alu.mul.op) {
case V3D_QPU_M_FMUL: {
@@ -1236,13 +2101,13 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
*/
opcode += packed << 4;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
opcode |= packed << 2;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack,
&packed)) {
return false;
}
@@ -1260,7 +2125,7 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
opcode |= (packed >> 1) & 1;
mux_b = (packed & 1) << 2;
- if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
@@ -1274,22 +2139,28 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
return false;
- if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a_unpack,
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack,
&packed)) {
return false;
}
- if (instr->alu.mul.a_unpack == V3D_QPU_UNPACK_SWAP_16)
+ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16)
opcode = 8;
else
opcode |= (packed + 4) & 7;
- if (instr->alu.mul.b_unpack != V3D_QPU_UNPACK_NONE)
+ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)
return false;
break;
}
default:
+ if (instr->alu.mul.op != V3D_QPU_M_NOP &&
+ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) {
+ return false;
+ }
break;
}
@@ -1304,6 +2175,150 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
return true;
}
+static bool
+v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ uint32_t raddr_c = instr->alu.mul.a.raddr;
+ uint32_t raddr_d = instr->alu.mul.b.raddr;
+ int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op);
+
+ const struct opcode_desc *desc =
+ lookup_opcode_from_instr(devinfo, mul_ops_v71,
+ ARRAY_SIZE(mul_ops_v71),
+ instr->alu.mul.op);
+ if (!desc)
+ return false;
+
+ uint32_t opcode = desc->opcode_first;
+
+ /* Some opcodes have a single valid value for their raddr_d, so set
+ * that here. If raddr_d determines packing, it will be set below.
+ */
+ if (nsrc < 2)
+ raddr_d = ffsll(desc->raddr_mask) - 1;
+
+ switch (instr->alu.mul.op) {
+ case V3D_QPU_M_FMUL: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack,
+ &packed)) {
+ return false;
+ }
+ /* No need for a +1 because desc->opcode_first has a 1 in this
+ * field.
+ */
+ opcode += packed << 4;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+ opcode |= packed << 2;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack,
+ &packed)) {
+ return false;
+ }
+ opcode |= packed << 0;
+ break;
+ }
+
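+ /* For FMOV there is no real second source, so the raddr_d field is
+ * repurposed: bits 1:0 carry the output pack mode and bits 3:2 the
+ * input unpack mode.
+ */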
+ case V3D_QPU_M_FMOV: {
+ uint32_t packed;
+
+ if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack,
+ &packed)) {
+ return false;
+ }
+ raddr_d |= packed;
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+ raddr_d |= packed << 2;
+ break;
+ }
+
+ case V3D_QPU_M_VFMUL: {
+ unreachable("pending v71 update");
+ uint32_t packed;
+
+ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+ if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16)
+ opcode = 8;
+ else
+ opcode |= (packed + 4) & 7;
+
+ if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)
+ return false;
+
+ break;
+ }
+
+ case V3D_QPU_M_MOV: {
+ uint32_t packed;
+
+ if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
+ return false;
+
+ if (!v3d_qpu_int32_unpack_pack(instr->alu.mul.a.unpack,
+ &packed)) {
+ return false;
+ }
+
+ raddr_d |= packed << 2;
+ break;
+ }
+
+ default:
+ if (instr->alu.mul.op != V3D_QPU_M_NOP &&
+ (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE ||
+ instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+ instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) {
+ return false;
+ }
+ break;
+ }
+
+ *packed_instr |= QPU_SET_FIELD(raddr_c, V3D_QPU_RADDR_C);
+ *packed_instr |= QPU_SET_FIELD(raddr_d, V3D_QPU_RADDR_D);
+ *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_MUL);
+ *packed_instr |= QPU_SET_FIELD(instr->alu.mul.waddr, V3D_QPU_WADDR_M);
+ if (instr->alu.mul.magic_write)
+ *packed_instr |= V3D_QPU_MM;
+
+ return true;
+}
+
+static bool
+v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_add_pack(devinfo, instr, packed_instr);
+ else
+ return v3d71_qpu_add_pack(devinfo, instr, packed_instr);
+}
+
+static bool
+v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+{
+ if (devinfo->ver < 71)
+ return v3d33_qpu_mul_pack(devinfo, instr, packed_instr);
+ else
+ return v3d71_qpu_mul_pack(devinfo, instr, packed_instr);
+}
+
static bool
v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo,
uint64_t packed_instr,
@@ -1332,8 +2347,14 @@ v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo,
return false;
}
- instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A);
- instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B);
+ if (devinfo->ver < 71) {
+ /*
+ * For v71 this will be set on add/mul unpack instead, as the raddrs
+ * are now part of v3d_qpu_input
+ */
+ instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A);
+ instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B);
+ }
if (!v3d_qpu_add_unpack(devinfo, packed_instr, instr))
return false;
@@ -1419,8 +2440,14 @@ v3d_qpu_instr_pack_alu(const struct v3d_device_info *devinfo,
*packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG);
if (instr->type == V3D_QPU_INSTR_TYPE_ALU) {
- *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A);
- *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B);
+ if (devinfo->ver < 71) {
+ /*
+ * For v71 this is set on add/mul pack instead, as the raddrs are
+ * now part of v3d_qpu_input
+ */
+ *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A);
+ *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B);
+ }
if (!v3d_qpu_add_pack(devinfo, instr, packed_instr))
return false;
diff --git a/src/broadcom/qpu/tests/qpu_disasm.c b/src/broadcom/qpu/tests/qpu_disasm.c
index 2f8e19c73fe..be7b78d5ef0 100644
--- a/src/broadcom/qpu/tests/qpu_disasm.c
+++ b/src/broadcom/qpu/tests/qpu_disasm.c
@@ -160,10 +160,10 @@ main(int argc, char **argv)
/* Swap the operands to be sure that we test
* how the QPUs distinguish between these ops.
*/
- swap_mux(&instr.alu.add.a,
- &instr.alu.add.b);
- swap_pack(&instr.alu.add.a_unpack,
- &instr.alu.add.b_unpack);
+ swap_mux(&instr.alu.add.a.mux,
+ &instr.alu.add.b.mux);
+ swap_pack(&instr.alu.add.a.unpack,
+ &instr.alu.add.b.unpack);
break;
default:
break;
diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c
index eea5d3f050e..c4bbd61abc2 100644
--- a/src/broadcom/simulator/v3d_simulator.c
+++ b/src/broadcom/simulator/v3d_simulator.c
@@ -92,6 +92,9 @@ static struct v3d_simulator_state {
/** Last performance monitor ID. */
uint32_t last_perfid;
+ /** Total performance counters */
+ uint32_t perfcnt_total;
+
struct util_dynarray bin_oom;
int refcount;
} sim_state = {
@@ -436,15 +439,15 @@ v3d_simulator_perfmon_switch(int fd, uint32_t perfid)
perfmon = v3d_get_simulator_perfmon(fd, file->active_perfid);
if (perfmon)
- v3d41_simulator_perfmon_stop(sim_state.v3d,
- perfmon->ncounters,
- perfmon->values);
+ v3d_X_simulator(perfmon_stop)(sim_state.v3d,
+ perfmon->ncounters,
+ perfmon->values);
perfmon = v3d_get_simulator_perfmon(fd, perfid);
if (perfmon)
- v3d41_simulator_perfmon_start(sim_state.v3d,
- perfmon->ncounters,
- perfmon->counters);
+ v3d_X_simulator(perfmon_start)(sim_state.v3d,
+ perfmon->ncounters,
+ perfmon->counters);
file->active_perfid = perfid;
}
@@ -489,11 +492,7 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
bin_fd = fd;
v3d_simulator_perfmon_switch(fd, submit->perfmon_id);
-
- if (sim_state.ver >= 41)
- v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
- else
- v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
+ v3d_X_simulator(submit_cl_ioctl)(sim_state.v3d, submit, file->gmp->ofs);
util_dynarray_foreach(&sim_state.bin_oom, struct v3d_simulator_bo *,
sim_bo) {
@@ -632,15 +631,6 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args)
return drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, args);
}
-static int
-v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args)
-{
- if (sim_state.ver >= 41)
- return v3d41_simulator_get_param_ioctl(sim_state.v3d, args);
- else
- return v3d33_simulator_get_param_ioctl(sim_state.v3d, args);
-}
-
static int
v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args)
{
@@ -652,10 +642,7 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args)
v3d_simulator_copy_in_handle(file, args->bo_handles[2]);
v3d_simulator_copy_in_handle(file, args->bo_handles[3]);
- if (sim_state.ver >= 41)
- ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args);
- else
- ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args);
+ ret = v3d_X_simulator(submit_tfu_ioctl)(sim_state.v3d, args);
v3d_simulator_copy_out_handle(file, args->bo_handles[0]);
@@ -682,11 +669,8 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args)
v3d_simulator_perfmon_switch(fd, args->perfmon_id);
- if (sim_state.ver >= 41)
- ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args,
- file->gmp->ofs);
- else
- ret = -1;
+ ret = v3d_X_simulator(submit_csd_ioctl)(sim_state.v3d, args,
+ file->gmp->ofs);
for (int i = 0; i < args->bo_handle_count; i++)
v3d_simulator_copy_out_handle(file, bo_handles[i]);
@@ -716,7 +700,7 @@ v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args)
perfmon->ncounters = args->ncounters;
for (int i = 0; i < args->ncounters; i++) {
- if (args->counters[i] >= V3D_PERFCNT_NUM) {
+ if (args->counters[i] >= sim_state.perfcnt_total) {
ralloc_free(perfmon);
return -EINVAL;
} else {
@@ -797,7 +781,7 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args)
return 0;
case DRM_IOCTL_V3D_GET_PARAM:
- return v3d_simulator_get_param_ioctl(fd, args);
+ return v3d_X_simulator(get_param_ioctl)(sim_state.v3d, args);
case DRM_IOCTL_GEM_CLOSE:
return v3d_simulator_gem_close_ioctl(fd, args);
@@ -880,10 +864,19 @@ v3d_simulator_init_global()
util_dynarray_init(&sim_state.bin_oom, NULL);
- if (sim_state.ver >= 41)
- v3d41_simulator_init_regs(sim_state.v3d);
- else
- v3d33_simulator_init_regs(sim_state.v3d);
+ v3d_X_simulator(init_regs)(sim_state.v3d);
+
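+ /* Number of performance counters exposed by each supported hw
+ * generation; 93 is presumably what the V3D_MAX_PERFCNT cap in v3dv
+ * is sized for.
+ */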
+ switch(sim_state.ver) {
+ case 41:
+ case 42:
+ sim_state.perfcnt_total = 87;
+ break;
+ case 71:
+ sim_state.perfcnt_total = 93;
+ break;
+ default:
+ sim_state.perfcnt_total = 0;
+ }
}
struct v3d_simulator_file *
diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h
index ddb079c1455..92305634468 100644
--- a/src/broadcom/simulator/v3d_simulator.h
+++ b/src/broadcom/simulator/v3d_simulator.h
@@ -52,6 +52,32 @@ uint32_t v3d_simulator_get_mem_free(void);
# define v3dX(x) v3d41_##x
# include "v3dx_simulator.h"
# undef v3dX
+
+# define v3dX(x) v3d71_##x
+# include "v3dx_simulator.h"
+# undef v3dX
+
#endif
+/* Helper to dispatch to the version-specific simulator implementation of a
+ * function */
+#define v3d_X_simulator(thing) ({ \
+ __typeof(&v3d33_simulator_##thing) v3d_X_sim_thing;\
+ switch (sim_state.ver) { \
+ case 33: \
+ case 40: \
+ v3d_X_sim_thing = &v3d33_simulator_##thing; \
+ break; \
+ case 41: \
+ case 42: \
+ v3d_X_sim_thing = &v3d41_simulator_##thing; \
+ break; \
+ case 71: \
+ v3d_X_sim_thing = &v3d71_simulator_##thing; \
+ break; \
+ default: \
+ unreachable("Unsupported hardware generation"); \
+ } \
+ v3d_X_sim_thing; \
+})
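+/* e.g. on a v4.2 simulator
+ *
+ * v3d_X_simulator(init_regs)(sim_state.v3d);
+ *
+ * resolves to v3d41_simulator_init_regs(sim_state.v3d).
+ */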
+
#endif
diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c
index c9322f0397b..01cf6b22663 100644
--- a/src/broadcom/simulator/v3dx_simulator.c
+++ b/src/broadcom/simulator/v3dx_simulator.c
@@ -46,11 +46,15 @@
#define HW_REGISTER_RO(x) (x)
#define HW_REGISTER_RW(x) (x)
-#if V3D_VERSION >= 41
-#include "libs/core/v3d/registers/4.1.35.0/v3d.h"
+#if V3D_VERSION == 71
+#include "libs/core/v3d/registers/7.1.5.1/v3d.h"
+#else
+#if V3D_VERSION == 41 || V3D_VERSION == 42
+#include "libs/core/v3d/registers/4.2.14.0/v3d.h"
#else
#include "libs/core/v3d/registers/3.3.0.0/v3d.h"
#endif
+#endif
#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val)
#define V3D_READ(reg) v3d_hw_read_reg(v3d, reg)
@@ -178,38 +182,48 @@ v3d_flush_caches(struct v3d_hw *v3d)
v3d_flush_l2t(v3d);
}
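+/* On v7.x the TFU registers are prefixed V3D_IFC_ rather than V3D_TFU_, so
+ * go through a version-neutral wrapper.
+ */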
+#if V3D_VERSION < 71
+#define TFU_REG(NAME) V3D_TFU_ ## NAME
+#else
+#define TFU_REG(NAME) V3D_IFC_ ## NAME
+#endif
+
+
int
v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d,
struct drm_v3d_submit_tfu *args)
{
- int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET;
-
- V3D_WRITE(V3D_TFU_IIA, args->iia);
- V3D_WRITE(V3D_TFU_IIS, args->iis);
- V3D_WRITE(V3D_TFU_ICA, args->ica);
- V3D_WRITE(V3D_TFU_IUA, args->iua);
- V3D_WRITE(V3D_TFU_IOA, args->ioa);
- V3D_WRITE(V3D_TFU_IOS, args->ios);
- V3D_WRITE(V3D_TFU_COEF0, args->coef[0]);
- V3D_WRITE(V3D_TFU_COEF1, args->coef[1]);
- V3D_WRITE(V3D_TFU_COEF2, args->coef[2]);
- V3D_WRITE(V3D_TFU_COEF3, args->coef[3]);
-
- V3D_WRITE(V3D_TFU_ICFG, args->icfg);
-
- while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) {
+ int last_vtct = V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET;
+
+ V3D_WRITE(TFU_REG(IIA), args->iia);
+ V3D_WRITE(TFU_REG(IIS), args->iis);
+ V3D_WRITE(TFU_REG(ICA), args->ica);
+ V3D_WRITE(TFU_REG(IUA), args->iua);
+ V3D_WRITE(TFU_REG(IOA), args->ioa);
+#if V3D_VERSION >= 71
+ V3D_WRITE(TFU_REG(IOC), args->v71.ioc);
+#endif
+ V3D_WRITE(TFU_REG(IOS), args->ios);
+ V3D_WRITE(TFU_REG(COEF0), args->coef[0]);
+ V3D_WRITE(TFU_REG(COEF1), args->coef[1]);
+ V3D_WRITE(TFU_REG(COEF2), args->coef[2]);
+ V3D_WRITE(TFU_REG(COEF3), args->coef[3]);
+
+ V3D_WRITE(TFU_REG(ICFG), args->icfg);
+
+ while ((V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET) == last_vtct) {
v3d_hw_tick(v3d);
}
return 0;
}
-#if V3D_VERSION >= 41
int
v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
struct drm_v3d_submit_csd *args,
uint32_t gmp_ofs)
{
+#if V3D_VERSION >= 41
int last_completed_jobs = (V3D_READ(V3D_CSD_0_STATUS) &
V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET);
g_gmp_ofs = gmp_ofs;
@@ -223,6 +237,9 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]);
V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]);
V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]);
+#if V3D_VERSION >= 71
+ V3D_WRITE(V3D_CSD_0_QUEUED_CFG7, 0);
+#endif
/* CFG0 kicks off the job */
V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]);
@@ -239,8 +256,10 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
v3d_flush_caches(v3d);
return 0;
-}
+#else
+ return -1;
#endif
+}
int
v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d,
@@ -310,16 +329,17 @@ v3d_isr_core(struct v3d_hw *v3d,
return;
}
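+ /* From v7.x the GMP violation interrupt is reported through the hub
+ * rather than the core, so the handling below moves to v3d_isr_hub().
+ */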
+#if V3D_VERSION <= 42
if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) {
fprintf(stderr, "GMP violation at 0x%08x\n",
V3D_READ(V3D_GMP_VIO_ADDR));
- abort();
} else {
fprintf(stderr,
"Unexpected ISR with core status 0x%08x\n",
core_status);
}
abort();
+#endif
}
static void
@@ -396,6 +416,18 @@ v3d_isr_hub(struct v3d_hw *v3d)
}
handle_mmu_interruptions(v3d, hub_status);
+
+#if V3D_VERSION == 71
+ if (hub_status & V3D_HUB_CTL_INT_STS_INT_GMPV_SET) {
+ fprintf(stderr, "GMP violation at 0x%08x\n",
+ V3D_READ(V3D_GMP_VIO_ADDR));
+ } else {
+ fprintf(stderr,
+ "Unexpected ISR with status 0x%08x\n",
+ hub_status);
+ }
+ abort();
+#endif
}
static void
@@ -436,8 +468,11 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d)
* for tracing. Perhaps we should evaluate to do the same here and add
* some debug options.
*/
- uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET |
- V3D_CTL_0_INT_STS_INT_OUTOMEM_SET);
+ uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_OUTOMEM_SET;
+#if V3D_VERSION <= 42
+ core_interrupts |= V3D_CTL_0_INT_STS_INT_GMPV_SET;
+#endif
+
V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts);
V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts);
@@ -447,6 +482,9 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d)
V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */
V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */
+#if V3D_VERSION == 71
+ hub_interrupts |= V3D_HUB_CTL_INT_STS_INT_GMPV_SET;
+#endif
V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts);
V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts);
@@ -509,7 +547,8 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d,
#define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x))
#define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8)
#define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \
- V3D_PCTR_0_SRC_N_SHIFT(x) + 6))
+ V3D_PCTR_0_SRC_N_SHIFT(x) + \
+ V3D_PCTR_0_SRC_0_3_PCTRS0_MSB))
#endif
void
diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build
index ad032d832ad..182388a35b4 100644
--- a/src/broadcom/vulkan/meson.build
+++ b/src/broadcom/vulkan/meson.build
@@ -27,6 +27,7 @@ v3dv_entrypoints = custom_target(
'--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv',
'--beta', with_vulkan_beta.to_string(),
'--device-prefix', 'ver42',
+ '--device-prefix', 'ver71',
],
depend_files : vk_entrypoints_gen_depend_files,
)
@@ -64,13 +65,11 @@ files_per_version = files(
'v3dvx_meta_common.c',
'v3dvx_pipeline.c',
+ 'v3dvx_query.c',
'v3dvx_queue.c',
)
-# The vulkan driver only supports version >= 42, which is the version present in
-# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d
-# driver.
-v3d_versions = ['42']
+v3d_versions = ['42', '71']
v3dv_flags = []
diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index a14db073b4f..c6462735fe4 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -348,6 +348,7 @@ job_compute_frame_tiling(struct v3dv_job *job,
uint32_t layers,
uint32_t render_target_count,
uint8_t max_internal_bpp,
+ uint8_t total_color_bpp,
bool msaa,
bool double_buffer)
{
@@ -360,13 +361,16 @@ job_compute_frame_tiling(struct v3dv_job *job,
tiling->render_target_count = render_target_count;
tiling->msaa = msaa;
tiling->internal_bpp = max_internal_bpp;
+ tiling->total_color_bpp = total_color_bpp;
tiling->double_buffer = double_buffer;
/* Double-buffer is incompatible with MSAA */
assert(!tiling->msaa || !tiling->double_buffer);
- v3d_choose_tile_size(render_target_count, max_internal_bpp,
- tiling->msaa, tiling->double_buffer,
+ v3d_choose_tile_size(&job->device->devinfo,
+ render_target_count,
+ max_internal_bpp, total_color_bpp, msaa,
+ tiling->double_buffer,
&tiling->tile_width, &tiling->tile_height);
tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
@@ -457,6 +461,7 @@ v3dv_job_start_frame(struct v3dv_job *job,
bool allocate_tile_state_now,
uint32_t render_target_count,
uint8_t max_internal_bpp,
+ uint8_t total_color_bpp,
bool msaa)
{
assert(job);
@@ -467,7 +472,7 @@ v3dv_job_start_frame(struct v3dv_job *job,
const struct v3dv_frame_tiling *tiling =
job_compute_frame_tiling(job, width, height, layers,
render_target_count, max_internal_bpp,
- msaa, false);
+ total_color_bpp, msaa, false);
v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
v3dv_return_if_oom(NULL, job);
@@ -528,6 +533,7 @@ cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
job->frame_tiling.layers,
job->frame_tiling.render_target_count,
job->frame_tiling.internal_bpp,
+ job->frame_tiling.total_color_bpp,
job->frame_tiling.msaa,
true);
@@ -1374,7 +1380,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
}
uint32_t att_count = 0;
- VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */
+ VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */
/* We only need to emit subpass clears as draw calls for color attachments
* if the render area is not aligned to tile boundaries.
@@ -1672,10 +1678,11 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
const struct v3dv_framebuffer *framebuffer = state->framebuffer;
- uint8_t internal_bpp;
+ uint8_t max_internal_bpp, total_color_bpp;
bool msaa;
v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa)
- (framebuffer, state->attachments, subpass, &internal_bpp, &msaa);
+ (framebuffer, state->attachments, subpass,
+ &max_internal_bpp, &total_color_bpp, &msaa);
/* From the Vulkan spec:
*
@@ -1699,7 +1706,8 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
layers,
true, false,
subpass->color_count,
- internal_bpp,
+ max_internal_bpp,
+ total_color_bpp,
msaa);
}
@@ -2062,6 +2070,14 @@ cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
}
}
+ if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BOUNDS)) {
+ if (memcmp(&dest->depth_bounds, &src->depth_bounds,
+ sizeof(src->depth_bounds))) {
+ memcpy(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds));
+ dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS;
+ }
+ }
+
if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) {
if (dest->line_width != src->line_width) {
dest->line_width = src->line_width;
@@ -2131,39 +2147,6 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
}
}
-/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */
-void
-v3dv_viewport_compute_xform(const VkViewport *viewport,
- float scale[3],
- float translate[3])
-{
- float x = viewport->x;
- float y = viewport->y;
- float half_width = 0.5f * viewport->width;
- float half_height = 0.5f * viewport->height;
- double n = viewport->minDepth;
- double f = viewport->maxDepth;
-
- scale[0] = half_width;
- translate[0] = half_width + x;
- scale[1] = half_height;
- translate[1] = half_height + y;
-
- scale[2] = (f - n);
- translate[2] = n;
-
- /* It seems that if the scale is small enough the hardware won't clip
- * correctly so we work around this my choosing the smallest scale that
- * seems to work.
- *
- * This case is exercised by CTS:
- * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero
- */
- const float min_abs_scale = 0.000009f;
- if (fabs(scale[2]) < min_abs_scale)
- scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
-}
-
/* Considers the pipeline's negative_one_to_one state and applies it to the
* current viewport transform if needed to produce the resulting Z translate
* and scale parameters.
@@ -2216,9 +2199,10 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
viewportCount * sizeof(*pViewports));
for (uint32_t i = firstViewport; i < total_count; i++) {
- v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i],
- state->dynamic.viewport.scale[i],
- state->dynamic.viewport.translate[i]);
+ v3dv_X(cmd_buffer->device, viewport_compute_xform)
+ (&state->dynamic.viewport.viewports[i],
+ state->dynamic.viewport.scale[i],
+ state->dynamic.viewport.translate[i]);
}
cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT;
@@ -2699,6 +2683,7 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
true, false,
old_job->frame_tiling.render_target_count,
old_job->frame_tiling.internal_bpp,
+ old_job->frame_tiling.total_color_bpp,
true /* msaa */);
v3dv_job_destroy(old_job);
@@ -2963,6 +2948,9 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS))
v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer);
+ if (*dirty & V3DV_CMD_DIRTY_DEPTH_BOUNDS)
+ v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer);
+
if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS))
v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer);
@@ -3410,9 +3398,11 @@ v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
float minDepthBounds,
float maxDepthBounds)
{
- /* We do not support depth bounds testing so we just ignore this. We are
- * already asserting that pipelines don't enable the feature anyway.
- */
+ V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds;
+ cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds;
+ cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS;
}
VKAPI_ATTR void VKAPI_CALL
@@ -3844,6 +3834,7 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
void
v3dv_cmd_buffer_rewrite_indirect_csd_job(
+ struct v3dv_device *device,
struct v3dv_csd_indirect_cpu_job_info *info,
const uint32_t *wg_counts)
{
@@ -3863,8 +3854,15 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job(
submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
- submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) *
- (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
+ uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) *
+ (wg_counts[0] * wg_counts[1] * wg_counts[2]);
+ /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
+ if (device->devinfo.ver < 71 ||
+ (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
+ submit->cfg[4] = num_batches - 1;
+ } else {
+ submit->cfg[4] = num_batches;
+ }
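+ /* e.g. wg_size = 16 and workgroup counts (2, 1, 1) give
+ * num_batches = 2: hw before 7.1.6 wants cfg[4] = 1, 7.1.6+ wants 2.
+ */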
assert(submit->cfg[4] != ~0);
if (info->needs_wg_uniform_rewrite) {
@@ -3897,6 +3895,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t **wg_uniform_offsets_out,
uint32_t *wg_size_out)
{
+ struct v3dv_device *device = cmd_buffer->device;
struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
struct v3dv_shader_variant *cs_variant =
@@ -3955,18 +3954,26 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
if (wg_size_out)
*wg_size_out = wg_size;
- submit->cfg[4] = num_batches - 1;
+ /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
+ if (device->devinfo.ver < 71 ||
+ (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
+ submit->cfg[4] = num_batches - 1;
+ } else {
+ submit->cfg[4] = num_batches;
+ }
assert(submit->cfg[4] != ~0);
assert(pipeline->shared_data->assembly_bo);
struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo;
submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset;
- submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
if (cs_variant->prog_data.base->single_seg)
submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
if (cs_variant->prog_data.base->threads == 4)
submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
+ /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */
+ if (device->devinfo.ver < 71)
+ submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
if (cs_variant->prog_data.cs->shared_size > 0) {
job->csd.shared_memory =
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index 3bad290e8c5..d013edaa63d 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -91,7 +91,7 @@ static const struct vk_instance_extension_table instance_extensions = {
.KHR_display = true,
.KHR_get_display_properties2 = true,
.EXT_direct_mode_display = true,
- .EXT_acquire_drm_display = true,
+ .EXT_acquire_drm_display = false,
#endif
.KHR_external_fence_capabilities = true,
.KHR_external_memory_capabilities = true,
@@ -214,7 +214,7 @@ get_features(const struct v3dv_physical_device *physical_device,
*features = (struct vk_features) {
/* Vulkan 1.0 */
.robustBufferAccess = true, /* This feature is mandatory */
- .fullDrawIndexUint32 = false, /* Only available since V3D 4.4.9.1 */
+ .fullDrawIndexUint32 = physical_device->devinfo.ver >= 71,
.imageCubeArray = true,
.independentBlend = true,
.geometryShader = true,
@@ -224,10 +224,10 @@ get_features(const struct v3dv_physical_device *physical_device,
.logicOp = true,
.multiDrawIndirect = false,
.drawIndirectFirstInstance = true,
- .depthClamp = false, /* Only available since V3D 4.5.1.1 */
+ .depthClamp = physical_device->devinfo.ver >= 71,
.depthBiasClamp = true,
.fillModeNonSolid = true,
- .depthBounds = false, /* Only available since V3D 4.3.16.2 */
+ .depthBounds = physical_device->devinfo.ver >= 71,
.wideLines = true,
.largePoints = true,
.alphaToOne = true,
@@ -304,7 +304,7 @@ get_features(const struct v3dv_physical_device *physical_device,
* problematic, we would always have to scalarize. Overall, this would
* not lead to best performance so let's just not support it.
*/
- .scalarBlockLayout = false,
+ .scalarBlockLayout = physical_device->devinfo.ver >= 71,
/* This tells applications 2 things:
*
* 1. If they can select just one aspect for barriers. For us barriers
@@ -1123,8 +1123,10 @@ create_physical_device(struct v3dv_instance *instance,
device->next_program_id = 0;
ASSERTED int len =
- asprintf(&device->name, "V3D %d.%d",
- device->devinfo.ver / 10, device->devinfo.ver % 10);
+ asprintf(&device->name, "V3D %d.%d.%d",
+ device->devinfo.ver / 10,
+ device->devinfo.ver % 10,
+ device->devinfo.rev);
assert(len != -1);
v3dv_physical_device_init_disk_cache(device);
@@ -1212,6 +1214,12 @@ create_physical_device(struct v3dv_instance *instance,
list_addtail(&device->vk.link, &instance->vk.physical_devices.list);
+ if (device->devinfo.ver != 42) {
+ fprintf(stderr, "WARNING: v3dv support for hw version %i is neither "
+ "a complete nor a conformant Vulkan implementation. Testing "
+ "use only.\n", device->devinfo.ver);
+ }
+
return VK_SUCCESS;
fail:
@@ -1279,7 +1287,8 @@ enumerate_devices(struct vk_instance *vk_instance)
if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) {
char **compat = devices[i]->deviceinfo.platform->compatible;
while (*compat) {
- if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) {
+ if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 ||
+ strncmp(*compat, "brcm,2712-v3d", 13) == 0) {
v3d_idx = i;
break;
}
@@ -1288,8 +1297,9 @@ enumerate_devices(struct vk_instance *vk_instance)
} else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) {
char **compat = devices[i]->deviceinfo.platform->compatible;
while (*compat) {
- if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
- strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) {
+ if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 ||
+ strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
+ strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) {
vc4_idx = i;
break;
}
@@ -1326,6 +1336,8 @@ v3dv_physical_device_device_id(struct v3dv_physical_device *dev)
switch (dev->devinfo.ver) {
case 42:
return 0xBE485FD3; /* Broadcom deviceID for 2711 */
+ case 71:
+ return 0x55701C33; /* Broadcom deviceID for 2712 */
default:
unreachable("Unsupported V3D version");
}
@@ -1354,6 +1366,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
const VkSampleCountFlags supported_sample_counts =
VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT;
+ const uint8_t max_rts = V3D_MAX_RENDER_TARGETS(pdevice->devinfo.ver);
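+ /* 4 render targets on V3D 4.x, 8 on V3D 7.x */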
+
struct timespec clock_res;
clock_getres(CLOCK_MONOTONIC, &clock_res);
const float timestamp_period =
@@ -1424,7 +1438,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.maxFragmentInputComponents = max_varying_components,
.maxFragmentOutputAttachments = 4,
.maxFragmentDualSrcAttachments = 0,
- .maxFragmentCombinedOutputResources = MAX_RENDER_TARGETS +
+ .maxFragmentCombinedOutputResources = max_rts +
MAX_STORAGE_BUFFERS +
MAX_STORAGE_IMAGES,
@@ -1437,7 +1451,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.subPixelPrecisionBits = V3D_COORD_SHIFT,
.subTexelPrecisionBits = 8,
.mipmapPrecisionBits = 8,
- .maxDrawIndexedIndexValue = 0x00ffffff,
+ .maxDrawIndexedIndexValue = pdevice->devinfo.ver >= 71 ?
+ 0xffffffff : 0x00ffffff,
.maxDrawIndirectCount = 0x7fffffff,
.maxSamplerLodBias = 14.0f,
.maxSamplerAnisotropy = 16.0f,
@@ -1464,7 +1479,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.framebufferDepthSampleCounts = supported_sample_counts,
.framebufferStencilSampleCounts = supported_sample_counts,
.framebufferNoAttachmentsSampleCounts = supported_sample_counts,
- .maxColorAttachments = MAX_RENDER_TARGETS,
+ .maxColorAttachments = max_rts,
.sampledImageColorSampleCounts = supported_sample_counts,
.sampledImageIntegerSampleCounts = supported_sample_counts,
.sampledImageDepthSampleCounts = supported_sample_counts,
@@ -2031,7 +2046,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0,
device->instance->default_pipeline_cache_enabled);
device->default_attribute_float =
- v3dv_pipeline_create_default_attribute_values(device, NULL);
+ v3dv_X(device, create_default_attribute_values)(device, NULL);
device->device_address_mem_ctx = ralloc_context(NULL);
util_dynarray_init(&device->device_address_bo_list,
@@ -2975,7 +2990,7 @@ v3dv_CreateSampler(VkDevice _device,
}
}
- v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo, bc_info);
+ v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info);
*pSampler = v3dv_sampler_to_handle(sampler);
diff --git a/src/broadcom/vulkan/v3dv_image.c b/src/broadcom/vulkan/v3dv_image.c
index ebbd60e4c03..e01e2e1bd19 100644
--- a/src/broadcom/vulkan/v3dv_image.c
+++ b/src/broadcom/vulkan/v3dv_image.c
@@ -671,7 +671,6 @@ create_image_view(struct v3dv_device *device,
* makes sense to implement swizzle composition using VkSwizzle directly.
*/
VkFormat format;
- uint8_t image_view_swizzle[4];
if (pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT &&
range->aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
format = VK_FORMAT_R8G8B8A8_UINT;
@@ -682,11 +681,11 @@ create_image_view(struct v3dv_device *device,
vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, view_swizzle);
util_format_compose_swizzles(stencil_aspect_swizzle, view_swizzle,
- image_view_swizzle);
+ iview->view_swizzle);
} else {
format = pCreateInfo->format;
vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle,
- image_view_swizzle);
+ iview->view_swizzle);
}
iview->vk.view_format = format;
@@ -711,7 +710,7 @@ create_image_view(struct v3dv_device *device,
const uint8_t *format_swizzle =
v3dv_get_format_swizzle(device, format, plane);
- util_format_compose_swizzles(format_swizzle, image_view_swizzle,
+ util_format_compose_swizzles(format_swizzle, iview->view_swizzle,
iview->planes[plane].swizzle);
iview->planes[plane].swap_rb = v3dv_format_swizzle_needs_rb_swap(format_swizzle);
diff --git a/src/broadcom/vulkan/v3dv_limits.h b/src/broadcom/vulkan/v3dv_limits.h
index 9cda9f0d6d2..8ac99724105 100644
--- a/src/broadcom/vulkan/v3dv_limits.h
+++ b/src/broadcom/vulkan/v3dv_limits.h
@@ -50,8 +50,6 @@
#define MAX_DYNAMIC_BUFFERS (MAX_DYNAMIC_UNIFORM_BUFFERS + \
MAX_DYNAMIC_STORAGE_BUFFERS)
-#define MAX_RENDER_TARGETS 4
-
#define MAX_MULTIVIEW_VIEW_COUNT 16
/* These are tunable parameters in the HW design, but all the V3D
diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c
index a200298a898..0b64653000d 100644
--- a/src/broadcom/vulkan/v3dv_meta_clear.c
+++ b/src/broadcom/vulkan/v3dv_meta_clear.c
@@ -127,6 +127,7 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
v3dv_job_start_frame(job, width, height, max_layer,
false, true, 1, internal_bpp,
+ 4 * v3d_internal_bpp_words(internal_bpp),
image->vk.samples > VK_SAMPLE_COUNT_1_BIT);
struct v3dv_meta_framebuffer framebuffer;
@@ -747,7 +748,7 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx,
uint32_t bit_offset = 0;
key |= rt_idx;
- bit_offset += 2;
+ bit_offset += 3;
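+ /* v71 raises the render target count to 8, so rt_idx needs 3 bits now */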
key |= ((uint64_t) format) << bit_offset;
bit_offset += 32;
@@ -1189,9 +1190,11 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer,
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
- /* We can only clear attachments in the current subpass */
- assert(attachmentCount <= 5); /* 4 color + D/S */
+ /* We can have at most max_color_RTs + 1 D/S attachments */
+ assert(attachmentCount <=
+ V3D_MAX_RENDER_TARGETS(cmd_buffer->device->devinfo.ver) + 1);
+ /* We can only clear attachments in the current subpass */
struct v3dv_render_pass *pass = cmd_buffer->state.pass;
assert(cmd_buffer->state.subpass_idx < pass->subpass_count);
diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c
index c0ec888b8c7..2d30c611e17 100644
--- a/src/broadcom/vulkan/v3dv_meta_copy.c
+++ b/src/broadcom/vulkan/v3dv_meta_copy.c
@@ -453,8 +453,9 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
- v3dv_job_start_frame(job, width, height, num_layers, false, true,
- 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
@@ -1323,8 +1324,8 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
- v3dv_job_start_frame(job, width, height, num_layers,
- false, true, 1, internal_bpp,
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
src->vk.samples > VK_SAMPLE_COUNT_1_BIT);
struct v3dv_meta_framebuffer framebuffer;
@@ -1978,8 +1979,9 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
- v3dv_job_start_frame(job, width, height, num_layers, false, true,
- 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
@@ -4884,8 +4886,9 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
(fb_format, region->srcSubresource.aspectMask,
&internal_type, &internal_bpp);
- v3dv_job_start_frame(job, width, height, num_layers, false, true,
- 1, internal_bpp, true);
+ v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ true);
struct v3dv_meta_framebuffer framebuffer;
v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c
index 20f5014268d..0583faf6f9a 100644
--- a/src/broadcom/vulkan/v3dv_pass.c
+++ b/src/broadcom/vulkan/v3dv_pass.c
@@ -236,11 +236,13 @@ v3dv_CreateRenderPass2(VkDevice _device,
/* GFXH-1461: if depth is cleared but stencil is loaded (or vice versa),
* the clear might get lost. If a subpass has this then we can't emit
- * the clear using the TLB and we have to do it as a draw call.
+ * the clear using the TLB and we have to do it as a draw call. This
+ * issue is fixed since V3D 4.3.18.
*
* FIXME: separate stencil.
*/
- if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
+ if (device->devinfo.ver == 42 &&
+ subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
struct v3dv_render_pass_attachment *att =
&pass->attachments[subpass->ds_attachment.attachment];
if (att->desc.format == VK_FORMAT_D24_UNORM_S8_UINT) {
@@ -320,11 +322,12 @@ subpass_get_granularity(struct v3dv_device *device,
/* Granularity is defined by the tile size */
assert(subpass_idx < pass->subpass_count);
struct v3dv_subpass *subpass = &pass->subpasses[subpass_idx];
- const uint32_t color_attachment_count = subpass->color_count;
+ const uint32_t color_count = subpass->color_count;
bool msaa = false;
- uint32_t max_bpp = 0;
- for (uint32_t i = 0; i < color_attachment_count; i++) {
+ uint32_t max_internal_bpp = 0;
+ uint32_t total_color_bpp = 0;
+ for (uint32_t i = 0; i < color_count; i++) {
uint32_t attachment_idx = subpass->color_attachments[i].attachment;
if (attachment_idx == VK_ATTACHMENT_UNUSED)
continue;
@@ -337,7 +340,8 @@ subpass_get_granularity(struct v3dv_device *device,
v3dv_X(device, get_internal_type_bpp_for_output_format)
(format->planes[0].rt_type, &internal_type, &internal_bpp);
- max_bpp = MAX2(max_bpp, internal_bpp);
+ max_internal_bpp = MAX2(max_internal_bpp, internal_bpp);
+ total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
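+ /* v3d_internal_bpp_words() is presumably 1/2/4 TLB words for
+ * 32/64/128 bpp, so this accumulates bytes per pixel across the
+ * color attachments, e.g. two 32bpp RTs contribute 8.
+ */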
if (desc->samples > VK_SAMPLE_COUNT_1_BIT)
msaa = true;
@@ -347,7 +351,8 @@ subpass_get_granularity(struct v3dv_device *device,
* heuristics so we choose a conservative granularity here, with it disabled.
*/
uint32_t width, height;
- v3d_choose_tile_size(color_attachment_count, max_bpp, msaa,
+ v3d_choose_tile_size(&device->devinfo, color_count,
+ max_internal_bpp, total_color_bpp, msaa,
false /* double-buffer */, &width, &height);
*granularity = (VkExtent2D) {
.width = width,
diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c
index 99fe8c16bfa..d6629c9a4a0 100644
--- a/src/broadcom/vulkan/v3dv_pipeline.c
+++ b/src/broadcom/vulkan/v3dv_pipeline.c
@@ -2608,13 +2608,8 @@ v3dv_dynamic_state_mask(VkDynamicState state)
return V3DV_DYNAMIC_LINE_WIDTH;
case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
return V3DV_DYNAMIC_COLOR_WRITE_ENABLE;
-
- /* Depth bounds testing is not available in in V3D 4.2 so here we are just
- * ignoring this dynamic state. We are already asserting at pipeline creation
- * time that depth bounds testing is not enabled.
- */
case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
- return 0;
+ return V3DV_DYNAMIC_DEPTH_BOUNDS;
default:
unreachable("Unhandled dynamic state");
@@ -2632,6 +2627,7 @@ pipeline_init_dynamic_state(
const VkPipelineColorWriteCreateInfoEXT *pColorWriteState)
{
/* Initialize to default values */
+ const struct v3d_device_info *devinfo = &pipeline->device->devinfo;
struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state;
memset(dynamic, 0, sizeof(*dynamic));
dynamic->stencil_compare_mask.front = ~0;
@@ -2639,7 +2635,9 @@ pipeline_init_dynamic_state(
dynamic->stencil_write_mask.front = ~0;
dynamic->stencil_write_mask.back = ~0;
dynamic->line_width = 1.0f;
- dynamic->color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1;
+ dynamic->color_write_enable =
+ (1ull << (4 * V3D_MAX_RENDER_TARGETS(devinfo->ver))) - 1;
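+ /* one enable bit per channel: 16 bits for 4 RTs on v4.2, 32 bits for
+ * 8 RTs on v7.1
+ */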
+ dynamic->depth_bounds.max = 1.0f;
/* Create a mask of enabled dynamic states */
uint32_t dynamic_states = 0;
@@ -2661,9 +2659,10 @@ pipeline_init_dynamic_state(
pViewportState->viewportCount);
for (uint32_t i = 0; i < dynamic->viewport.count; i++) {
- v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i],
- dynamic->viewport.scale[i],
- dynamic->viewport.translate[i]);
+ v3dv_X(pipeline->device, viewport_compute_xform)
+ (&dynamic->viewport.viewports[i],
+ dynamic->viewport.scale[i],
+ dynamic->viewport.translate[i]);
}
}
@@ -2691,6 +2690,11 @@ pipeline_init_dynamic_state(
dynamic->stencil_reference.front = pDepthStencilState->front.reference;
dynamic->stencil_reference.back = pDepthStencilState->back.reference;
}
+
+ if (!(dynamic_states & V3DV_DYNAMIC_DEPTH_BOUNDS)) {
+ dynamic->depth_bounds.min = pDepthStencilState->minDepthBounds;
+ dynamic->depth_bounds.max = pDepthStencilState->maxDepthBounds;
+ }
}
if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) {
@@ -2802,62 +2806,6 @@ pipeline_set_ez_state(struct v3dv_pipeline *pipeline,
}
}
-static bool
-pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
-{
- for (uint8_t i = 0; i < pipeline->va_count; i++) {
- if (vk_format_is_int(pipeline->va[i].vk_format))
- return true;
- }
- return false;
-}
-
-/* @pipeline can be NULL. We assume in that case that all the attributes have
- * a float format (we only create an all-float BO once and we reuse it with
- * all float pipelines), otherwise we look at the actual type of each
- * attribute used with the specific pipeline passed in.
- */
-struct v3dv_bo *
-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
- struct v3dv_pipeline *pipeline)
-{
- uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
- struct v3dv_bo *bo;
-
- bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
-
- if (!bo) {
- fprintf(stderr, "failed to allocate memory for the default "
- "attribute values\n");
- return NULL;
- }
-
- bool ok = v3dv_bo_map(device, bo, size);
- if (!ok) {
- fprintf(stderr, "failed to map default attribute values buffer\n");
- return false;
- }
-
- uint32_t *attrs = bo->map;
- uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
- for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
- attrs[i * 4 + 0] = 0;
- attrs[i * 4 + 1] = 0;
- attrs[i * 4 + 2] = 0;
- VkFormat attr_format =
- pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
- if (i < va_count && vk_format_is_int(attr_format)) {
- attrs[i * 4 + 3] = 1;
- } else {
- attrs[i * 4 + 3] = fui(1.0);
- }
- }
-
- v3dv_bo_unmap(device, bo);
-
- return bo;
-}
-
static void
pipeline_set_sample_mask(struct v3dv_pipeline *pipeline,
const VkPipelineMultisampleStateCreateInfo *ms_info)
@@ -2960,7 +2908,9 @@ pipeline_init(struct v3dv_pipeline *pipeline,
/* V3D 4.2 doesn't support depth bounds testing so we don't advertise that
* feature and it shouldn't be used by any pipeline.
*/
- assert(!ds_info || !ds_info->depthBoundsTestEnable);
+ assert(device->devinfo.ver >= 71 ||
+ !ds_info || !ds_info->depthBoundsTestEnable);
+ pipeline->depth_bounds_test_enabled = ds_info && ds_info->depthBoundsTestEnable;
enable_depth_bias(pipeline, rs_info);
@@ -2992,9 +2942,10 @@ pipeline_init(struct v3dv_pipeline *pipeline,
v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info);
- if (pipeline_has_integer_vertex_attrib(pipeline)) {
+ if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) {
pipeline->default_attribute_values =
- v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline);
+ v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline);
+
if (!pipeline->default_attribute_values)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
} else {
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index c6707211529..89e2f1c7e5c 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -123,6 +123,9 @@ struct v3d_simulator_file;
/* Minimum required by the Vulkan 1.1 spec */
#define MAX_MEMORY_ALLOCATION_SIZE (1ull << 30)
+/* Maximum performance counters number */
+#define V3D_MAX_PERFCNT 93
+
struct v3dv_physical_device {
struct vk_physical_device vk;
@@ -581,6 +584,10 @@ struct v3dv_device {
* being float, allowing us to reuse the same BO for all
* pipelines matching this requirement. Pipelines that need integer
* attributes will create their own BO.
+ *
+ * Note that since v71 the default attribute values are not needed, so this
+ * can be NULL.
+ *
*/
struct v3dv_bo *default_attribute_float;
@@ -772,6 +779,8 @@ struct v3dv_image_view {
const struct v3dv_format *format;
+ uint8_t view_swizzle[4];
+
uint8_t plane_count;
struct {
uint8_t image_plane;
@@ -782,8 +791,8 @@ struct v3dv_image_view {
uint32_t internal_type;
uint32_t offset;
- /* Precomputed (composed from createinfo->components and formar swizzle)
- * swizzles to pass in to the shader key.
+ /* Precomputed swizzle (composed from the view swizzle and the format
+ * swizzle).
*
* This could be also included on the descriptor bo, but the shader state
* packet doesn't need it on a bo, so we can just avoid a memory copy
@@ -946,6 +955,7 @@ struct v3dv_frame_tiling {
uint32_t layers;
uint32_t render_target_count;
uint32_t internal_bpp;
+ uint32_t total_color_bpp;
bool msaa;
bool double_buffer;
uint32_t tile_width;
@@ -1040,7 +1050,8 @@ enum v3dv_dynamic_state_bits {
V3DV_DYNAMIC_DEPTH_BIAS = 1 << 6,
V3DV_DYNAMIC_LINE_WIDTH = 1 << 7,
V3DV_DYNAMIC_COLOR_WRITE_ENABLE = 1 << 8,
- V3DV_DYNAMIC_ALL = (1 << 9) - 1,
+ V3DV_DYNAMIC_DEPTH_BOUNDS = 1 << 9,
+ V3DV_DYNAMIC_ALL = (1 << 10) - 1,
};
/* Flags for dirty pipeline state.
@@ -1065,6 +1076,7 @@ enum v3dv_cmd_dirty_bits {
V3DV_CMD_DIRTY_LINE_WIDTH = 1 << 16,
V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 17,
V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE = 1 << 18,
+ V3DV_CMD_DIRTY_DEPTH_BOUNDS = 1 << 19,
};
struct v3dv_dynamic_state {
@@ -1101,6 +1113,11 @@ struct v3dv_dynamic_state {
float slope_factor;
} depth_bias;
+ struct {
+ float min;
+ float max;
+ } depth_bounds;
+
float line_width;
uint32_t color_write_enable;
@@ -1196,7 +1213,7 @@ struct v3dv_timestamp_query_cpu_job_info {
};
/* Number of perfmons required to handle all supported performance counters */
-#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_PERFCNT_NUM, \
+#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_MAX_PERFCNT, \
DRM_V3D_MAX_PERF_COUNTERS)
struct v3dv_perf_query {
@@ -1369,6 +1386,7 @@ void v3dv_job_start_frame(struct v3dv_job *job,
bool allocate_tile_state_now,
uint32_t render_target_count,
uint8_t max_internal_bpp,
+ uint8_t total_color_bpp,
bool msaa);
bool v3dv_job_type_is_gpu(struct v3dv_job *job);
@@ -1667,7 +1685,7 @@ struct v3dv_query_pool {
/* Only used with performance queries */
struct {
uint32_t ncounters;
- uint8_t counters[V3D_PERFCNT_NUM];
+ uint8_t counters[V3D_MAX_PERFCNT];
/* V3D has a limit on the number of counters we can track in a
* single performance monitor, so if too many counters are requested
@@ -1803,7 +1821,8 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
struct drm_v3d_submit_tfu *tfu);
-void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info,
+void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_device *device,
+ struct v3dv_csd_indirect_cpu_job_info *info,
const uint32_t *wg_counts);
void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
@@ -2289,11 +2308,15 @@ struct v3dv_pipeline {
unsigned char sha1[20];
/* In general we can reuse v3dv_device->default_attribute_float, so note
- * that the following can be NULL.
+ * that the following can be NULL. In 7.x this is not used, so it will be
+ * NULL.
*
* FIXME: the content of this BO will be small, so it could be improved to
* be uploaded to a common BO. But as in most cases it will be NULL, it is
* not a priority.
*/
struct v3dv_bo *default_attribute_values;
@@ -2323,6 +2346,9 @@ struct v3dv_pipeline {
bool is_z16;
} depth_bias;
+ /* Depth bounds */
+ bool depth_bounds_test_enabled;
+
struct {
void *mem_ctx;
struct util_dynarray data; /* Array of v3dv_pipeline_executable_data */
@@ -2338,6 +2364,13 @@ struct v3dv_pipeline {
uint8_t stencil_cfg[2][V3DV_STENCIL_CFG_LENGTH];
};
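+/* The TEXTURE_SHADER_STATE R/B-swap and reverse bits only exist from hw
+ * revision 7.1.5 onwards.
+ */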
+static inline bool
+v3dv_texture_shader_state_has_rb_swap_reverse_bits(const struct v3dv_device *device)
+{
+ return device->devinfo.ver > 71 ||
+ (device->devinfo.ver == 71 && device->devinfo.rev >= 5);
+}
+
static inline VkPipelineBindPoint
v3dv_pipeline_get_binding_point(struct v3dv_pipeline *pipeline)
{
@@ -2500,10 +2533,6 @@ void
v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline,
struct v3dv_pipeline_cache *cache);
-struct v3dv_bo *
-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
- struct v3dv_pipeline *pipeline);
-
VkResult
v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device,
nir_shader *nir,
@@ -2608,12 +2637,32 @@ u64_compare(const void *key1, const void *key2)
case 42: \
v3d_X_thing = &v3d42_##thing; \
break; \
+ case 71: \
+ v3d_X_thing = &v3d71_##thing; \
+ break; \
default: \
unreachable("Unsupported hardware generation"); \
} \
v3d_X_thing; \
})
+/* Helper to get hw-specific macro values */
+#define V3DV_X(device, thing) ({ \
+ __typeof(V3D42_##thing) V3D_X_THING; \
+ switch (device->devinfo.ver) { \
+ case 42: \
+ V3D_X_THING = V3D42_##thing; \
+ break; \
+ case 71: \
+ V3D_X_THING = V3D71_##thing; \
+ break; \
+ default: \
+ unreachable("Unsupported hardware generation"); \
+ } \
+ V3D_X_THING; \
+})
+
+
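+/* A minimal usage sketch of the two dispatch macros above (the variables
+ * here are assumed to be in scope; both call patterns appear later in this
+ * patch):
+ *
+ *    // Runtime dispatch on devinfo.ver: resolves to
+ *    // v3d42_pack_sampler_state or v3d71_pack_sampler_state.
+ *    v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info);
+ *
+ *    // Per-version constant: resolves to V3D42_CLIPPER_XY_GRANULARITY
+ *    // (256.0f) or V3D71_CLIPPER_XY_GRANULARITY (64.0f).
+ *    float granularity = V3DV_X(device, CLIPPER_XY_GRANULARITY);
+ */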
/* v3d_macros from common requires v3dX and V3DX definitions. Below we need to
* define v3dX for each version supported, because when we compile code that
@@ -2626,6 +2675,10 @@ u64_compare(const void *key1, const void *key2)
# define v3dX(x) v3d42_##x
# include "v3dvx_private.h"
# undef v3dX
+
+# define v3dX(x) v3d71_##x
+# include "v3dvx_private.h"
+# undef v3dX
#endif
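+
+/* For reference, re-including v3dvx_private.h under each v3dX definition
+ * stamps out per-generation symbols: a prototype written as
+ *
+ *    void v3dX(job_emit_noop)(struct v3dv_job *job);
+ *
+ * declares v3d42_job_emit_noop() on the first include and
+ * v3d71_job_emit_noop() on the second.
+ */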
#ifdef ANDROID
diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c
index 3284c467d74..deb7821f02b 100644
--- a/src/broadcom/vulkan/v3dv_query.c
+++ b/src/broadcom/vulkan/v3dv_query.c
@@ -23,7 +23,6 @@
#include "v3dv_private.h"
-#include "common/v3d_performance_counters.h"
#include "util/timespec.h"
#include "compiler/nir/nir_builder.h"
@@ -48,7 +47,7 @@ kperfmon_create(struct v3dv_device *device,
DRM_IOCTL_V3D_PERFMON_CREATE,
&req);
if (ret)
- fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret));
+ fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret));
pool->queries[query].perf.kperfmon_ids[i] = req.id;
}
@@ -303,7 +302,6 @@ v3dv_CreateQueryPool(VkDevice _device,
QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
assert(pq_info);
- assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM);
pool->perfmon.ncounters = pq_info->counterIndexCount;
for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
@@ -592,7 +590,7 @@ write_performance_query_result(struct v3dv_device *device,
assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
struct v3dv_query *q = &pool->queries[query];
- uint64_t counter_values[V3D_PERFCNT_NUM];
+ uint64_t counter_values[V3D_MAX_PERFCNT];
for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
struct drm_v3d_perfmon_get_values req = {
@@ -1284,40 +1282,11 @@ v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
VkPerformanceCounterKHR *pCounters,
VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
{
- uint32_t desc_count = *pCounterCount;
+ V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice);
- VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
- out, pCounters, pCounterCount);
- VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
- out_desc, pCounterDescriptions, &desc_count);
-
- for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) {
- vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
- counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
- counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
- counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
-
- unsigned char sha1_result[20];
- _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME],
- strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]),
- sha1_result);
-
- memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
- }
-
- vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
- &out_desc, desc) {
- desc->flags = 0;
- snprintf(desc->name, sizeof(desc->name), "%s",
- v3d_performance_counters[i][V3D_PERFCNT_NAME]);
- snprintf(desc->category, sizeof(desc->category), "%s",
- v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]);
- snprintf(desc->description, sizeof(desc->description), "%s",
- v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]);
- }
- }
-
- return vk_outarray_status(&out);
+ return v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount,
+ pCounters,
+ pCounterDescriptions);
}
VKAPI_ATTR void VKAPI_CALL
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index b4aae195180..429d14a9196 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -408,7 +408,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
if (memcmp(group_counts, info->csd_job->csd.wg_count,
sizeof(info->csd_job->csd.wg_count)) != 0) {
- v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);
+ v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts);
}
return VK_SUCCESS;
diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c
index 72fa9a1b39c..0e681cc4ee2 100644
--- a/src/broadcom/vulkan/v3dv_uniforms.c
+++ b/src/broadcom/vulkan/v3dv_uniforms.c
@@ -497,7 +497,8 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect);
struct v3dv_cl_out *uniforms = cl_start(&job->indirect);
-
+ float clipper_xy_granularity =
+ V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY);
for (int i = 0; i < uinfo->count; i++) {
uint32_t data = uinfo->data[i];
@@ -520,11 +521,11 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
break;
case QUNIFORM_VIEWPORT_X_SCALE:
- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f);
+ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity);
break;
case QUNIFORM_VIEWPORT_Y_SCALE:
- cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f);
+ cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity);
break;
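+      /* A sketch of the effect of the granularity switch (values from the
+       * defines added in v3dvx_private.h): the clipper works in 1/256th of
+       * a pixel on v42 and in 1/64th on v71, so e.g. for X:
+       *
+       *    v42: viewport.scale[0][0] * 256.0f
+       *    v71: viewport.scale[0][0] * 64.0f
+       */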
case QUNIFORM_VIEWPORT_Z_OFFSET: {
diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
index f182b790d36..1bd634f5027 100644
--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
@@ -56,10 +56,15 @@ v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job)
};
config.width_in_pixels = tiling->width;
config.height_in_pixels = tiling->height;
+#if V3D_VERSION == 42
config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ unreachable("HW generation 71 not supported yet.");
+#endif
uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr;
cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config);
@@ -82,10 +87,22 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
config.width_in_pixels = tiling->width;
config.height_in_pixels = tiling->height;
+#if V3D_VERSION == 42
config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+      /* FIXME: ideally the following assert would live in the packet header
+       * (it is generic, so it also applies to GL), but that would require
+       * expanding gen_pack_header.
+       */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
}
/* There's definitely nothing in the VCD cache we want. */
@@ -345,6 +362,11 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer,
iview->vk.base_array_layer + layer,
image_plane);
+ /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it
+ * is broken in earlier V3D versions.
+ */
+ assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear);
+
cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = buffer;
store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
@@ -467,6 +489,30 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
const VkImageAspectFlags aspects =
vk_format_aspects(ds_attachment->desc.format);
+#if V3D_VERSION <= 42
+ /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
+ * for depth/stencil.
+ *
+ * There used to be some confusion regarding the Clear Tile Buffers
+ * Z/S bit also being broken, but we confirmed with Broadcom that this
+ * is not the case, it was just that some other hardware bugs (that we
+ * need to work around, such as GFXH-1461) could cause this bit to behave
+ * incorrectly.
+ *
+ * There used to be another issue where the RTs bit in the Clear Tile
+ * Buffers packet also cleared Z/S, but Broadcom confirmed this is
+ * fixed since V3D 4.1.
+ *
+ * So if we have to emit a clear of depth or stencil we don't use
+ * the per-buffer store clear bit, even if we need to store the buffers,
+ * instead we always have to use the Clear Tile Buffers Z/S bit.
+ * If we have configured the job to do early Z/S clearing, then we
+ * don't want to emit any Clear Tile Buffers command at all here.
+ *
+ * Note that GFXH-1689 is not reproduced in the simulator, where
+ * using the clear buffer bit in depth/stencil stores works fine.
+ */
+
/* Only clear once on the first subpass that uses the attachment */
uint32_t ds_first_subpass = !state->pass->multiview_enabled ?
ds_attachment->first_subpass :
@@ -486,6 +532,17 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
ds_attachment->desc.stencilLoadOp,
subpass->do_stencil_clear_with_draw);
+ use_global_zs_clear = !state->job->early_zs_clear &&
+ (needs_depth_clear || needs_stencil_clear);
+#endif
+#if V3D_VERSION >= 71
+   /* The store command's clear buffer bit cannot be used for Z/S on 7.x, and
+    * since V3D 4.5.6 Z/S buffers are automatically cleared between tiles
+    * anyway, we don't want to emit redundant clears here.
+    */
+ use_global_zs_clear = false;
+#endif
+
/* Skip the last store if it is not required */
uint32_t ds_last_subpass = !pass->multiview_enabled ?
ds_attachment->last_subpass :
@@ -528,30 +585,6 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
needs_stencil_store = subpass->resolve_stencil;
}
- /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
- * for depth/stencil.
- *
- * There used to be some confusion regarding the Clear Tile Buffers
- * Z/S bit also being broken, but we confirmed with Broadcom that this
- * is not the case, it was just that some other hardware bugs (that we
- * need to work around, such as GFXH-1461) could cause this bit to behave
- * incorrectly.
- *
- * There used to be another issue where the RTs bit in the Clear Tile
- * Buffers packet also cleared Z/S, but Broadcom confirmed this is
- * fixed since V3D 4.1.
- *
- * So if we have to emit a clear of depth or stencil we don't use
- * the per-buffer store clear bit, even if we need to store the buffers,
- * instead we always have to use the Clear Tile Buffers Z/S bit.
- * If we have configured the job to do early Z/S clearing, then we
- * don't want to emit any Clear Tile Buffers command at all here.
- *
- * Note that GFXH-1689 is not reproduced in the simulator, where
- * using the clear buffer bit in depth/stencil stores works fine.
- */
- use_global_zs_clear = !state->job->early_zs_clear &&
- (needs_depth_clear || needs_stencil_clear);
if (needs_depth_store || needs_stencil_store) {
const uint32_t zs_buffer =
v3dv_zs_buffer(needs_depth_store, needs_stencil_store);
@@ -649,10 +682,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
* bit and instead we have to emit a single clear of all tile buffers.
*/
if (use_global_zs_clear || use_global_rt_clear) {
+#if V3D_VERSION == 42
cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = use_global_zs_clear;
clear.clear_all_render_targets = use_global_rt_clear;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
+#endif
}
}
@@ -778,6 +816,103 @@ set_rcl_early_z_config(struct v3dv_job *job,
}
}
+/* Note that for v71 the render target cfg packets have just one field that
+ * combines the internal type and the clamp mode. For simplicity we keep just
+ * one helper.
+ *
+ * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
+ *
+ * FIXME: for v71 we are not returning all the possible combinations of
+ * render target internal type and clamp. For example, for int types we are
+ * always using int clamp, and for 16f we are using clamp none or norm (which
+ * seem to be the equivalent of no-clamp on 4.2), but not pq or hlg. In
+ * summary, right now we are just porting what we were doing on 4.2.
+ */
+uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ VkFormat vk_format)
+{
+#if V3D_VERSION == 42
+ if (vk_format_is_int(vk_format))
+ return V3D_RENDER_TARGET_CLAMP_INT;
+ else if (vk_format_is_srgb(vk_format))
+ return V3D_RENDER_TARGET_CLAMP_NORM;
+ else
+ return V3D_RENDER_TARGET_CLAMP_NONE;
+#endif
+#if V3D_VERSION >= 71
+ switch (rt_type) {
+ case V3D_INTERNAL_TYPE_8I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
+ case V3D_INTERNAL_TYPE_8UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_8:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8;
+ case V3D_INTERNAL_TYPE_16I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
+ case V3D_INTERNAL_TYPE_16UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_16F:
+ return vk_format_is_srgb(vk_format) ?
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F;
+ case V3D_INTERNAL_TYPE_32I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
+ case V3D_INTERNAL_TYPE_32UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_32F:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
+ default:
+ unreachable("Unknown internal render target type");
+ }
+
+ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
+#endif
+}
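+
+/* Usage sketch of the helper (the internal type and format here are
+ * illustrative):
+ *
+ *    uint32_t clamp =
+ *       v3dX(clamp_for_format_and_type)(V3D_INTERNAL_TYPE_16F,
+ *                                       VK_FORMAT_R16G16B16A16_SFLOAT);
+ *    // -> V3D_RENDER_TARGET_TYPE_CLAMP_16F on v71 (no clamping),
+ *    //    V3D_RENDER_TARGET_CLAMP_NONE on v42.
+ */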
+
+static void
+cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer,
+ int rt,
+ uint32_t *rt_bpp,
+#if V3D_VERSION == 42
+ uint32_t *rt_type,
+ uint32_t *rt_clamp)
+#else
+ uint32_t *rt_type_clamp)
+#endif
+{
+ const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+
+ assert(state->subpass_idx < state->pass->subpass_count);
+ const struct v3dv_subpass *subpass =
+ &state->pass->subpasses[state->subpass_idx];
+
+ if (rt >= subpass->color_count)
+ return;
+
+ struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
+ const uint32_t attachment_idx = attachment->attachment;
+ if (attachment_idx == VK_ATTACHMENT_UNUSED)
+ return;
+
+ assert(attachment_idx < state->framebuffer->attachment_count &&
+ attachment_idx < state->attachment_alloc_count);
+ struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
+ assert(vk_format_is_color(iview->vk.format));
+
+ assert(iview->plane_count == 1);
+ *rt_bpp = iview->planes[0].internal_bpp;
+#if V3D_VERSION == 42
+ *rt_type = iview->planes[0].internal_type;
+ *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
+ iview->vk.format);
+#endif
+#if V3D_VERSION >= 71
+ *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
+ iview->vk.format);
+#endif
+}
+
void
v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -824,7 +959,19 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
config.number_of_render_targets = MAX2(subpass->color_count, 1);
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+      /* FIXME: ideally the following assert would live in the packet header
+       * (it is generic, so it also applies to GL), but that would require
+       * expanding gen_pack_header.
+       */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
const struct v3dv_image_view *iview =
@@ -851,6 +998,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
* Early-Z/S clearing is independent of Early Z/S testing, so it is
* possible to enable one but not the other so long as their
* respective requirements are met.
+ *
+ * From V3D 4.5.6, Z/S buffers are always cleared automatically
+ * between tiles, but we still want to enable early ZS clears
+ * when Z/S are not loaded or stored.
*/
struct v3dv_render_pass_attachment *ds_attachment =
&pass->attachments[ds_attachment_idx];
@@ -858,21 +1009,33 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
const VkImageAspectFlags ds_aspects =
vk_format_aspects(ds_attachment->desc.format);
- bool needs_depth_clear =
- check_needs_clear(state,
- ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
- ds_attachment->first_subpass,
- ds_attachment->desc.loadOp,
- subpass->do_depth_clear_with_draw);
-
bool needs_depth_store =
v3dv_cmd_buffer_check_needs_store(state,
ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
ds_attachment->last_subpass,
ds_attachment->desc.storeOp) ||
subpass->resolve_depth;
+#if V3D_VERSION <= 42
+ bool needs_depth_clear =
+ check_needs_clear(state,
+ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_attachment->first_subpass,
+ ds_attachment->desc.loadOp,
+ subpass->do_depth_clear_with_draw);
do_early_zs_clear = needs_depth_clear && !needs_depth_store;
+#endif
+#if V3D_VERSION >= 71
+ bool needs_depth_load =
+ v3dv_cmd_buffer_check_needs_load(state,
+ ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ds_attachment->first_subpass,
+ ds_attachment->desc.loadOp,
+ ds_attachment->last_subpass,
+ ds_attachment->desc.storeOp);
+ do_early_zs_clear = !needs_depth_load && !needs_depth_store;
+#endif
+
if (do_early_zs_clear &&
vk_format_has_stencil(ds_attachment->desc.format)) {
bool needs_stencil_load =
@@ -905,10 +1068,20 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
*/
job->early_zs_clear = do_early_zs_clear;
+#if V3D_VERSION >= 71
+ uint32_t base_addr = 0;
+#endif
for (uint32_t i = 0; i < subpass->color_count; i++) {
uint32_t attachment_idx = subpass->color_attachments[i].attachment;
- if (attachment_idx == VK_ATTACHMENT_UNUSED)
+ if (attachment_idx == VK_ATTACHMENT_UNUSED) {
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.render_target_number = i;
+ rt.stride = 1; /* Unused */
+ }
+#endif
continue;
+ }
struct v3dv_image_view *iview =
state->attachments[attachment_idx].image_view;
@@ -920,10 +1093,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
const struct v3d_resource_slice *slice =
&image->planes[plane].slices[iview->vk.base_mip_level];
- const uint32_t *clear_color =
+ UNUSED const uint32_t *clear_color =
&state->attachments[attachment_idx].clear_value.color[0];
- uint32_t clear_pad = 0;
+ UNUSED uint32_t clear_pad = 0;
if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
slice->tiling == V3D_TILING_UIF_XOR) {
int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2;
@@ -937,6 +1110,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
}
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
clear.clear_color_low_32_bits = clear_color[0];
clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
@@ -960,22 +1134,74 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
clear.render_target_number = i;
};
}
+#endif
+
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.clear_color_low_bits = clear_color[0];
+ cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp,
+ &rt.internal_type_and_clamping);
+ rt.stride =
+ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
+ v3d_internal_bpp_words(rt.internal_bpp));
+ rt.base_address = base_addr;
+ rt.render_target_number = i;
+
+ /* base_addr in multiples of 512 bits. We divide by 8 because stride
+ * is in 128-bit units, but it is packing 2 rows worth of data, so we
+ * need to divide it by 2 so it is only 1 row, and then again by 4 so
+ * it is in 512-bit units.
+ */
+ base_addr += (tiling->tile_height * rt.stride) / 8;
+ }
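+      /* Worked example of the unit conversion above (illustrative numbers):
+       * with tile_height = 32 and rt.stride = 16 (128-bit units, each
+       * packing two rows), tile_height * stride = 512; dividing by 2
+       * (single row) and then by 4 (128-bit to 512-bit units), i.e. by 8,
+       * advances base_addr by 64 multiples of 512 bits.
+       */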
+
+ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+ ((uint64_t) clear_color[1]) |
+ (((uint64_t) (clear_color[2] & 0xff)) << 32);
+ rt.render_target_number = i;
+ }
+ }
+
+ if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+ rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+ (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) |
+ (((uint64_t) (clear_color[3])) << 24);
+ rt.render_target_number = i;
+ }
+ }
+#endif
}
+#if V3D_VERSION >= 71
+   /* If we don't have any color RTs, we still need to emit one and flag
+    * it as unused by setting stride = 1.
+    */
+ if (subpass->color_count == 0) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.stride = 1;
+ }
+ }
+#endif
+
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 0, &rt.render_target_0_internal_bpp,
&rt.render_target_0_internal_type, &rt.render_target_0_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 1, &rt.render_target_1_internal_bpp,
&rt.render_target_1_internal_type, &rt.render_target_1_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 2, &rt.render_target_2_internal_bpp,
&rt.render_target_2_internal_type, &rt.render_target_2_clamp);
- v3dX(cmd_buffer_render_pass_setup_render_target)
+ cmd_buffer_render_pass_setup_render_target
(cmd_buffer, 3, &rt.render_target_3_internal_bpp,
&rt.render_target_3_internal_type, &rt.render_target_3_clamp);
}
+#endif
/* Ends rendering mode config. */
if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
@@ -1036,10 +1262,15 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
}
if (cmd_buffer->state.tile_aligned_render_area &&
(i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
+#if V3D_VERSION == 42
cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = !job->early_zs_clear;
clear.clear_all_render_targets = true;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt);
+#endif
}
cl_emit(rcl, END_OF_TILE_MARKER, end);
}
@@ -1054,6 +1285,43 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
cl_emit(rcl, END_OF_RENDERING, end);
}
+void
+v3dX(viewport_compute_xform)(const VkViewport *viewport,
+ float scale[3],
+ float translate[3])
+{
+ float x = viewport->x;
+ float y = viewport->y;
+ float half_width = 0.5f * viewport->width;
+ float half_height = 0.5f * viewport->height;
+ double n = viewport->minDepth;
+ double f = viewport->maxDepth;
+
+ scale[0] = half_width;
+ translate[0] = half_width + x;
+ scale[1] = half_height;
+ translate[1] = half_height + y;
+
+ scale[2] = (f - n);
+ translate[2] = n;
+
+   /* It seems that if the scale is small enough the hardware won't clip
+    * correctly, so we work around this by choosing the smallest scale that
+    * seems to work.
+ *
+ * This case is exercised by CTS:
+ * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero
+ *
+ * V3D 7.x fixes this by using the new
+ * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND.
+ */
+#if V3D_VERSION <= 42
+ const float min_abs_scale = 0.0005f;
+ if (fabs(scale[2]) < min_abs_scale)
+ scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
+#endif
+}
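+
+/* Worked example of the transform above (illustrative values): a VkViewport
+ * of x = 0, y = 0, width = 800, height = 600, minDepth = 0.0, maxDepth = 1.0
+ * yields scale = { 400, 300, 1 } and translate = { 400, 300, 0 }, i.e. the
+ * usual NDC [-1, 1] to window-coordinate mapping.
+ */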
+
void
v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -1078,19 +1346,45 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
v3dv_return_if_oom(cmd_buffer, NULL);
+#if V3D_VERSION == 42
cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+ clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f;
+ clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f;
+ }
+#endif
float translate_z, scale_z;
v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0,
&translate_z, &scale_z);
+#if V3D_VERSION == 42
cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
clip.viewport_z_offset_zc_to_zs = translate_z;
clip.viewport_z_scale_zc_to_zs = scale_z;
}
+#endif
+
+#if V3D_VERSION >= 71
+ /* If the Z scale is too small guardband clipping may not clip correctly */
+ if (fabsf(scale_z) < 0.01f) {
+ cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) {
+ clip.viewport_z_offset_zc_to_zs = translate_z;
+ clip.viewport_z_scale_zc_to_zs = scale_z;
+ }
+ } else {
+ cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
+ clip.viewport_z_offset_zc_to_zs = translate_z;
+ clip.viewport_z_scale_zc_to_zs = scale_z;
+ }
+ }
+#endif
+
cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
/* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled,
* we are using OpenGL's [-1, 1] instead.
@@ -1205,14 +1499,48 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer)
cl_emit(&job->bcl, DEPTH_OFFSET, bias) {
bias.depth_offset_factor = dynamic->depth_bias.slope_factor;
bias.depth_offset_units = dynamic->depth_bias.constant_factor;
+#if V3D_VERSION <= 42
if (pipeline->depth_bias.is_z16)
bias.depth_offset_units *= 256.0f;
+#endif
bias.limit = dynamic->depth_bias.depth_bias_clamp;
}
cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS;
}
+void
+v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   /* No depthBounds support for v42, so this method is empty in that case.
+    *
+    * Note that this method is still called because v3dv_job_init flags all
+    * state as dirty. See the FIXME note at v3dv_job_init.
+    */
+
+#if V3D_VERSION >= 71
+ struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
+ assert(pipeline);
+
+ if (!pipeline->depth_bounds_test_enabled)
+ return;
+
+ struct v3dv_job *job = cmd_buffer->state.job;
+ assert(job);
+
+ v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS));
+ v3dv_return_if_oom(cmd_buffer, NULL);
+
+ struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
+ cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) {
+ bounds.lower_test_limit = dynamic->depth_bounds.min;
+ bounds.upper_test_limit = dynamic->depth_bounds.max;
+ }
+
+ cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BOUNDS;
+#endif
+}
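+
+/* For context, the limits consumed above come from standard Vulkan dynamic
+ * state; a minimal sketch (commandBuffer is an assumed handle):
+ *
+ *    // Recorded into cmd_buffer->state.dynamic.depth_bounds and emitted
+ *    // as DEPTH_BOUNDS_TEST_LIMITS on v71.
+ *    vkCmdSetDepthBounds(commandBuffer, 0.1f, 0.9f);
+ */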
+
void
v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer)
{
@@ -1256,10 +1584,13 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
assert(pipeline);
+ const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo;
+ const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver);
+
const uint32_t blend_packets_size =
cl_packet_length(BLEND_ENABLES) +
cl_packet_length(BLEND_CONSTANT_COLOR) +
- cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS;
+ cl_packet_length(BLEND_CFG) * max_color_rts;
v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size);
v3dv_return_if_oom(cmd_buffer, NULL);
@@ -1271,7 +1602,7 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
}
}
- for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
+ for (uint32_t i = 0; i < max_color_rts; i++) {
if (pipeline->blend.enables & (1 << i))
cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]);
}
@@ -1298,9 +1629,15 @@ v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
+ uint32_t color_write_mask = ~dynamic->color_write_enable |
+ pipeline->blend.color_write_masks;
+#if V3D_VERSION <= 42
+ /* Only 4 RTs */
+ color_write_mask &= 0xffff;
+#endif
+
cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
- mask.mask = (~dynamic->color_write_enable |
- pipeline->blend.color_write_masks) & 0xffff;
+ mask.mask = color_write_mask;
}
cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
@@ -1591,15 +1928,16 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
assert(pipeline);
- bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
-
v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
v3dv_return_if_oom(cmd_buffer, NULL);
cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
+#if V3D_VERSION == 42
+ bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
config.early_z_enable = enable_ez;
config.early_z_updates_enable = config.early_z_enable &&
pipeline->z_updates_enable;
+#endif
}
}
@@ -1845,7 +2183,9 @@ emit_gs_shader_state_record(struct v3dv_job *job,
gs_bin->prog_data.gs->base.threads == 4;
shader.geometry_bin_mode_shader_start_in_final_thread_section =
gs_bin->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
shader.geometry_bin_mode_shader_propagate_nans = true;
+#endif
shader.geometry_bin_mode_shader_uniforms_address =
gs_bin_uniforms;
@@ -1855,7 +2195,9 @@ emit_gs_shader_state_record(struct v3dv_job *job,
gs->prog_data.gs->base.threads == 4;
shader.geometry_render_mode_shader_start_in_final_thread_section =
gs->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
shader.geometry_render_mode_shader_propagate_nans = true;
+#endif
shader.geometry_render_mode_shader_uniforms_address =
gs_render_uniforms;
}
@@ -2031,10 +2373,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
pipeline->vpm_cfg.Gv);
}
+#if V3D_VERSION == 42
struct v3dv_bo *default_attribute_values =
pipeline->default_attribute_values != NULL ?
pipeline->default_attribute_values :
pipeline->device->default_attribute_float;
+#endif
cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
pipeline->shader_state_record, shader) {
@@ -2060,8 +2404,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
+#if V3D_VERSION == 42
shader.address_of_default_attribute_values =
v3dv_cl_address(default_attribute_values, 0);
+#endif
shader.any_shader_reads_hardware_written_primitive_id =
(pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
@@ -2370,40 +2716,3 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
buffer->mem_offset + offset);
}
}
-
-void
-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
- int rt,
- uint32_t *rt_bpp,
- uint32_t *rt_type,
- uint32_t *rt_clamp)
-{
- const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
-
- assert(state->subpass_idx < state->pass->subpass_count);
- const struct v3dv_subpass *subpass =
- &state->pass->subpasses[state->subpass_idx];
-
- if (rt >= subpass->color_count)
- return;
-
- struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
- const uint32_t attachment_idx = attachment->attachment;
- if (attachment_idx == VK_ATTACHMENT_UNUSED)
- return;
-
- assert(attachment_idx < state->framebuffer->attachment_count &&
- attachment_idx < state->attachment_alloc_count);
- struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
- assert(vk_format_is_color(iview->vk.format));
-
- assert(iview->plane_count == 1);
- *rt_bpp = iview->planes[0].internal_bpp;
- *rt_type = iview->planes[0].internal_type;
- if (vk_format_is_int(iview->vk.view_format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
- else if (vk_format_is_srgb(iview->vk.view_format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
- else
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
-}
diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c
index e235983864c..1b50d51e19f 100644
--- a/src/broadcom/vulkan/v3dvx_device.c
+++ b/src/broadcom/vulkan/v3dvx_device.c
@@ -49,8 +49,8 @@ vk_to_v3d_compare_func[] = {
[VK_COMPARE_OP_ALWAYS] = V3D_COMPARE_FUNC_ALWAYS,
};
-
static union pipe_color_union encode_border_color(
+ const struct v3dv_device *device,
const VkSamplerCustomBorderColorCreateInfoEXT *bc_info)
{
const struct util_format_description *desc =
@@ -77,12 +77,28 @@ static union pipe_color_union encode_border_color(
* colors so we need to fix up the swizzle manually for this case.
*/
uint8_t swizzle[4];
- if (v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) &&
+ const bool v3d_has_reverse_swap_rb_bits =
+ v3dv_texture_shader_state_has_rb_swap_reverse_bits(device);
+ if (!v3d_has_reverse_swap_rb_bits &&
+ v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) &&
v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle)) {
swizzle[0] = PIPE_SWIZZLE_W;
swizzle[1] = PIPE_SWIZZLE_X;
swizzle[2] = PIPE_SWIZZLE_Y;
swizzle[3] = PIPE_SWIZZLE_Z;
+ }
+ /* In v3d 7.x we no longer have a reverse flag for the border color. Instead
+    * we have to use the new reverse and swap_r/b flags in the texture shader
+    * state, which will apply the format swizzle automatically when sampling
+    * the border color too, so we should not apply it manually here.
+ */
+ else if (v3d_has_reverse_swap_rb_bits &&
+ (v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle) ||
+ v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle))) {
+ swizzle[0] = PIPE_SWIZZLE_X;
+ swizzle[1] = PIPE_SWIZZLE_Y;
+ swizzle[2] = PIPE_SWIZZLE_Z;
+ swizzle[3] = PIPE_SWIZZLE_W;
} else {
memcpy(swizzle, format->planes[0].swizzle, sizeof (swizzle));
}
@@ -118,7 +134,11 @@ static union pipe_color_union encode_border_color(
(1 << (desc->channel[i].size - 1)) - 1);
}
- /* convert from float to expected format */
+#if V3D_VERSION <= 42
+ /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions
+ * for us. In V3D 4.x we need to manually convert floating point color
+ * values to the expected format.
+ */
if (vk_format_is_srgb(bc_info->format) ||
vk_format_is_compressed(bc_info->format)) {
for (int i = 0; i < 4; i++)
@@ -170,12 +190,14 @@ static union pipe_color_union encode_border_color(
}
}
}
+#endif
return border;
}
void
-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
+v3dX(pack_sampler_state)(const struct v3dv_device *device,
+ struct v3dv_sampler *sampler,
const VkSamplerCreateInfo *pCreateInfo,
const VkSamplerCustomBorderColorCreateInfoEXT *bc_info)
{
@@ -217,7 +239,7 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
s.border_color_mode = border_color_mode;
if (s.border_color_mode == V3D_BORDER_COLOR_FOLLOWS) {
- union pipe_color_union border = encode_border_color(bc_info);
+ union pipe_color_union border = encode_border_color(device, bc_info);
s.border_color_word_0 = border.ui[0];
s.border_color_word_1 = border.ui[1];
@@ -253,11 +275,13 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
const struct v3dv_framebuffer *framebuffer,
const struct v3dv_cmd_buffer_attachment_state *attachments,
const struct v3dv_subpass *subpass,
- uint8_t *max_bpp,
+ uint8_t *max_internal_bpp,
+ uint8_t *total_color_bpp,
bool *msaa)
{
STATIC_ASSERT(V3D_INTERNAL_BPP_32 == 0);
- *max_bpp = V3D_INTERNAL_BPP_32;
+ *max_internal_bpp = V3D_INTERNAL_BPP_32;
+ *total_color_bpp = 0;
*msaa = false;
if (subpass) {
@@ -270,8 +294,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
assert(att);
assert(att->plane_count == 1);
- if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT)
- *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp);
+ if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
+ const uint32_t internal_bpp = att->planes[0].internal_bpp;
+ *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp);
+ *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
+ }
if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
*msaa = true;
@@ -285,7 +312,6 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
*msaa = true;
}
-
return;
}
@@ -295,8 +321,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
assert(att);
assert(att->plane_count == 1);
- if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT)
- *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp);
+ if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
+ const uint32_t internal_bpp = att->planes[0].internal_bpp;
+ *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp);
+ *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
+ }
if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
*msaa = true;
diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c
index 80a3e5bfde8..de984e81220 100644
--- a/src/broadcom/vulkan/v3dvx_image.c
+++ b/src/broadcom/vulkan/v3dvx_image.c
@@ -76,8 +76,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]);
tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]);
- tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse;
-
tex.texture_type = image_view->format->planes[plane].tex_type;
if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
@@ -110,8 +108,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64;
- tex.srgb = vk_format_is_srgb(image_view->vk.view_format);
-
/* At this point we don't have the job. That's the reason the first
* parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
* add the bo to the job. This also means that we need to add manually
@@ -122,6 +118,51 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
v3dv_layer_offset(image, 0, image_view->vk.base_array_layer,
iplane);
tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+ bool is_srgb = vk_format_is_srgb(image_view->vk.format);
+
+ /* V3D 4.x doesn't have the reverse and swap_r/b bits, so we compose
+ * the reverse and/or swap_r/b swizzle from the format table with the
+ * image view swizzle. This, however, doesn't work for border colors,
+ * for that there is the reverse_standard_border_color.
+ *
+ * In v3d 7.x, however, there is no reverse_standard_border_color bit,
+ * since the reverse and swap_r/b bits also affect border colors. It is
+ * because of this that we absolutely need to use these bits with
+       * reversed and swapped formats, since that's the only way to ensure
+ * correct border colors. In that case we don't want to program the
+ * swizzle to the composition of the format swizzle and the view
+ * swizzle like we do in v3d 4.x, since the format swizzle is applied
+ * via the reverse and swap_r/b bits.
+ */
+#if V3D_VERSION == 42
+ tex.srgb = is_srgb;
+ tex.reverse_standard_border_color =
+ image_view->planes[plane].channel_reverse;
+#endif
+#if V3D_VERSION >= 71
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+
+ tex.reverse = image_view->planes[plane].channel_reverse;
+ tex.r_b_swap = image_view->planes[plane].swap_rb;
+
+ if (tex.reverse || tex.r_b_swap) {
+ tex.swizzle_r =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[0]);
+ tex.swizzle_g =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[1]);
+ tex.swizzle_b =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[2]);
+ tex.swizzle_a =
+ v3d_translate_pipe_swizzle(image_view->view_swizzle[3]);
+ }
+
+ tex.chroma_offset_x = 1;
+ tex.chroma_offset_y = 1;
+ /* See comment in XML field definition for rationale of the shifts */
+ tex.texture_base_pointer_cb = base_offset >> 6;
+ tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
}
}
}
@@ -166,7 +207,14 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
assert(buffer_view->format->plane_count == 1);
tex.texture_type = buffer_view->format->planes[0].tex_type;
- tex.srgb = vk_format_is_srgb(buffer_view->vk_format);
+
+ bool is_srgb = vk_format_is_srgb(buffer_view->vk_format);
+#if V3D_VERSION == 42
+ tex.srgb = is_srgb;
+#endif
+#if V3D_VERSION >= 71
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+#endif
/* At this point we don't have the job. That's the reason the first
* parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
@@ -179,5 +227,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
buffer_view->offset;
tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+#if V3D_VERSION >= 71
+ tex.chroma_offset_x = 1;
+ tex.chroma_offset_y = 1;
+ /* See comment in XML field definition for rationale of the shifts */
+ tex.texture_base_pointer_cb = base_offset >> 6;
+ tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
}
}
diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
index 04147b82cbd..858096f9e4b 100644
--- a/src/broadcom/vulkan/v3dvx_meta_common.c
+++ b/src/broadcom/vulkan/v3dvx_meta_common.c
@@ -26,6 +26,7 @@
#include "broadcom/common/v3d_macros.h"
#include "broadcom/common/v3d_tfu.h"
+#include "broadcom/common/v3d_util.h"
#include "broadcom/cle/v3dx_pack.h"
#include "broadcom/compiler/v3d_compiler.h"
@@ -58,12 +59,25 @@ emit_rcl_prologue(struct v3dv_job *job,
config.number_of_render_targets = 1;
config.multisample_mode_4x = tiling->msaa;
config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(tiling->tile_width);
+ config.log2_tile_height = log2_tile_size(tiling->tile_height);
+   /* FIXME: ideally the following assert would live in the packet header
+    * (it is generic, so it also applies to GL), but that would require
+    * expanding gen_pack_header.
+    */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
config.internal_depth_type = fb->internal_depth_type;
}
+ const uint32_t *color = NULL;
if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) {
- uint32_t clear_pad = 0;
+ UNUSED uint32_t clear_pad = 0;
if (clear_info->image) {
const struct v3dv_image *image = clear_info->image;
@@ -88,7 +102,9 @@ emit_rcl_prologue(struct v3dv_job *job,
}
}
- const uint32_t *color = &clear_info->clear_value->color[0];
+ color = &clear_info->clear_value->color[0];
+
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
clear.clear_color_low_32_bits = color[0];
clear.clear_color_next_24_bits = color[1] & 0x00ffffff;
@@ -112,13 +128,49 @@ emit_rcl_prologue(struct v3dv_job *job,
clear.render_target_number = 0;
};
}
+#endif
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
rt.render_target_0_internal_bpp = tiling->internal_bpp;
rt.render_target_0_internal_type = fb->internal_type;
rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
}
+#endif
+
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ if (color)
+ rt.clear_color_low_bits = color[0];
+ rt.internal_bpp = tiling->internal_bpp;
+ rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type,
+ fb->vk_format);
+ rt.stride =
+ v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
+ v3d_internal_bpp_words(rt.internal_bpp));
+ rt.base_address = 0;
+ rt.render_target_number = 0;
+ }
+
+ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+ ((uint64_t) color[1]) |
+ (((uint64_t) (color[2] & 0xff)) << 32);
+ rt.render_target_number = 0;
+ }
+ }
+
+ if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_128) {
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+ rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+ (((uint64_t) (color[2] & 0xffffff00)) >> 8) |
+ (((uint64_t) (color[3])) << 24);
+ rt.render_target_number = 0;
+ }
+ }
+#endif
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f;
@@ -179,10 +231,15 @@ emit_frame_setup(struct v3dv_job *job,
*/
if (clear_value &&
(i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
+#if V3D_VERSION == 42
cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = true;
clear.clear_all_render_targets = true;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(rcl, CLEAR_RENDER_TARGETS, clear);
+#endif
}
cl_emit(rcl, END_OF_TILE_MARKER, end);
}
@@ -893,6 +950,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
tfu.iia |= src_offset;
+#if V3D_VERSION <= 42
if (src_tiling == V3D_TILING_RASTER) {
tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT;
} else {
@@ -901,12 +959,46 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
V3D33_TFU_ICFG_FORMAT_SHIFT;
}
tfu.icfg |= format_plane->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT;
+#endif
+#if V3D_VERSION >= 71
+ if (src_tiling == V3D_TILING_RASTER) {
+ tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT;
+ } else {
+ tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE +
+ (src_tiling - V3D_TILING_LINEARTILE)) <<
+ V3D71_TFU_ICFG_IFORMAT_SHIFT;
+ }
+ tfu.icfg |= format_plane->tex_type << V3D71_TFU_ICFG_OTYPE_SHIFT;
+#endif
tfu.ioa = dst_offset;
+#if V3D_VERSION <= 42
tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE +
(dst_tiling - V3D_TILING_LINEARTILE)) <<
V3D33_TFU_IOA_FORMAT_SHIFT;
+#endif
+
+#if V3D_VERSION >= 71
+ tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE +
+ (dst_tiling - V3D_TILING_LINEARTILE)) <<
+ V3D71_TFU_IOC_FORMAT_SHIFT;
+
+ switch (dst_tiling) {
+ case V3D_TILING_UIF_NO_XOR:
+ case V3D_TILING_UIF_XOR:
+ tfu.v71.ioc |=
+ (dst_padded_height_or_stride / (2 * v3d_utile_height(dst_cpp))) <<
+ V3D71_TFU_IOC_STRIDE_SHIFT;
+ break;
+ case V3D_TILING_RASTER:
+ tfu.v71.ioc |= (dst_padded_height_or_stride / dst_cpp) <<
+ V3D71_TFU_IOC_STRIDE_SHIFT;
+ break;
+ default:
+ break;
+ }
+#endif
switch (src_tiling) {
case V3D_TILING_UIF_NO_XOR:
@@ -923,6 +1015,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
/* The TFU can handle raster sources but always produces UIF results */
assert(dst_tiling != V3D_TILING_RASTER);
+#if V3D_VERSION <= 42
/* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
* OPAD field for the destination (how many extra UIF blocks beyond
* those necessary to cover the height).
@@ -934,6 +1027,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
uif_block_h;
tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT;
}
+#endif
v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
}
@@ -1314,8 +1408,9 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t width, height;
framebuffer_size_for_pixel_count(num_items, &width, &height);
- v3dv_job_start_frame(job, width, height, 1, true, true,
- 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, 1, true, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type,
@@ -1361,8 +1456,9 @@ v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t width, height;
framebuffer_size_for_pixel_count(num_items, &width, &height);
- v3dv_job_start_frame(job, width, height, 1, true, true,
- 1, internal_bpp, false);
+ v3dv_job_start_frame(job, width, height, 1, true, true, 1,
+ internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+ false);
struct v3dv_meta_framebuffer framebuffer;
v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT,
diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
index 5d32d414ed8..ad22add155d 100644
--- a/src/broadcom/vulkan/v3dvx_pipeline.c
+++ b/src/broadcom/vulkan/v3dvx_pipeline.c
@@ -227,6 +227,45 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
ds_info ? ds_info->stencilTestEnable && has_ds_attachment: false;
pipeline->z_updates_enable = config.z_updates_enable;
+
+#if V3D_VERSION >= 71
+ /* From the Vulkan spec:
+ *
+ * "depthClampEnable controls whether to clamp the fragments depth
+ * values as described in Depth Test. If the pipeline is not created
+ * with VkPipelineRasterizationDepthClipStateCreateInfoEXT present
+ * then enabling depth clamp will also disable clipping primitives to
+    *     the z planes of the frustum as described in Primitive Clipping.
+ * Otherwise depth clipping is controlled by the state set in
+ * VkPipelineRasterizationDepthClipStateCreateInfoEXT."
+ *
+ * Note: neither depth clamping nor VK_EXT_depth_clip_enable are actually
+ * supported in the driver yet, so in practice we are always enabling Z
+ * clipping for now.
+ */
+ bool z_clamp_enable = rs_info && rs_info->depthClampEnable;
+ bool z_clip_enable = false;
+ const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info =
+ ds_info ? vk_find_struct_const(ds_info->pNext,
+ PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT) :
+ NULL;
+ if (clip_info)
+ z_clip_enable = clip_info->depthClipEnable;
+ else if (!z_clamp_enable)
+ z_clip_enable = true;
+
+ if (z_clip_enable) {
+ config.z_clipping_mode = pipeline->negative_one_to_one ?
+ V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_ZERO_TO_ONE;
+ } else {
+ config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE;
+ }
+
+ config.z_clamp_mode = z_clamp_enable;
+
+ config.depth_bounds_test_enable =
+ ds_info && ds_info->depthBoundsTestEnable && has_ds_attachment;
+#endif
};
}
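+
+/* A condensed view of the clip-enable decision above (restating the logic,
+ * not new behaviour):
+ *
+ *    clip_info present         -> z_clip_enable = clip_info->depthClipEnable
+ *    no clip_info, no z clamp  -> z_clip_enable = true
+ *    no clip_info, z clamp on  -> z_clip_enable = false
+ */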
@@ -360,7 +399,7 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline,
static void
pack_shader_state_record(struct v3dv_pipeline *pipeline)
{
- assert(sizeof(pipeline->shader_state_record) ==
+ assert(sizeof(pipeline->shader_state_record) >=
cl_packet_length(GL_SHADER_STATE_RECORD));
struct v3d_fs_prog_data *prog_data_fs =
@@ -435,15 +474,16 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
shader.number_of_varyings_in_fragment_shader =
prog_data_fs->num_inputs;
- shader.coordinate_shader_propagate_nans = true;
- shader.vertex_shader_propagate_nans = true;
- shader.fragment_shader_propagate_nans = true;
-
/* Note: see previous note about addresses */
/* shader.coordinate_shader_code_address */
/* shader.vertex_shader_code_address */
/* shader.fragment_shader_code_address */
+#if V3D_VERSION == 42
+ shader.coordinate_shader_propagate_nans = true;
+ shader.vertex_shader_propagate_nans = true;
+ shader.fragment_shader_propagate_nans = true;
+
/* FIXME: Use combined input/output size flag in the common case (also
* on v3d, see v3dx_draw).
*/
@@ -451,13 +491,25 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
prog_data_vs_bin->separate_segments;
shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
prog_data_vs->separate_segments;
-
shader.coordinate_shader_input_vpm_segment_size =
prog_data_vs_bin->separate_segments ?
prog_data_vs_bin->vpm_input_size : 1;
shader.vertex_shader_input_vpm_segment_size =
prog_data_vs->separate_segments ?
prog_data_vs->vpm_input_size : 1;
+#endif
+
+   /* On V3D 7.1 there isn't a specific flag to select whether we are using
+    * shared or separate segments. We just set vpm_input_size to 0 and the
+    * output size to the maximum needed, which should already be properly
+    * set on prog_data_vs_bin.
+    */
+#if V3D_VERSION == 71
+ shader.coordinate_shader_input_vpm_segment_size =
+ prog_data_vs_bin->vpm_input_size;
+ shader.vertex_shader_input_vpm_segment_size =
+ prog_data_vs->vpm_input_size;
+#endif
shader.coordinate_shader_output_vpm_segment_size =
prog_data_vs_bin->vpm_output_size;
@@ -659,3 +711,76 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
}
}
}
+
+#if V3D_VERSION == 42
+static bool
+pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
+{
+ for (uint8_t i = 0; i < pipeline->va_count; i++) {
+ if (vk_format_is_int(pipeline->va[i].vk_format))
+ return true;
+ }
+ return false;
+}
+#endif
+
+bool
+v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline)
+{
+#if V3D_VERSION == 42
+ return pipeline_has_integer_vertex_attrib(pipeline);
+#endif
+
+ return false;
+}
+
+/* @pipeline can be NULL, in which case we assume the most common case. For
+ * example, for v42 we then assume that all the attributes have a float
+ * format (we only create an all-float BO once and we reuse it with all
+ * float pipelines); otherwise we look at the actual type of each attribute
+ * used with the specific pipeline passed in.
+ */
+struct v3dv_bo *
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
+ struct v3dv_pipeline *pipeline)
+{
+#if V3D_VERSION >= 71
+ return NULL;
+#endif
+
+ uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
+ struct v3dv_bo *bo;
+
+ bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
+
+ if (!bo) {
+ fprintf(stderr, "failed to allocate memory for the default "
+ "attribute values\n");
+ return NULL;
+ }
+
+ bool ok = v3dv_bo_map(device, bo, size);
+ if (!ok) {
+ fprintf(stderr, "failed to map default attribute values buffer\n");
+ return NULL;
+ }
+
+ uint32_t *attrs = bo->map;
+ uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
+ for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
+ attrs[i * 4 + 0] = 0;
+ attrs[i * 4 + 1] = 0;
+ attrs[i * 4 + 2] = 0;
+ VkFormat attr_format =
+ pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
+ if (i < va_count && vk_format_is_int(attr_format)) {
+ attrs[i * 4 + 3] = 1;
+ } else {
+ attrs[i * 4 + 3] = fui(1.0);
+ }
+ }
+
+ v3dv_bo_unmap(device, bo);
+
+ return bo;
+}
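+
+/* Layout sketch of the BO created above (v42 only): MAX_VERTEX_ATTRIBS vec4
+ * slots, each defaulting to (0, 0, 0, 1), where the w component is the
+ * integer 1 for integer attributes and the bit pattern of 1.0f (fui(1.0))
+ * otherwise.
+ */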
diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
index ad8ddfa5731..0f5887eab93 100644
--- a/src/broadcom/vulkan/v3dvx_private.h
+++ b/src/broadcom/vulkan/v3dvx_private.h
@@ -54,6 +54,9 @@ v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer);
void
v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer);
+void
+v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer);
+
void
v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer);
@@ -125,17 +128,11 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color,
uint32_t internal_size,
uint32_t *hw_color);
-void
-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
- int rt,
- uint32_t *rt_bpp,
- uint32_t *rt_type,
- uint32_t *rt_clamp);
-
/* Used at v3dv_device */
void
-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
+v3dX(pack_sampler_state)(const struct v3dv_device *device,
+ struct v3dv_sampler *sampler,
const VkSamplerCreateInfo *pCreateInfo,
const VkSamplerCustomBorderColorCreateInfoEXT *bc_info);
@@ -143,7 +140,9 @@ void
v3dX(framebuffer_compute_internal_bpp_msaa)(const struct v3dv_framebuffer *framebuffer,
const struct v3dv_cmd_buffer_attachment_state *attachments,
const struct v3dv_subpass *subpass,
- uint8_t *max_bpp, bool *msaa);
+ uint8_t *max_internal_bpp,
+ uint8_t *total_color_bpp,
+ bool *msaa);
#ifdef DEBUG
void
@@ -313,10 +312,24 @@ void
v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
const VkPipelineVertexInputStateCreateInfo *vi_info,
const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info);
+
+bool
+v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline);
+
+struct v3dv_bo *
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
+ struct v3dv_pipeline *pipeline);
+
/* Used at v3dv_queue */
void
v3dX(job_emit_noop)(struct v3dv_job *job);
+/* Used at v3dv_query */
+VkResult
+v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount,
+ VkPerformanceCounterKHR *pCounters,
+ VkPerformanceCounterDescriptionKHR *pCounterDescriptions);
+
/* Used at v3dv_descriptor_set, and other descriptor set utils */
uint32_t v3dX(descriptor_bo_size)(VkDescriptorType type);
@@ -325,3 +338,21 @@ uint32_t v3dX(max_descriptor_bo_size)(void);
uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane);
uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane);
+
+/* General utils */
+
+uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ VkFormat vk_format);
+
+#define V3D42_CLIPPER_XY_GRANULARITY 256.0f
+#define V3D71_CLIPPER_XY_GRANULARITY 64.0f
+
+void
+v3dX(viewport_compute_xform)(const VkViewport *viewport,
+ float scale[3],
+ float translate[3]);
diff --git a/src/broadcom/vulkan/v3dvx_query.c b/src/broadcom/vulkan/v3dvx_query.c
new file mode 100644
index 00000000000..e59a1e84ff6
--- /dev/null
+++ b/src/broadcom/vulkan/v3dvx_query.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright © 2023 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3dv_private.h"
+
+#include "common/v3d_performance_counters.h"
+
+VkResult
+v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount,
+ VkPerformanceCounterKHR *pCounters,
+ VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
+{
+ uint32_t desc_count = *pCounterCount;
+
+ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
+ out, pCounters, pCounterCount);
+ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
+ out_desc, pCounterDescriptions, &desc_count);
+
+ for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) {
+ vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
+ counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
+ counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
+ counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
+
+ unsigned char sha1_result[20];
+ _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME],
+ strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]),
+ sha1_result);
+
+ memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
+ }
+
+ vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
+ &out_desc, desc) {
+ desc->flags = 0;
+ snprintf(desc->name, sizeof(desc->name), "%s",
+ v3d_performance_counters[i][V3D_PERFCNT_NAME]);
+ snprintf(desc->category, sizeof(desc->category), "%s",
+ v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]);
+ snprintf(desc->description, sizeof(desc->description), "%s",
+ v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]);
+ }
+ }
+
+ return vk_outarray_status(&out);
+}
diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c
index efe63de425c..6eed2de9d54 100644
--- a/src/broadcom/vulkan/v3dvx_queue.c
+++ b/src/broadcom/vulkan/v3dvx_queue.c
@@ -29,7 +29,8 @@
void
v3dX(job_emit_noop)(struct v3dv_job *job)
{
- v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, V3D_INTERNAL_BPP_32, false);
+ v3dv_job_start_frame(job, 1, 1, 1, true, true, 1,
+ V3D_INTERNAL_BPP_32, 4, false);
v3dX(job_emit_binning_flush)(job);
struct v3dv_cl *rcl = &job->rcl;
@@ -42,14 +43,29 @@ v3dX(job_emit_noop)(struct v3dv_job *job)
config.image_height_pixels = 1;
config.number_of_render_targets = 1;
config.multisample_mode_4x = false;
+#if V3D_VERSION == 42
config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = 3; /* Tile size 64 */
+ config.log2_tile_height = 3; /* Tile size 64 */
+#endif
}
+#if V3D_VERSION == 42
cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32;
rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8;
rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.internal_bpp = V3D_INTERNAL_BPP_32;
+ rt.internal_type_and_clamping = V3D_RENDER_TARGET_TYPE_CLAMP_8;
+ rt.stride = 1; /* Unused RT */
+ }
+#endif
cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
clear.z_clear_value = 1.0f;
diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py
index e6383b67737..46395d79a89 100644
--- a/src/compiler/nir/nir_constant_expressions.py
+++ b/src/compiler/nir/nir_constant_expressions.py
@@ -62,6 +62,8 @@ template = """\
#include "util/softfloat.h"
#include "util/bigmath.h"
#include "util/format/format_utils.h"
+#include "util/format_r11g11b10f.h"
+#include "util/u_math.h"
#include "nir_constant_expressions.h"
/**
@@ -277,6 +279,110 @@ unpack_half_1x16(uint16_t u)
return _mesa_half_to_float(u);
}
+/* Broadcom v3d specific instructions */
+/**
+ * Packs three f16 components, taken from two 2x16 sources, into r11g11b10f
+ */
+static uint32_t v11fpack_v3d(const uint32_t src0,
+ const uint32_t src1)
+{
+ float rgb[3];
+
+ rgb[0] = unpack_half_1x16((src0 & 0xffff));
+ rgb[1] = unpack_half_1x16((src0 >> 16));
+ rgb[2] = unpack_half_1x16((src1 & 0xffff));
+
+ return float3_to_r11g11b10f(rgb);
+}
+
+/**
+ * The three helpers below are basically wrappers over pack_s/unorm_1x8/1x16,
+ * except that they receive the value as raw half/float bits instead of a float.
+ */
+static uint8_t _mesa_half_to_snorm8(uint16_t val)
+{
+ float x = _mesa_half_to_float(val);
+
+ return pack_snorm_1x8(x);
+}
+
+static uint16_t _mesa_float_to_snorm16(uint32_t val)
+{
+ union fi aux;
+ aux.ui = val;
+ return pack_snorm_1x16(aux.f);
+}
+
+static uint16_t _mesa_float_to_unorm16(uint32_t val)
+{
+ union fi aux;
+ aux.ui = val;
+ return pack_unorm_1x16(aux.f);
+}
+
+/* FIXME: the implementation of vftounorm10hi/lo below is somewhat verbose.
+ * There is likely a simpler way to implement it.
+ */
+static uint32_t float_pack16_v3d(uint32_t f32)
+{
+ float f = uif(f32);
+ return _mesa_float_to_half(f);
+}
+
+static uint32_t float_unpack16_v3d(uint32_t f16)
+{
+ float f = _mesa_half_to_float(f16);
+ return fui(f);
+}
+
+static uint32_t vfpack_v3d(uint32_t a, uint32_t b)
+{
+ return float_pack16_v3d(b) << 16 | float_pack16_v3d(a);
+}
+
+static uint32_t vfsat_v3d(uint32_t a)
+{
+ return vfpack_v3d(
+ fui(SATURATE(_mesa_half_to_float(a & 0xffff))),
+ fui(SATURATE(_mesa_half_to_float(a >> 16))));
+}
+
+static uint32_t fmul_v3d(uint32_t a, uint32_t b)
+{
+ float f = uif(a);
+ float g = uif(b);
+
+ float x = f * g;
+
+ return fui(x);
+}
+
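+/* Helpers to apply a scalar f32 operation to both 16-bit halves of a 2x16
+ * value: L/H unpack the low/high half to f32 bits, V applies the operation
+ * per half and repacks the results as 2x16.
+ */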
+#define L(x) float_unpack16_v3d((x) & 0xffff)
+#define H(x) float_unpack16_v3d((x) >> 16)
+#define V(f,a,b) vfpack_v3d(f(L(a), L(b)), f(H(a), H(b)))
+
+static uint32_t vfmul_v3d(uint32_t a, uint32_t b)
+{
+ return V(fmul_v3d, a, b);
+}
+
+/* Convert 2x16-bit floating point to 2x10-bit unorm */
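+/* Note (an observation based on the f16 encoding): multiplying the saturated
+ * input by 0x03ff (the f16 subnormal 1023 * 2^-24) gives a subnormal result
+ * whose mantissa bits are round(x * 1023), so the bit pattern of each half
+ * is directly the 10-bit unorm value.
+ */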
+static uint32_t vftounorm10lo(uint32_t src0)
+{
+ return vfmul_v3d(vfsat_v3d(src0), 0x03ff03ff);
+}
+
+/*
+ * Convert 2x16-bit floating point to one 2-bit and one
+ * 10-bit unorm
+ */
+static uint32_t vftounorm10hi(uint32_t src0)
+{
+ return vfmul_v3d(vfsat_v3d(src0), 0x000303ff);
+}
+
+
/* Some typed vector structures to make things like src0.y work */
typedef int8_t int1_t;
typedef uint8_t uint1_t;
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index e4d87aa6126..63aa7cfa315 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -1393,6 +1393,50 @@ for (int i = 0; i < 32; i += 8) {
}
""")
+# v3d-specific opcodes
+
+# v3d-specific (v71) instruction that packs three f16 components from two 2x16
+# floating point sources into r11g11b10, rounding to nearest even
+binop_convert("v11fpack_v3d", tuint32, tuint32, "",
+ "v11fpack_v3d(src0, src1)")
+
+# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit integer. The
+# difference with pack_32_2x16_split is that the sources are 32 bit too: it
+# receives two 32-bit integers and packs their lower halfwords as 2x16 into a
+# 32-bit result.
+binop_horiz("vpack_v3d", 1, tuint32, 1, tuint32, 1, tuint32,
+ "(src0.x & 0xffff) | (src1.x << 16)")
+
+# v3d-specific (v71) instruction that packs parts of 2 2x16 integers into r10g10b10a2
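+# dst[9:0]   = src0[9:0]
+# dst[19:10] = src0[25:16]
+# dst[29:20] = src1[9:0]
+# dst[31:30] = src1[17:16]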
+binop_convert("v10pack_v3d", tuint32, tuint32, "",
+ "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3ff) << 30")
+
+# v3d-specific (v71) instruction that packs 2 2x16 bit integers into 4x8 bits:
+# dst[7:0] = src0[7:0]
+# dst[15:8] = src0[23:16]
+# dst[23:16] = src1[7:0]
+# dst[31:24] = src1[23:16]
+opcode("v8pack_v3d", 0, tuint32, [0, 0], [tuint32, tuint32],
+ False, "",
+ "(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | (src1 & 0x000000ff) << 16 | (src1 & 0x00ff0000) << 8")
+
+# v3d-specific (v71) instructions to convert 2x16 floating point to 2x8 bit unorm/snorm
+unop("vftounorm8_v3d", tuint32,
+ "_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 8) << 16)")
+unop("vftosnorm8_v3d", tuint32,
+ "_mesa_half_to_snorm(src0 & 0xffff, 8) | (_mesa_half_to_snorm(src0 >> 16, 8) << 16)")
+
+# v3d-specific (v71) instructions to convert 32-bit floating point to 16 bit unorm/snorm
+unop("ftounorm16_v3d", tuint32, "_mesa_float_to_unorm16(src0)")
+unop("ftosnorm16_v3d", tuint32, "_mesa_float_to_snorm16(src0)")
+
+# v3d-specific (v71) instructions to convert 2x16 bit floating points to 2x10 bit unorm
+unop("vftounorm10lo_v3d", tuint32, "vftounorm10lo(src0)")
+
+# v3d-specific (v71) instructions to convert 2x16 bit floating points to one 2-bit
+# and one 10 bit unorm
+unop("vftounorm10hi_v3d", tuint32, "vftounorm10hi(src0)")
+
# Mali-specific opcodes
unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)"))
diff --git a/src/gallium/drivers/v3d/driinfo_v3d.h b/src/gallium/drivers/v3d/driinfo_v3d.h
index 147ad0b49bd..8f989e8aa57 100644
--- a/src/gallium/drivers/v3d/driinfo_v3d.h
+++ b/src/gallium/drivers/v3d/driinfo_v3d.h
@@ -2,4 +2,6 @@
DRI_CONF_SECTION_MISCELLANEOUS
DRI_CONF_V3D_NONMSAA_TEXTURE_SIZE_LIMIT(false)
+ DRI_CONF_V3D_MAINTAIN_IGNORABLE_SCANOUT(false)
+ DRI_CONF_V3D_IS_XSERVER_PROCESS(false)
DRI_CONF_SECTION_END
diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build
index dfa1e88097b..e47682db1aa 100644
--- a/src/gallium/drivers/v3d/meson.build
+++ b/src/gallium/drivers/v3d/meson.build
@@ -34,7 +34,6 @@ files_libv3d = files(
'v3d_query.c',
'v3d_query.h',
'v3d_query_pipe.c',
- 'v3d_query_perfcnt.c',
'v3d_resource.c',
'v3d_resource.h',
'v3d_screen.c',
@@ -47,8 +46,10 @@ files_per_version = files(
'v3dx_emit.c',
'v3dx_format_table.c',
'v3dx_job.c',
+ 'v3dx_query_perfcnt.c',
'v3dx_rcl.c',
'v3dx_state.c',
+ 'v3dx_tfu.c',
)
v3d_args = ['-DV3D_BUILD_NEON']
@@ -58,7 +59,17 @@ if dep_v3dv3.found()
v3d_args += '-DUSE_V3D_SIMULATOR'
endif
-v3d_versions = ['33', '42']
+v3d_versions = ['33', '42', '71']
+
+v3d_deps = [dep_v3dv3, dep_libdrm, dep_valgrind, idep_nir_headers]
+
+if with_platform_x11
+ v3d_deps += dep_xcb
+endif
+
+if with_platform_wayland
+ v3d_deps += dep_wayland_client
+endif
per_version_libs = []
foreach ver : v3d_versions
@@ -71,7 +82,7 @@ foreach ver : v3d_versions
],
c_args : [v3d_args, '-DV3D_VERSION=' + ver],
gnu_symbol_visibility : 'hidden',
- dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind, idep_nir_headers],
+ dependencies : v3d_deps,
)
endforeach
@@ -94,10 +105,7 @@ libv3d = static_library(
c_args : [v3d_args],
cpp_args : [v3d_args],
gnu_symbol_visibility : 'hidden',
- dependencies : [
- dep_v3dv3, dep_libdrm, dep_valgrind,
- idep_nir_headers, idep_mesautil,
- ],
+ dependencies : v3d_deps + idep_mesautil,
link_with: [per_version_libs],
)
diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c
index b7dc56a044e..ee3c14b154c 100644
--- a/src/gallium/drivers/v3d/v3d_blit.c
+++ b/src/gallium/drivers/v3d/v3d_blit.c
@@ -210,140 +210,6 @@ v3d_stencil_blit(struct pipe_context *ctx, struct pipe_blit_info *info)
info->mask &= ~PIPE_MASK_S;
}
-static bool
-v3d_tfu(struct pipe_context *pctx,
- struct pipe_resource *pdst,
- struct pipe_resource *psrc,
- unsigned int src_level,
- unsigned int base_level,
- unsigned int last_level,
- unsigned int src_layer,
- unsigned int dst_layer,
- bool for_mipmap)
-{
- struct v3d_context *v3d = v3d_context(pctx);
- struct v3d_screen *screen = v3d->screen;
- struct v3d_resource *src = v3d_resource(psrc);
- struct v3d_resource *dst = v3d_resource(pdst);
- struct v3d_resource_slice *src_base_slice = &src->slices[src_level];
- struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level];
- int msaa_scale = pdst->nr_samples > 1 ? 2 : 1;
- int width = u_minify(pdst->width0, base_level) * msaa_scale;
- int height = u_minify(pdst->height0, base_level) * msaa_scale;
- enum pipe_format pformat;
-
- if (psrc->format != pdst->format)
- return false;
- if (psrc->nr_samples != pdst->nr_samples)
- return false;
-
- /* Can't write to raster. */
- if (dst_base_slice->tiling == V3D_TILING_RASTER)
- return false;
-
- /* When using TFU for blit, we are doing exact copies (both input and
- * output format must be the same, no scaling, etc), so there is no
- * pixel format conversions. Thus we can rewrite the format to use one
- * that is TFU compatible based on its texel size.
- */
- if (for_mipmap) {
- pformat = pdst->format;
- } else {
- switch (dst->cpp) {
- case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT; break;
- case 8: pformat = PIPE_FORMAT_R16G16B16A16_FLOAT; break;
- case 4: pformat = PIPE_FORMAT_R32_FLOAT; break;
- case 2: pformat = PIPE_FORMAT_R16_FLOAT; break;
- case 1: pformat = PIPE_FORMAT_R8_UNORM; break;
- default: unreachable("unsupported format bit-size"); break;
- };
- }
-
- uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat);
- struct v3d_device_info *devinfo = &screen->devinfo;
-
- if (!v3d_X(devinfo, tfu_supports_tex_format)(tex_format, for_mipmap)) {
- assert(for_mipmap);
- return false;
- }
-
- v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false);
- v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false);
-
- struct drm_v3d_submit_tfu tfu = {
- .ios = (height << 16) | width,
- .bo_handles = {
- dst->bo->handle,
- src != dst ? src->bo->handle : 0
- },
- .in_sync = v3d->out_sync,
- .out_sync = v3d->out_sync,
- };
- uint32_t src_offset = (src->bo->offset +
- v3d_layer_offset(psrc, src_level, src_layer));
- tfu.iia |= src_offset;
- if (src_base_slice->tiling == V3D_TILING_RASTER) {
- tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER <<
- V3D33_TFU_ICFG_FORMAT_SHIFT);
- } else {
- tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE +
- (src_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
- V3D33_TFU_ICFG_FORMAT_SHIFT);
- }
-
- uint32_t dst_offset = (dst->bo->offset +
- v3d_layer_offset(pdst, base_level, dst_layer));
- tfu.ioa |= dst_offset;
- if (last_level != base_level)
- tfu.ioa |= V3D33_TFU_IOA_DIMTW;
- tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE +
- (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
- V3D33_TFU_IOA_FORMAT_SHIFT);
-
- tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT;
- tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT;
-
- switch (src_base_slice->tiling) {
- case V3D_TILING_UIF_NO_XOR:
- case V3D_TILING_UIF_XOR:
- tfu.iis |= (src_base_slice->padded_height /
- (2 * v3d_utile_height(src->cpp)));
- break;
- case V3D_TILING_RASTER:
- tfu.iis |= src_base_slice->stride / src->cpp;
- break;
- case V3D_TILING_LINEARTILE:
- case V3D_TILING_UBLINEAR_1_COLUMN:
- case V3D_TILING_UBLINEAR_2_COLUMN:
- break;
- }
-
- /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
- * OPAD field for the destination (how many extra UIF blocks beyond
- * those necessary to cover the height). When filling mipmaps, the
- * miplevel 1+ tiling state is inferred.
- */
- if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR ||
- dst_base_slice->tiling == V3D_TILING_UIF_XOR) {
- int uif_block_h = 2 * v3d_utile_height(dst->cpp);
- int implicit_padded_height = align(height, uif_block_h);
-
- tfu.icfg |= (((dst_base_slice->padded_height -
- implicit_padded_height) / uif_block_h) <<
- V3D33_TFU_ICFG_OPAD_SHIFT);
- }
-
- int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu);
- if (ret != 0) {
- fprintf(stderr, "Failed to submit TFU job: %d\n", ret);
- return false;
- }
-
- dst->writes++;
-
- return true;
-}
-
bool
v3d_generate_mipmap(struct pipe_context *pctx,
struct pipe_resource *prsc,
@@ -362,12 +228,16 @@ v3d_generate_mipmap(struct pipe_context *pctx,
if (first_layer != last_layer)
return false;
- return v3d_tfu(pctx,
- prsc, prsc,
- base_level,
- base_level, last_level,
- first_layer, first_layer,
- true);
+ struct v3d_context *v3d = v3d_context(pctx);
+ struct v3d_screen *screen = v3d->screen;
+ struct v3d_device_info *devinfo = &screen->devinfo;
+
+ return v3d_X(devinfo, tfu)(pctx,
+ prsc, prsc,
+ base_level,
+ base_level, last_level,
+ first_layer, first_layer,
+ true);
}
static void
@@ -396,11 +266,15 @@ v3d_tfu_blit(struct pipe_context *pctx, struct pipe_blit_info *info)
if (info->dst.format != info->src.format)
return;
- if (v3d_tfu(pctx, info->dst.resource, info->src.resource,
- info->src.level,
- info->dst.level, info->dst.level,
- info->src.box.z, info->dst.box.z,
- false)) {
+ struct v3d_context *v3d = v3d_context(pctx);
+ struct v3d_screen *screen = v3d->screen;
+ struct v3d_device_info *devinfo = &screen->devinfo;
+
+ if (v3d_X(devinfo, tfu)(pctx, info->dst.resource, info->src.resource,
+ info->src.level,
+ info->dst.level, info->dst.level,
+ info->src.box.z, info->dst.box.z,
+ false)) {
info->mask &= ~PIPE_MASK_RGBA;
}
}
@@ -495,7 +369,7 @@ v3d_tlb_blit(struct pipe_context *pctx, struct pipe_blit_info *info)
bool double_buffer = V3D_DBG(DOUBLE_BUFFER) && !msaa;
uint32_t tile_width, tile_height, max_bpp;
- v3d_get_tile_buffer_size(msaa, double_buffer,
+ v3d_get_tile_buffer_size(devinfo, msaa, double_buffer,
is_color_blit ? 1 : 0, surfaces, src_surf,
&tile_width, &tile_height, &max_bpp);
diff --git a/src/gallium/drivers/v3d/v3d_context.c b/src/gallium/drivers/v3d/v3d_context.c
index f12e8c92139..1dc4bd017fe 100644
--- a/src/gallium/drivers/v3d/v3d_context.c
+++ b/src/gallium/drivers/v3d/v3d_context.c
@@ -220,7 +220,8 @@ v3d_flag_dirty_sampler_state(struct v3d_context *v3d,
}
void
-v3d_get_tile_buffer_size(bool is_msaa,
+v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo,
+ bool is_msaa,
bool double_buffer,
uint32_t nr_cbufs,
struct pipe_surface **cbufs,
@@ -232,11 +233,13 @@ v3d_get_tile_buffer_size(bool is_msaa,
assert(!is_msaa || !double_buffer);
uint32_t max_cbuf_idx = 0;
+ uint32_t total_bpp = 0;
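+        /* total_bpp accumulates the bytes per pixel of every color buffer;
+         * v3d_choose_tile_size() takes it into account when picking tile
+         * dimensions on newer HW.
+         */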
*max_bpp = 0;
for (int i = 0; i < nr_cbufs; i++) {
if (cbufs[i]) {
struct v3d_surface *surf = v3d_surface(cbufs[i]);
*max_bpp = MAX2(*max_bpp, surf->internal_bpp);
+ total_bpp += 4 * v3d_internal_bpp_words(surf->internal_bpp);
max_cbuf_idx = MAX2(i, max_cbuf_idx);
}
}
@@ -245,9 +248,11 @@ v3d_get_tile_buffer_size(bool is_msaa,
struct v3d_surface *bsurf = v3d_surface(bbuf);
assert(bbuf->texture->nr_samples <= 1 || is_msaa);
*max_bpp = MAX2(*max_bpp, bsurf->internal_bpp);
+ total_bpp += 4 * v3d_internal_bpp_words(bsurf->internal_bpp);
}
- v3d_choose_tile_size(max_cbuf_idx + 1, *max_bpp,
+ v3d_choose_tile_size(devinfo, max_cbuf_idx + 1,
+ *max_bpp, total_bpp,
is_msaa, double_buffer,
tile_width, tile_height);
}
diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h
index 97850b0363e..eb184b4b203 100644
--- a/src/gallium/drivers/v3d/v3d_context.h
+++ b/src/gallium/drivers/v3d/v3d_context.h
@@ -265,6 +265,7 @@ struct v3d_vertex_stateobj {
unsigned num_elements;
uint8_t attrs[16 * (V3D_MAX_VS_INPUTS / 4)];
+        /* defaults can be NULL for some hw generations */
struct pipe_resource *defaults;
uint32_t defaults_offset;
};
@@ -794,7 +795,8 @@ void v3d_ensure_prim_counts_allocated(struct v3d_context *ctx);
void v3d_flag_dirty_sampler_state(struct v3d_context *v3d,
enum pipe_shader_type shader);
-void v3d_get_tile_buffer_size(bool is_msaa,
+void v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo,
+ bool is_msaa,
bool double_buffer,
uint32_t nr_cbufs,
struct pipe_surface **cbufs,
@@ -818,16 +820,52 @@ void v3d_disk_cache_store(struct v3d_context *v3d,
/* Helper to call hw ver specific functions */
#define v3d_X(devinfo, thing) ({ \
- __typeof(&v3d42_##thing) v3d_X_thing; \
- if ((devinfo)->ver >= 42) \
- v3d_X_thing = &v3d42_##thing; \
- else if ((devinfo)->ver >= 33) \
+ __typeof(&v3d33_##thing) v3d_X_thing; \
+ switch (devinfo->ver) { \
+ case 33: \
+ case 40: \
v3d_X_thing = &v3d33_##thing; \
- else \
+ break; \
+ case 42: \
+ v3d_X_thing = &v3d42_##thing; \
+ break; \
+ case 71: \
+ v3d_X_thing = &v3d71_##thing; \
+ break; \
+ default: \
unreachable("Unsupported hardware generation"); \
+ } \
v3d_X_thing; \
})
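+
+/* For example (illustrative), v3d_X(devinfo, emit_rcl)(job) resolves to
+ * v3d42_emit_rcl(job) on a V3D 4.2 device.
+ */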
+/* FIXME: the same definitions exist for Vulkan and OpenGL. Find a common
+ * place for them; maybe define them in the v3d_packet files?
+ */
+#define V3D33_CLIPPER_XY_GRANULARITY 256.0f
+#define V3D42_CLIPPER_XY_GRANULARITY 256.0f
+#define V3D71_CLIPPER_XY_GRANULARITY 64.0f
+
+/* Helper to get hw-specific macro values */
+#define V3DV_X(devinfo, thing) ({ \
+ __typeof(V3D33_##thing) V3D_X_THING; \
+ switch (devinfo->ver) { \
+ case 33: \
+ case 40: \
+ V3D_X_THING = V3D33_##thing; \
+ break; \
+ case 41: \
+ case 42: \
+ V3D_X_THING = V3D42_##thing; \
+ break; \
+ case 71: \
+ V3D_X_THING = V3D71_##thing; \
+ break; \
+ default: \
+ unreachable("Unsupported hardware generation"); \
+ } \
+ V3D_X_THING; \
+})
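+
+/* For example (illustrative), V3DV_X(devinfo, CLIPPER_XY_GRANULARITY)
+ * evaluates to 64.0f on a 7.1 device and to 256.0f on 3.3/4.x devices.
+ */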
+
#ifdef v3dX
# include "v3dx_context.h"
#else
@@ -838,6 +876,10 @@ void v3d_disk_cache_store(struct v3d_context *v3d,
# define v3dX(x) v3d42_##x
# include "v3dx_context.h"
# undef v3dX
+
+# define v3dX(x) v3d71_##x
+# include "v3dx_context.h"
+# undef v3dX
#endif
#endif /* V3D_CONTEXT_H */
diff --git a/src/gallium/drivers/v3d/v3d_job.c b/src/gallium/drivers/v3d/v3d_job.c
index b022ed45073..577890a06c3 100644
--- a/src/gallium/drivers/v3d/v3d_job.c
+++ b/src/gallium/drivers/v3d/v3d_job.c
@@ -383,9 +383,11 @@ v3d_get_job_for_fbo(struct v3d_context *v3d)
job->double_buffer = false;
}
- v3d_get_tile_buffer_size(job->msaa, job->double_buffer,
+ v3d_get_tile_buffer_size(&v3d->screen->devinfo,
+ job->msaa, job->double_buffer,
job->nr_cbufs, job->cbufs, job->bbuf,
- &job->tile_width, &job->tile_height,
+ &job->tile_width,
+ &job->tile_height,
&job->internal_bpp);
/* The dirty flags are tracking what's been updated while v3d->job has
diff --git a/src/gallium/drivers/v3d/v3d_query.c b/src/gallium/drivers/v3d/v3d_query.c
index db98c89625f..83f82e44a3d 100644
--- a/src/gallium/drivers/v3d/v3d_query.c
+++ b/src/gallium/drivers/v3d/v3d_query.c
@@ -28,8 +28,11 @@ v3d_get_driver_query_group_info(struct pipe_screen *pscreen, unsigned index,
struct pipe_driver_query_group_info *info)
{
struct v3d_screen *screen = v3d_screen(pscreen);
+ struct v3d_device_info *devinfo = &screen->devinfo;
- return v3d_get_driver_query_group_info_perfcnt(screen, index, info);
+ return v3d_X(devinfo, get_driver_query_group_info_perfcnt)(screen,
+ index,
+ info);
}
int
@@ -37,8 +40,11 @@ v3d_get_driver_query_info(struct pipe_screen *pscreen, unsigned index,
struct pipe_driver_query_info *info)
{
struct v3d_screen *screen = v3d_screen(pscreen);
+ struct v3d_device_info *devinfo = &screen->devinfo;
- return v3d_get_driver_query_info_perfcnt(screen, index, info);
+ return v3d_X(devinfo, get_driver_query_info_perfcnt)(screen,
+ index,
+ info);
}
static struct pipe_query *
@@ -53,9 +59,13 @@ static struct pipe_query *
v3d_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
unsigned *query_types)
{
- return v3d_create_batch_query_perfcnt(v3d_context(pctx),
- num_queries,
- query_types);
+ struct v3d_context *v3d = v3d_context(pctx);
+ struct v3d_screen *screen = v3d->screen;
+ struct v3d_device_info *devinfo = &screen->devinfo;
+
+ return v3d_X(devinfo, create_batch_query_perfcnt)(v3d_context(pctx),
+ num_queries,
+ query_types);
}
static void
diff --git a/src/gallium/drivers/v3d/v3d_query.h b/src/gallium/drivers/v3d/v3d_query.h
index 3e1426b8d86..605ed1a12f9 100644
--- a/src/gallium/drivers/v3d/v3d_query.h
+++ b/src/gallium/drivers/v3d/v3d_query.h
@@ -42,11 +42,5 @@ struct v3d_query
};
struct pipe_query *v3d_create_query_pipe(struct v3d_context *v3d, unsigned query_type, unsigned index);
-struct pipe_query *v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries,
- unsigned *query_types);
-int v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index,
- struct pipe_driver_query_group_info *info);
-int v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index,
- struct pipe_driver_query_info *info);
#endif /* V3D_QUERY_H */
diff --git a/src/gallium/drivers/v3d/v3d_resource.c b/src/gallium/drivers/v3d/v3d_resource.c
index a0a210ccad5..8e31acb0ff0 100644
--- a/src/gallium/drivers/v3d/v3d_resource.c
+++ b/src/gallium/drivers/v3d/v3d_resource.c
@@ -439,7 +439,7 @@ v3d_resource_get_handle(struct pipe_screen *pscreen,
case WINSYS_HANDLE_TYPE_SHARED:
return v3d_bo_flink(bo, &whandle->handle);
case WINSYS_HANDLE_TYPE_KMS:
- if (screen->ro) {
+ if (screen->ro && rsc->scanout) {
if (renderonly_get_handle(rsc->scanout, whandle)) {
whandle->stride = rsc->slices[0].stride;
return true;
@@ -785,6 +785,27 @@ v3d_resource_setup(struct pipe_screen *pscreen,
return rsc;
}
+static bool
+v3d_resource_should_scanout(struct pipe_screen *pscreen,
+ const struct pipe_resource *tmpl,
+ const uint64_t *modifiers,
+ int count)
+{
+ struct v3d_screen *screen = v3d_screen(pscreen);
+
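+        /* Honor PIPE_BIND_SCANOUT unless we are in an X session, the caller
+         * supplied a UIF modifier and the user asked to ignore scanout
+         * usages; in that case we prefer the tiled (UIF) layout.
+         */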
+ if (tmpl->bind & PIPE_BIND_SCANOUT) {
+ if (screen->maintain_ignorable_scanout)
+ return true;
+ if (screen->has_x_session && screen->ignore_scanout_usages) {
+ if (drm_find_modifier(DRM_FORMAT_MOD_BROADCOM_UIF,
+ modifiers, count))
+ return false;
+ }
+ return true;
+ }
+ return false;
+}
+
static struct pipe_resource *
v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
const struct pipe_resource *tmpl,
@@ -798,6 +819,8 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
struct pipe_resource *prsc = &rsc->base;
/* Use a tiled layout if we can, for better 3D performance. */
bool should_tile = true;
+ bool should_scanout = v3d_resource_should_scanout(pscreen, tmpl,
+ modifiers, count);
assert(tmpl->target != PIPE_BUFFER ||
(tmpl->format == PIPE_FORMAT_NONE ||
@@ -827,7 +850,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
/* If using the old-school SCANOUT flag, we don't know what the screen
* might support other than linear. Just force linear.
*/
- if (tmpl->bind & PIPE_BIND_SCANOUT)
+ if ((tmpl->bind & PIPE_BIND_SCANOUT) && should_scanout)
should_tile = false;
/* No user-specified modifier; determine our own. */
@@ -849,7 +872,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
v3d_setup_slices(rsc, 0, tmpl->bind & PIPE_BIND_SHARED);
- if (screen->ro && (tmpl->bind & PIPE_BIND_SCANOUT)) {
+ if (screen->ro && should_scanout) {
struct winsys_handle handle;
struct pipe_resource scanout_tmpl = {
.target = prsc->target,
@@ -979,7 +1002,7 @@ v3d_resource_from_handle(struct pipe_screen *pscreen,
}
}
- if (screen->ro) {
+ if (screen->ro && !rsc->tiled) {
/* Make sure that renderonly has a handle to our buffer in the
* display's fd, so that a later renderonly_get_handle()
* returns correct handles or GEM names.
@@ -1025,7 +1048,9 @@ v3d_update_shadow_texture(struct pipe_context *pctx,
assert(view->texture != pview->texture);
- if (shadow->writes == orig->writes && orig->bo->private)
+ if (shadow->writes == orig->writes &&
+ orig->base.sync_status == 0 &&
+ (orig->bo->private || orig->base.sync_condition))
return;
perf_debug("Updating %dx%d@%d shadow for linear texture\n",
@@ -1068,6 +1093,7 @@ v3d_update_shadow_texture(struct pipe_context *pctx,
}
shadow->writes = orig->writes;
+ orig->base.sync_status = 0;
}
static struct pipe_surface *
diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c
index bce1eeafcd9..4d2478b130d 100644
--- a/src/gallium/drivers/v3d/v3d_screen.c
+++ b/src/gallium/drivers/v3d/v3d_screen.c
@@ -47,6 +47,42 @@
#include "compiler/v3d_compiler.h"
#include "drm-uapi/drm_fourcc.h"
+#ifdef HAVE_WAYLAND_PLATFORM
+#include <wayland-client.h>
+#endif
+
+#ifdef HAVE_X11_PLATFORM
+#include <xcb/xcb.h>
+#endif
+
+static bool
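+/* Heuristic: if a Wayland display is reachable we assume the session is not
+ * an X session (this also covers XWayland); otherwise we probe for an X
+ * server via XCB.
+ */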
+check_x_session()
+{
+ bool xcb_connection = false;
+
+#ifdef HAVE_WAYLAND_PLATFORM
+ struct wl_display *display;
+
+ display = wl_display_connect(NULL);
+
+ if (display) {
+ wl_display_disconnect(display);
+ return xcb_connection;
+ }
+#endif
+
+#ifdef HAVE_X11_PLATFORM
+ xcb_connection_t *conn;
+
+ conn = xcb_connect(NULL, NULL);
+
+ if (!xcb_connection_has_error(conn))
+ xcb_connection = true;
+ xcb_disconnect(conn);
+#endif
+ return xcb_connection;
+}
+
static const char *
v3d_screen_get_name(struct pipe_screen *pscreen)
{
@@ -255,9 +291,8 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
return V3D_MAX_ARRAY_LAYERS;
- /* Render targets. */
case PIPE_CAP_MAX_RENDER_TARGETS:
- return 4;
+ return V3D_MAX_RENDER_TARGETS(screen->devinfo.ver);
case PIPE_CAP_VENDOR_ID:
return 0x14E4;
@@ -919,6 +954,12 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config,
if (!v3d_get_device_info(screen->fd, &screen->devinfo, &v3d_ioctl))
goto fail;
+ if (screen->devinfo.ver >= 71) {
+ fprintf(stderr, "WARNING: v3d support for hw version %i is neither "
+ "a complete nor a conformant OpenGL implementation. Testing "
+ "use only.\n", screen->devinfo.ver);
+ }
+
driParseConfigFiles(config->options, config->options_info, 0, "v3d",
NULL, NULL, NULL, 0, NULL, 0);
@@ -937,6 +978,29 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config,
v3d_has_feature(screen, DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH);
screen->has_perfmon = v3d_has_feature(screen, DRM_V3D_PARAM_SUPPORTS_PERFMON);
+ screen->ignore_scanout_usages = getenv("V3D_IGNORE_SCANOUT_USAGES");
+
+ const char *is_xserver_process =
+ "v3d_is_xserver_process";
+ screen->is_xserver_process =
+ driCheckOption(config->options,
+ is_xserver_process,
+ DRI_BOOL) &&
+ driQueryOptionb(config->options,
+ is_xserver_process);
+
+ const char *maintain_ignorable_scanout_name =
+ "v3d_maintain_ignorable_scanout";
+ screen->maintain_ignorable_scanout =
+ driCheckOption(config->options,
+ maintain_ignorable_scanout_name,
+ DRI_BOOL) &&
+ driQueryOptionb(config->options,
+ maintain_ignorable_scanout_name);
+
+ screen->has_x_session = !screen->is_xserver_process &&
+ check_x_session();
+
v3d_fence_init(screen);
v3d_process_debug_variable();
diff --git a/src/gallium/drivers/v3d/v3d_screen.h b/src/gallium/drivers/v3d/v3d_screen.h
index 1da9b83c965..c0f22707075 100644
--- a/src/gallium/drivers/v3d/v3d_screen.h
+++ b/src/gallium/drivers/v3d/v3d_screen.h
@@ -83,6 +83,12 @@ struct v3d_screen {
bool has_cache_flush;
bool has_perfmon;
bool nonmsaa_texture_size_limit;
+ bool ignore_scanout_usages;
+ bool is_xserver_process;
+ bool maintain_ignorable_scanout;
+
+ /* Are we running in an X session? */
+ bool has_x_session;
struct v3d_simulator_file *sim_file;
diff --git a/src/gallium/drivers/v3d/v3d_uniforms.c b/src/gallium/drivers/v3d/v3d_uniforms.c
index 95eb838954f..1b8758bae7d 100644
--- a/src/gallium/drivers/v3d/v3d_uniforms.c
+++ b/src/gallium/drivers/v3d/v3d_uniforms.c
@@ -261,6 +261,7 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job,
struct v3d_compiled_shader *shader,
enum pipe_shader_type stage)
{
+ struct v3d_device_info *devinfo = &v3d->screen->devinfo;
struct v3d_constbuf_stateobj *cb = &v3d->constbuf[stage];
struct v3d_texture_stateobj *texstate = &v3d->tex[stage];
struct v3d_uniform_list *uinfo = &shader->prog_data.base->uniforms;
@@ -282,6 +283,9 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job,
struct v3d_cl_out *uniforms =
cl_start(&job->indirect);
+ float clipper_xy_granularity =
+ V3DV_X(devinfo, CLIPPER_XY_GRANULARITY);
+
for (int i = 0; i < uinfo->count; i++) {
uint32_t data = uinfo->data[i];
@@ -293,10 +297,10 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job,
cl_aligned_u32(&uniforms, gallium_uniforms[data]);
break;
case QUNIFORM_VIEWPORT_X_SCALE:
- cl_aligned_f(&uniforms, v3d->viewport.scale[0] * 256.0f);
+ cl_aligned_f(&uniforms, v3d->viewport.scale[0] * clipper_xy_granularity);
break;
case QUNIFORM_VIEWPORT_Y_SCALE:
- cl_aligned_f(&uniforms, v3d->viewport.scale[1] * 256.0f);
+ cl_aligned_f(&uniforms, v3d->viewport.scale[1] * clipper_xy_granularity);
break;
case QUNIFORM_VIEWPORT_Z_OFFSET:
diff --git a/src/gallium/drivers/v3d/v3dx_context.h b/src/gallium/drivers/v3d/v3dx_context.h
index 03d7c244ea2..c487ac3b996 100644
--- a/src/gallium/drivers/v3d/v3dx_context.h
+++ b/src/gallium/drivers/v3d/v3dx_context.h
@@ -51,3 +51,23 @@ void v3dX(get_internal_type_bpp_for_output_format)(uint32_t format,
*/
bool v3dX(tfu_supports_tex_format)(uint32_t tex_format,
bool for_mipmap);
+
+bool v3dX(tfu)(struct pipe_context *pctx,
+ struct pipe_resource *pdst,
+ struct pipe_resource *psrc,
+ unsigned int src_level,
+ unsigned int base_level,
+ unsigned int last_level,
+ unsigned int src_layer,
+ unsigned int dst_layer,
+ bool for_mipmap);
+
+int v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen,
+ unsigned index,
+ struct pipe_driver_query_group_info *info);
+int v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen,
+ unsigned index,
+ struct pipe_driver_query_info *info);
+struct pipe_query *v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d,
+ unsigned num_queries,
+ unsigned *query_types);
diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
index 0640dab1884..85083035ea6 100644
--- a/src/gallium/drivers/v3d/v3dx_draw.c
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -95,7 +95,25 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job)
#endif
assert(!job->msaa || !job->double_buffer);
-#if V3D_VERSION >= 40
+#if V3D_VERSION >= 71
+ cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
+ config.width_in_pixels = job->draw_width;
+ config.height_in_pixels = job->draw_height;
+
+ config.log2_tile_width = log2_tile_size(job->tile_width);
+ config.log2_tile_height = log2_tile_size(job->tile_height);
+
+                /* FIXME: ideally we would like the next assert to be on the
+                 * packet header (it is generic, so it also applies to GL).
+                 * We would need to expand gen_pack_header for that.
+                 */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+ }
+
+#endif
+
+#if V3D_VERSION >= 40 && V3D_VERSION <= 42
cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
config.width_in_pixels = job->draw_width;
config.height_in_pixels = job->draw_height;
@@ -107,7 +125,8 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job)
config.maximum_bpp_of_all_render_targets = job->internal_bpp;
}
-#else /* V3D_VERSION < 40 */
+#endif
+#if V3D_VERSION < 40
/* "Binning mode lists start with a Tile Binning Mode Configuration
* item (120)"
*
@@ -134,7 +153,7 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job)
config.maximum_bpp_of_all_render_targets = job->internal_bpp;
}
-#endif /* V3D_VERSION < 40 */
+#endif
/* There's definitely nothing in the VCD cache we want. */
cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
@@ -377,7 +396,9 @@ v3d_emit_gs_state_record(struct v3d_job *job,
gs_bin->prog_data.gs->base.threads == 4;
shader.geometry_bin_mode_shader_start_in_final_thread_section =
gs_bin->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
shader.geometry_bin_mode_shader_propagate_nans = true;
+#endif
shader.geometry_bin_mode_shader_uniforms_address =
gs_bin_uniforms;
@@ -387,7 +408,9 @@ v3d_emit_gs_state_record(struct v3d_job *job,
gs->prog_data.gs->base.threads == 4;
shader.geometry_render_mode_shader_start_in_final_thread_section =
gs->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
shader.geometry_render_mode_shader_propagate_nans = true;
+#endif
shader.geometry_render_mode_shader_uniforms_address =
gs_render_uniforms;
}
@@ -638,10 +661,6 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
shader.number_of_varyings_in_fragment_shader =
v3d->prog.fs->prog_data.fs->num_inputs;
- shader.coordinate_shader_propagate_nans = true;
- shader.vertex_shader_propagate_nans = true;
- shader.fragment_shader_propagate_nans = true;
-
shader.coordinate_shader_code_address =
cl_address(v3d_resource(v3d->prog.cs->resource)->bo,
v3d->prog.cs->offset);
@@ -652,6 +671,11 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
cl_address(v3d_resource(v3d->prog.fs->resource)->bo,
v3d->prog.fs->offset);
+#if V3D_VERSION <= 42
+ shader.coordinate_shader_propagate_nans = true;
+ shader.vertex_shader_propagate_nans = true;
+ shader.fragment_shader_propagate_nans = true;
+
/* XXX: Use combined input/output size flag in the common
* case.
*/
@@ -659,13 +683,24 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
v3d->prog.cs->prog_data.vs->separate_segments;
shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
v3d->prog.vs->prog_data.vs->separate_segments;
-
shader.coordinate_shader_input_vpm_segment_size =
v3d->prog.cs->prog_data.vs->separate_segments ?
v3d->prog.cs->prog_data.vs->vpm_input_size : 1;
shader.vertex_shader_input_vpm_segment_size =
v3d->prog.vs->prog_data.vs->separate_segments ?
v3d->prog.vs->prog_data.vs->vpm_input_size : 1;
+#endif
+        /* On V3D 7.1 there isn't a specific flag to select whether we are
+         * using shared or separate input/output segments. We just set
+         * vpm_input_size to 0 and set the output size to the maximum needed;
+         * that should already be properly set up on prog_data_vs_bin.
+         */
+#if V3D_VERSION == 71
+ shader.coordinate_shader_input_vpm_segment_size =
+ v3d->prog.cs->prog_data.vs->vpm_input_size;
+ shader.vertex_shader_input_vpm_segment_size =
+ v3d->prog.vs->prog_data.vs->vpm_input_size;
+#endif
shader.coordinate_shader_output_vpm_segment_size =
v3d->prog.cs->prog_data.vs->vpm_output_size;
@@ -724,9 +759,11 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
shader.instance_id_read_by_vertex_shader =
v3d->prog.vs->prog_data.vs->uses_iid;
+#if V3D_VERSION <= 42
shader.address_of_default_attribute_values =
cl_address(v3d_resource(vtx->defaults)->bo,
vtx->defaults_offset);
+#endif
}
bool cs_loaded_any = false;
@@ -1436,8 +1473,15 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info)
submit.cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
- /* Number of batches the dispatch will invoke (minus 1). */
- submit.cfg[4] = num_batches - 1;
+ /* Number of batches the dispatch will invoke.
+ * V3D 7.1.6 and later don't subtract 1 from the number of batches
+ */
+ if (v3d->screen->devinfo.ver < 71 ||
+ (v3d->screen->devinfo.ver == 71 && v3d->screen->devinfo.rev < 6)) {
+ submit.cfg[4] = num_batches - 1;
+ } else {
+ submit.cfg[4] = num_batches;
+ }
/* Make sure we didn't accidentally underflow. */
assert(submit.cfg[4] != ~0);
@@ -1445,7 +1489,8 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info)
v3d_job_add_bo(job, v3d_resource(v3d->prog.compute->resource)->bo);
submit.cfg[5] = (v3d_resource(v3d->prog.compute->resource)->bo->offset +
v3d->prog.compute->offset);
- submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
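+        /* On V3D 7.x NaN propagation is the default and the CFG5 bit is
+         * gone. */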
+ if (v3d->screen->devinfo.ver < 71)
+ submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
if (v3d->prog.compute->prog_data.base->single_seg)
submit.cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
if (v3d->prog.compute->prog_data.base->threads == 4)
@@ -1560,9 +1605,10 @@ v3d_tlb_clear(struct v3d_job *job, unsigned buffers,
/* GFXH-1461: If we were to emit a load of just depth or just stencil,
* then the clear for the other may get lost. We need to decide now
* if it would be possible to need to emit a load of just one after
- * we've set up our TLB clears.
+ * we've set up our TLB clears. This issue is fixed since V3D 4.3.18.
*/
- if (buffers & PIPE_CLEAR_DEPTHSTENCIL &&
+ if (v3d->screen->devinfo.ver <= 42 &&
+ buffers & PIPE_CLEAR_DEPTHSTENCIL &&
(buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL &&
job->zsbuf &&
util_format_is_depth_and_stencil(job->zsbuf->texture->format)) {
diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c
index 0ad3fb68b1e..82a45e44f82 100644
--- a/src/gallium/drivers/v3d/v3dx_emit.c
+++ b/src/gallium/drivers/v3d/v3dx_emit.c
@@ -512,13 +512,17 @@ v3dX(emit_state)(struct pipe_context *pctx)
/* Note: EZ state may update based on the compiled FS,
* along with ZSA
*/
+#if V3D_VERSION <= 42
config.early_z_updates_enable =
(job->ez_state != V3D_EZ_DISABLED);
+#endif
if (v3d->zsa->base.depth_enabled) {
config.z_updates_enable =
v3d->zsa->base.depth_writemask;
+#if V3D_VERSION <= 42
config.early_z_enable =
config.early_z_updates_enable;
+#endif
config.depth_test_function =
v3d->zsa->base.depth_func;
} else {
@@ -535,13 +539,27 @@ v3dX(emit_state)(struct pipe_context *pctx)
v3d_line_smoothing_enabled(v3d) ?
V3D_LINE_RASTERIZATION_PERP_END_CAPS :
V3D_LINE_RASTERIZATION_DIAMOND_EXIT;
- }
+#if V3D_VERSION >= 71
+                        /* The following follows the logic implemented in
+                         * v3dv, plus the definitions of depth_clip_near/far
+                         * and depth_clamp.
+                         *
+                         * Note: v3d doesn't support some extensions that
+                         * would affect this (like ARB_depth_clamp), but the
+                         * rasterizer values already take that into account.
+                         */
+ config.z_clipping_mode = v3d->rasterizer->base.depth_clip_near ||
+ v3d->rasterizer->base.depth_clip_far;
+#endif
+ }
}
if (v3d->dirty & V3D_DIRTY_RASTERIZER &&
v3d->rasterizer->base.offset_tri) {
- if (job->zsbuf &&
+ if (v3d->screen->devinfo.ver <= 42 &&
+ job->zsbuf &&
job->zsbuf->format == PIPE_FORMAT_Z16_UNORM) {
cl_emit_prepacked_sized(&job->bcl,
v3d->rasterizer->depth_offset_z16,
@@ -564,12 +582,23 @@ v3dX(emit_state)(struct pipe_context *pctx)
}
if (v3d->dirty & V3D_DIRTY_VIEWPORT) {
+#if V3D_VERSION <= 42
cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
clip.viewport_half_width_in_1_256th_of_pixel =
v3d->viewport.scale[0] * 256.0f;
clip.viewport_half_height_in_1_256th_of_pixel =
v3d->viewport.scale[1] * 256.0f;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+ clip.viewport_half_width_in_1_64th_of_pixel =
+ v3d->viewport.scale[0] * 64.0f;
+ clip.viewport_half_height_in_1_64th_of_pixel =
+ v3d->viewport.scale[1] * 64.0f;
+ }
+#endif
+
cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
clip.viewport_z_offset_zc_to_zs =
@@ -633,8 +662,10 @@ v3dX(emit_state)(struct pipe_context *pctx)
}
#endif
+ const uint32_t max_rts =
+ V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver);
if (blend->base.independent_blend_enable) {
- for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
+ for (int i = 0; i < max_rts; i++)
emit_rt_blend(v3d, job, &blend->base, i,
(1 << i),
v3d->blend_dst_alpha_one & (1 << i));
@@ -650,16 +681,16 @@ v3dX(emit_state)(struct pipe_context *pctx)
* RTs without.
*/
emit_rt_blend(v3d, job, &blend->base, 0,
- ((1 << V3D_MAX_DRAW_BUFFERS) - 1) &
+ ((1 << max_rts) - 1) &
v3d->blend_dst_alpha_one,
true);
emit_rt_blend(v3d, job, &blend->base, 0,
- ((1 << V3D_MAX_DRAW_BUFFERS) - 1) &
+ ((1 << max_rts) - 1) &
~v3d->blend_dst_alpha_one,
false);
} else {
emit_rt_blend(v3d, job, &blend->base, 0,
- (1 << V3D_MAX_DRAW_BUFFERS) - 1,
+ (1 << max_rts) - 1,
v3d->blend_dst_alpha_one);
}
}
@@ -668,8 +699,10 @@ v3dX(emit_state)(struct pipe_context *pctx)
if (v3d->dirty & V3D_DIRTY_BLEND) {
struct pipe_blend_state *blend = &v3d->blend->base;
+ const uint32_t max_rts =
+ V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver);
cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
- for (int i = 0; i < 4; i++) {
+ for (int i = 0; i < max_rts; i++) {
int rt = blend->independent_blend_enable ? i : 0;
int rt_mask = blend->rt[rt].colormask;
diff --git a/src/gallium/drivers/v3d/v3d_query_perfcnt.c b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c
similarity index 94%
rename from src/gallium/drivers/v3d/v3d_query_perfcnt.c
rename to src/gallium/drivers/v3d/v3dx_query_perfcnt.c
index e00d84e375f..431aad14b4f 100644
--- a/src/gallium/drivers/v3d/v3d_query_perfcnt.c
+++ b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c
@@ -52,8 +52,8 @@ kperfmon_destroy(struct v3d_context *v3d, struct v3d_perfmon_state *perfmon)
}
int
-v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index,
- struct pipe_driver_query_group_info *info)
+v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen, unsigned index,
+ struct pipe_driver_query_group_info *info)
{
if (!screen->has_perfmon)
return 0;
@@ -72,8 +72,8 @@ v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned inde
}
int
-v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index,
- struct pipe_driver_query_info *info)
+v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen, unsigned index,
+ struct pipe_driver_query_info *info)
{
if (!screen->has_perfmon)
return 0;
@@ -222,8 +222,8 @@ static const struct v3d_query_funcs perfcnt_query_funcs = {
};
struct pipe_query *
-v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries,
- unsigned *query_types)
+v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d, unsigned num_queries,
+ unsigned *query_types)
{
struct v3d_query_perfcnt *pquery = NULL;
struct v3d_query *query;
diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c
index 82547437c25..d3fbc9aff5d 100644
--- a/src/gallium/drivers/v3d/v3dx_rcl.c
+++ b/src/gallium/drivers/v3d/v3dx_rcl.c
@@ -23,8 +23,9 @@
#include "util/format/u_format.h"
#include "v3d_context.h"
-#include "broadcom/common/v3d_tiling.h"
#include "broadcom/common/v3d_macros.h"
+#include "broadcom/common/v3d_tiling.h"
+#include "broadcom/common/v3d_util.h"
#include "broadcom/cle/v3dx_pack.h"
#define PIPE_CLEAR_COLOR_BUFFERS (PIPE_CLEAR_COLOR0 | \
@@ -419,10 +420,16 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer)
* clearing Z/S.
*/
if (job->clear) {
+#if V3D_VERSION <= 42
cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = !job->early_zs_clear;
clear.clear_all_render_targets = true;
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
+#endif
+
}
#endif /* V3D_VERSION >= 40 */
}
@@ -483,10 +490,64 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer)
}
}
-#if V3D_VERSION >= 40
+#if V3D_VERSION > 33
+/* Note that for v71, render target cfg packets have just one field that
+ * combines the internal type and clamp mode. For simplicity we keep just one
+ * helper.
+ *
+ * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
+ */
+static uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ enum pipe_format format)
+{
+#if V3D_VERSION == 42
+ if (util_format_is_pure_integer(format)) {
+ return V3D_RENDER_TARGET_CLAMP_INT;
+ } else if (util_format_is_srgb(format)) {
+ return V3D_RENDER_TARGET_CLAMP_NORM;
+ } else {
+ return V3D_RENDER_TARGET_CLAMP_NONE;
+ }
+#endif
+#if V3D_VERSION >= 71
+ switch (rt_type) {
+ case V3D_INTERNAL_TYPE_8I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
+ case V3D_INTERNAL_TYPE_8UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_8:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_8;
+ case V3D_INTERNAL_TYPE_16I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
+ case V3D_INTERNAL_TYPE_16UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_16F:
+ return util_format_is_srgb(format) ?
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
+ V3D_RENDER_TARGET_TYPE_CLAMP_16F;
+ case V3D_INTERNAL_TYPE_32I:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
+ case V3D_INTERNAL_TYPE_32UI:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
+ case V3D_INTERNAL_TYPE_32F:
+ return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
+ default:
+ unreachable("Unknown internal render target type");
+ }
+ return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
+#endif
+ return 0;
+}
+#endif
+
+#if V3D_VERSION >= 71
static void
-v3d_setup_render_target(struct v3d_job *job, int cbuf,
- uint32_t *rt_bpp, uint32_t *rt_type, uint32_t *rt_clamp)
+v3d_setup_render_target(struct v3d_job *job,
+ int cbuf,
+ uint32_t *rt_bpp,
+ uint32_t *rt_type_clamp)
{
if (!job->cbufs[cbuf])
return;
@@ -497,19 +558,35 @@ v3d_setup_render_target(struct v3d_job *job, int cbuf,
struct v3d_surface *bsurf = v3d_surface(job->bbuf);
*rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp);
}
- *rt_type = surf->internal_type;
- if (util_format_is_srgb(surf->base.format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
-#if V3D_VERSION >= 42
- else if (util_format_is_pure_integer(surf->base.format))
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
-#endif
- else
- *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
+ *rt_type_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type,
+ surf->base.format);
}
+#endif
-#else /* V3D_VERSION < 40 */
+#if V3D_VERSION >= 40 && V3D_VERSION <= 42
+static void
+v3d_setup_render_target(struct v3d_job *job,
+ int cbuf,
+ uint32_t *rt_bpp,
+ uint32_t *rt_type,
+ uint32_t *rt_clamp)
+{
+ if (!job->cbufs[cbuf])
+ return;
+
+ struct v3d_surface *surf = v3d_surface(job->cbufs[cbuf]);
+ *rt_bpp = surf->internal_bpp;
+ if (job->bbuf) {
+ struct v3d_surface *bsurf = v3d_surface(job->bbuf);
+ *rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp);
+ }
+ *rt_type = surf->internal_type;
+ *rt_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type,
+ surf->base.format);
+}
+#endif
+#if V3D_VERSION < 40
static void
v3d_emit_z_stencil_config(struct v3d_job *job, struct v3d_surface *surf,
struct v3d_resource *rsc, bool is_separate_stencil)
@@ -656,7 +733,8 @@ emit_render_layer(struct v3d_job *job, uint32_t layer)
cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = NONE;
}
-#else
+#endif
+#if V3D_VERSION >= 40
for (int i = 0; i < 2; i++) {
if (i > 0)
cl_emit(&job->rcl, TILE_COORDINATES, coords);
@@ -664,16 +742,20 @@ emit_render_layer(struct v3d_job *job, uint32_t layer)
cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
store.buffer_to_store = NONE;
}
+
if (i == 0 || do_double_initial_tile_clear(job)) {
+#if V3D_VERSION < 71
cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
clear.clear_z_stencil_buffer = !job->early_zs_clear;
clear.clear_all_render_targets = true;
}
+#else
+ cl_emit(&job->rcl, CLEAR_RENDER_TARGETS, clear);
+#endif
}
cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
}
#endif
-
cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
v3d_rcl_emit_generic_per_tile_list(job, layer);
@@ -775,18 +857,52 @@ v3dX(emit_rcl)(struct v3d_job *job)
config.multisample_mode_4x = job->msaa;
config.double_buffer_in_non_ms_mode = job->double_buffer;
+#if V3D_VERSION <= 42
config.maximum_bpp_of_all_render_targets = job->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+ config.log2_tile_width = log2_tile_size(job->tile_width);
+ config.log2_tile_height = log2_tile_size(job->tile_height);
+
+        /* FIXME: ideally we would like the next assert to be on the packet
+         * header (it is generic, so it also applies to GL). We would need to
+         * expand gen_pack_header for that.
+         */
+ assert(config.log2_tile_width == config.log2_tile_height ||
+ config.log2_tile_width == config.log2_tile_height + 1);
+#endif
+
}
+#if V3D_VERSION >= 71
+ uint32_t base_addr = 0;
+
+        /* If we don't have any color RTs, we still need to emit one and flag
+         * it as unused with stride = 1.
+         */
+ if (job->nr_cbufs == 0) {
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.stride = 1; /* Unused */
+ }
+ }
+#endif
for (int i = 0; i < job->nr_cbufs; i++) {
struct pipe_surface *psurf = job->cbufs[i];
- if (!psurf)
+ if (!psurf) {
+#if V3D_VERSION >= 71
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.render_target_number = i;
+ rt.stride = 1; /* Unused */
+ }
+#endif
continue;
+ }
+
struct v3d_surface *surf = v3d_surface(psurf);
struct v3d_resource *rsc = v3d_resource(psurf->texture);
UNUSED uint32_t config_pad = 0;
- uint32_t clear_pad = 0;
+ UNUSED uint32_t clear_pad = 0;
/* XXX: Set the pad for raster. */
if (surf->tiling == V3D_TILING_UIF_NO_XOR ||
@@ -819,6 +935,7 @@ v3dX(emit_rcl)(struct v3d_job *job)
}
#endif /* V3D_VERSION < 40 */
+#if V3D_VERSION <= 42
cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1,
clear) {
clear.clear_color_low_32_bits = job->clear_color[i][0];
@@ -847,9 +964,42 @@ v3dX(emit_rcl)(struct v3d_job *job)
clear.render_target_number = i;
};
}
+#endif
+#if V3D_VERSION >= 71
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+ rt.clear_color_low_bits = job->clear_color[i][0];
+ v3d_setup_render_target(job, i, &rt.internal_bpp,
+ &rt.internal_type_and_clamping);
+ rt.stride =
+ v3d_compute_rt_row_row_stride_128_bits(job->tile_width,
+ v3d_internal_bpp_words(rt.internal_bpp));
+ rt.base_address = base_addr;
+ rt.render_target_number = i;
+
+ base_addr += (job->tile_height * rt.stride) / 8;
+ }
+
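+                /* The 128-bit clear color is split across up to three config
+                 * packets: the low 32 bits go in PART1, the next 40 bits in
+                 * PART2 (64 bpp and larger) and the top 56 bits in PART3
+                 * (128 bpp only).
+                 */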
+ if (surf->internal_bpp >= V3D_INTERNAL_BPP_64) {
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+ rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+ ((uint64_t) job->clear_color[i][1]) |
+ (((uint64_t) (job->clear_color[i][2] & 0xff)) << 32);
+ rt.render_target_number = i;
+ }
+ }
+
+ if (surf->internal_bpp >= V3D_INTERNAL_BPP_128) {
+ cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+ rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+ (((uint64_t) (job->clear_color[i][2] & 0xffffff00)) >> 8) |
+ (((uint64_t) (job->clear_color[i][3])) << 24);
+ rt.render_target_number = i;
+ }
+ }
+#endif
}
-#if V3D_VERSION >= 40
+#if V3D_VERSION >= 40 && V3D_VERSION <= 42
cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
v3d_setup_render_target(job, 0,
&rt.render_target_0_internal_bpp,
diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
index 0f1735fee66..a7fad572a2d 100644
--- a/src/gallium/drivers/v3d/v3dx_state.c
+++ b/src/gallium/drivers/v3d/v3dx_state.c
@@ -111,9 +111,10 @@ v3d_create_rasterizer_state(struct pipe_context *pctx,
#endif
}
- /* The HW treats polygon offset units based on a Z24 buffer, so we
+        /* V3D 4.x treats polygon offset units based on a Z24 buffer, so we
* need to scale up offset_units if we're only Z16.
*/
+#if V3D_VERSION <= 42
v3dx_pack(&so->depth_offset_z16, DEPTH_OFFSET, depth) {
depth.depth_offset_factor = cso->offset_scale;
depth.depth_offset_units = cso->offset_units * 256.0;
@@ -121,6 +122,7 @@ v3d_create_rasterizer_state(struct pipe_context *pctx,
depth.limit = cso->offset_clamp;
#endif
}
+#endif
return so;
}
@@ -138,8 +140,9 @@ v3d_create_blend_state(struct pipe_context *pctx,
so->base = *cso;
+ uint32_t max_rts = V3D_MAX_RENDER_TARGETS(V3D_VERSION);
if (cso->independent_blend_enable) {
- for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
+ for (int i = 0; i < max_rts; i++) {
so->blend_enables |= cso->rt[i].blend_enable << i;
/* V3D 4.x is when we got independent blend enables. */
@@ -148,7 +151,7 @@ v3d_create_blend_state(struct pipe_context *pctx,
}
} else {
if (cso->rt[0].blend_enable)
- so->blend_enables = (1 << V3D_MAX_DRAW_BUFFERS) - 1;
+ so->blend_enables = (1 << max_rts) - 1;
}
return so;
@@ -337,6 +340,20 @@ v3d_zsa_state_bind(struct pipe_context *pctx, void *hwcso)
v3d->dirty |= V3D_DIRTY_ZSA;
}
+
+static bool
+needs_default_attribute_values(void)
+{
+#if V3D_VERSION <= 42
+        /* FIXME: on Vulkan we are able to refine this even further, as we
+         * know at pipeline creation time whether we have an integer vertex
+         * attrib. We should check whether we could do something similar here.
+         */
+ return true;
+#endif
+ return false;
+}
+
static void *
v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements,
const struct pipe_vertex_element *elements)
@@ -414,24 +431,29 @@ v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements,
}
}
- /* Set up the default attribute values in case any of the vertex
- * elements use them.
- */
- uint32_t *attrs;
- u_upload_alloc(v3d->state_uploader, 0,
- V3D_MAX_VS_INPUTS * sizeof(float), 16,
- &so->defaults_offset, &so->defaults, (void **)&attrs);
-
- for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) {
- attrs[i * 4 + 0] = 0;
- attrs[i * 4 + 1] = 0;
- attrs[i * 4 + 2] = 0;
- if (i < so->num_elements &&
- util_format_is_pure_integer(so->pipe[i].src_format)) {
- attrs[i * 4 + 3] = 1;
- } else {
- attrs[i * 4 + 3] = fui(1.0);
+ if (needs_default_attribute_values()) {
+ /* Set up the default attribute values in case any of the vertex
+ * elements use them.
+ */
+ uint32_t *attrs;
+ u_upload_alloc(v3d->state_uploader, 0,
+ V3D_MAX_VS_INPUTS * sizeof(float), 16,
+ &so->defaults_offset, &so->defaults, (void **)&attrs);
+
+ for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) {
+ attrs[i * 4 + 0] = 0;
+ attrs[i * 4 + 1] = 0;
+ attrs[i * 4 + 2] = 0;
+ if (i < so->num_elements &&
+ util_format_is_pure_integer(so->pipe[i].src_format)) {
+ attrs[i * 4 + 3] = 1;
+ } else {
+ attrs[i * 4 + 3] = fui(1.0);
+ }
}
+ } else {
+ so->defaults = NULL;
+ so->defaults_offset = 0;
}
u_upload_unmap(v3d->state_uploader);
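For reference, each default attribute slot written in the loop above is the vec4 (0, 0, 0, 1); only the representation of the trailing 1 depends on whether the vertex format is pure-integer. A standalone sketch (assumes Mesa's fui() from util/u_math.h, which returns the bit pattern of a float):

#include <stdbool.h>
#include <stdint.h>
#include "util/u_math.h" /* fui() */

/* Write one default attribute as (0, 0, 0, 1): an integer 1 for
 * pure-integer vertex formats, the IEEE-754 bits of 1.0f otherwise.
 */
static void
write_default_attrib(uint32_t attr[4], bool is_pure_integer)
{
        attr[0] = attr[1] = attr[2] = 0;
        attr[3] = is_pure_integer ? 1 : fui(1.0f);
}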
@@ -699,21 +721,22 @@ v3d_upload_sampler_state_variant(void *map,
break;
}
- if (variant >= V3D_SAMPLER_STATE_32) {
- sampler.border_color_word_0 = border.ui[0];
- sampler.border_color_word_1 = border.ui[1];
- sampler.border_color_word_2 = border.ui[2];
- sampler.border_color_word_3 = border.ui[3];
- } else {
- sampler.border_color_word_0 =
- _mesa_float_to_half(border.f[0]);
- sampler.border_color_word_1 =
- _mesa_float_to_half(border.f[1]);
- sampler.border_color_word_2 =
- _mesa_float_to_half(border.f[2]);
- sampler.border_color_word_3 =
- _mesa_float_to_half(border.f[3]);
+#if V3D_VERSION <= 42
+ /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions
+ * for us. In V3D 4.x we need to manually convert floating point color
+ * values to the expected format.
+ */
+ if (variant < V3D_SAMPLER_STATE_32) {
+ border.ui[0] = _mesa_float_to_half(border.f[0]);
+ border.ui[1] = _mesa_float_to_half(border.f[1]);
+ border.ui[2] = _mesa_float_to_half(border.f[2]);
+ border.ui[3] = _mesa_float_to_half(border.f[3]);
}
+#endif
+ sampler.border_color_word_0 = border.ui[0];
+ sampler.border_color_word_1 = border.ui[1];
+ sampler.border_color_word_2 = border.ui[2];
+ sampler.border_color_word_3 = border.ui[3];
}
}
}
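The 4.x-only block above folds the old per-word conversion into a single step: for sampler state variants below V3D_SAMPLER_STATE_32, the border color is narrowed to half floats before being written out, while V3D 7.x TMUs take 32-bit floats directly. A minimal standalone sketch of that narrowing (assumes Mesa's _mesa_float_to_half() from util/half_float.h):

#include <stdint.h>
#include "util/half_float.h" /* _mesa_float_to_half() */

/* V3D 4.x only: convert a float border color to f16, one component per
 * 32-bit border color word (the f16 value sits in the low 16 bits).
 */
static void
border_color_to_f16(uint32_t words[4], const float rgba[4])
{
        for (int i = 0; i < 4; i++)
                words[i] = _mesa_float_to_half(rgba[i]);
}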
@@ -869,7 +892,8 @@ v3d_setup_texture_shader_state_from_buffer(struct V3DX(TEXTURE_SHADER_STATE) *te
}
static void
-v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex,
+v3d_setup_texture_shader_state(const struct v3d_device_info *devinfo,
+ struct V3DX(TEXTURE_SHADER_STATE) *tex,
struct pipe_resource *prsc,
int base_level, int last_level,
int first_layer, int last_layer,
@@ -917,19 +941,29 @@ v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex,
}
tex->base_level = base_level;
+
#if V3D_VERSION >= 40
tex->max_level = last_level;
/* Note that we don't have a job to reference the texture's sBO
* at state create time, so any time this sampler view is used
* we need to add the texture to the job.
*/
- tex->texture_base_pointer =
- cl_address(NULL,
- rsc->bo->offset +
- v3d_layer_offset(prsc, 0, first_layer));
+ const uint32_t base_offset = rsc->bo->offset +
+ v3d_layer_offset(prsc, 0, first_layer);
+
+ tex->texture_base_pointer = cl_address(NULL, base_offset);
#endif
+
tex->array_stride_64_byte_aligned = rsc->cube_map_stride / 64;
+#if V3D_VERSION >= 71
+ tex->chroma_offset_x = 1;
+ tex->chroma_offset_y = 1;
+ /* See the comment in the XML field definition for the rationale behind these shifts */
+ tex->texture_base_pointer_cb = base_offset >> 6;
+ tex->texture_base_pointer_cr = base_offset >> 6;
+#endif
+
/* Since other platform devices may produce UIF images even
* when they're not big enough for V3D to assume they're UIF,
* we force images with level 0 as UIF to be always treated
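The ">> 6" in the V3D 7.1 block above stores the CB/CR base pointers in 64-byte units, which presumes the base offset is 64-byte aligned. A hedged sketch of that packing (helper name illustrative, not from the patch):

#include <assert.h>
#include <stdint.h>

/* Pack a texture base address into a 64-byte-granular pointer field;
 * assumes, as the shift above implies, that the offset is 64B aligned.
 */
static inline uint32_t
pack_base_pointer_64b(uint32_t base_offset)
{
        assert((base_offset & 0x3f) == 0);
        return base_offset >> 6;
}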
@@ -977,7 +1011,8 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d,
v3dx_pack(map, TEXTURE_SHADER_STATE, tex) {
if (prsc->target != PIPE_BUFFER) {
- v3d_setup_texture_shader_state(&tex, prsc,
+ v3d_setup_texture_shader_state(&v3d->screen->devinfo,
+ &tex, prsc,
cso->u.tex.first_level,
cso->u.tex.last_level,
cso->u.tex.first_layer,
@@ -990,7 +1025,13 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d,
cso->u.buf.size);
}
- tex.srgb = util_format_is_srgb(cso->format);
+ bool is_srgb = util_format_is_srgb(cso->format);
+#if V3D_VERSION <= 42
+ tex.srgb = is_srgb;
+#endif
+#if V3D_VERSION >= 71
+ tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+#endif
#if V3D_VERSION >= 40
tex.swizzle_r = v3d_translate_pipe_swizzle(so->swizzle[0]);
@@ -1040,7 +1081,10 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d,
* shader code if we wanted to read an MSAA sRGB
* texture without sRGB decode.
*/
+#if V3D_VERSION <= 42
tex.srgb = false;
+#endif
+
} else {
tex.texture_type = v3d_get_tex_format(&screen->devinfo,
cso->format);
@@ -1404,7 +1448,8 @@ v3d_create_image_view_texture_shader_state(struct v3d_context *v3d,
v3dx_pack(map, TEXTURE_SHADER_STATE, tex) {
if (prsc->target != PIPE_BUFFER) {
- v3d_setup_texture_shader_state(&tex, prsc,
+ v3d_setup_texture_shader_state(&v3d->screen->devinfo,
+ &tex, prsc,
iview->base.u.tex.level,
iview->base.u.tex.level,
iview->base.u.tex.first_layer,
diff --git a/src/gallium/drivers/v3d/v3dx_tfu.c b/src/gallium/drivers/v3d/v3dx_tfu.c
new file mode 100644
index 00000000000..d6b51390a11
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3dx_tfu.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright © 2021 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3d_context.h"
+#include "broadcom/common/v3d_tfu.h"
+
+bool
+v3dX(tfu)(struct pipe_context *pctx,
+ struct pipe_resource *pdst,
+ struct pipe_resource *psrc,
+ unsigned int src_level,
+ unsigned int base_level,
+ unsigned int last_level,
+ unsigned int src_layer,
+ unsigned int dst_layer,
+ bool for_mipmap)
+{
+ struct v3d_context *v3d = v3d_context(pctx);
+ struct v3d_screen *screen = v3d->screen;
+ struct v3d_resource *src = v3d_resource(psrc);
+ struct v3d_resource *dst = v3d_resource(pdst);
+ struct v3d_resource_slice *src_base_slice = &src->slices[src_level];
+ struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level];
+ int msaa_scale = pdst->nr_samples > 1 ? 2 : 1;
+ int width = u_minify(pdst->width0, base_level) * msaa_scale;
+ int height = u_minify(pdst->height0, base_level) * msaa_scale;
+ enum pipe_format pformat;
+
+ if (psrc->format != pdst->format)
+ return false;
+ if (psrc->nr_samples != pdst->nr_samples)
+ return false;
+
+ if (pdst->target != PIPE_TEXTURE_2D || psrc->target != PIPE_TEXTURE_2D)
+ return false;
+
+ /* Can't write to raster. */
+ if (dst_base_slice->tiling == V3D_TILING_RASTER)
+ return false;
+
+ /* When using the TFU for a blit we are doing an exact copy (input and
+ * output formats must match, no scaling, etc.), so there are no pixel
+ * format conversions. Thus we can rewrite the format to one that is
+ * TFU-compatible based on its texel size.
+ */
+ if (for_mipmap) {
+ pformat = pdst->format;
+ } else {
+ switch (dst->cpp) {
+ case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT; break;
+ case 8: pformat = PIPE_FORMAT_R16G16B16A16_FLOAT; break;
+ case 4: pformat = PIPE_FORMAT_R32_FLOAT; break;
+ case 2: pformat = PIPE_FORMAT_R16_FLOAT; break;
+ case 1: pformat = PIPE_FORMAT_R8_UNORM; break;
+ default: unreachable("unsupported format bit-size"); break;
+ };
+ }
+
+ uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat);
+
+ if (!v3dX(tfu_supports_tex_format)(tex_format, for_mipmap)) {
+ assert(for_mipmap);
+ return false;
+ }
+
+ v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false);
+ v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false);
+
+ struct drm_v3d_submit_tfu tfu = {
+ .ios = (height << 16) | width,
+ .bo_handles = {
+ dst->bo->handle,
+ src != dst ? src->bo->handle : 0
+ },
+ .in_sync = v3d->out_sync,
+ .out_sync = v3d->out_sync,
+ };
+ uint32_t src_offset = (src->bo->offset +
+ v3d_layer_offset(psrc, src_level, src_layer));
+ tfu.iia |= src_offset;
+
+ uint32_t dst_offset = (dst->bo->offset +
+ v3d_layer_offset(pdst, base_level, dst_layer));
+ tfu.ioa |= dst_offset;
+
+ switch (src_base_slice->tiling) {
+ case V3D_TILING_UIF_NO_XOR:
+ case V3D_TILING_UIF_XOR:
+ tfu.iis |= (src_base_slice->padded_height /
+ (2 * v3d_utile_height(src->cpp)));
+ break;
+ case V3D_TILING_RASTER:
+ tfu.iis |= src_base_slice->stride / src->cpp;
+ break;
+ case V3D_TILING_LINEARTILE:
+ case V3D_TILING_UBLINEAR_1_COLUMN:
+ case V3D_TILING_UBLINEAR_2_COLUMN:
+ break;
+ }
+
+#if V3D_VERSION <= 42
+ if (src_base_slice->tiling == V3D_TILING_RASTER) {
+ tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER <<
+ V3D33_TFU_ICFG_FORMAT_SHIFT);
+ } else {
+ tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE +
+ (src_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
+ V3D33_TFU_ICFG_FORMAT_SHIFT);
+ }
+ tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT;
+
+ if (last_level != base_level)
+ tfu.ioa |= V3D33_TFU_IOA_DIMTW;
+
+ tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE +
+ (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
+ V3D33_TFU_IOA_FORMAT_SHIFT);
+
+ tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT;
+
+ /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
+ * OPAD field for the destination (how many extra UIF blocks beyond
+ * those necessary to cover the height). When filling mipmaps, the
+ * miplevel 1+ tiling state is inferred.
+ */
+ if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR ||
+ dst_base_slice->tiling == V3D_TILING_UIF_XOR) {
+ int uif_block_h = 2 * v3d_utile_height(dst->cpp);
+ int implicit_padded_height = align(height, uif_block_h);
+
+ tfu.icfg |= (((dst_base_slice->padded_height -
+ implicit_padded_height) / uif_block_h) <<
+ V3D33_TFU_ICFG_OPAD_SHIFT);
+ }
+#endif /* V3D_VERSION <= 42 */
+
+#if V3D_VERSION >= 71
+ if (src_base_slice->tiling == V3D_TILING_RASTER) {
+ tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT;
+ } else {
+ tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE +
+ (src_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
+ V3D71_TFU_ICFG_IFORMAT_SHIFT;
+ }
+ tfu.icfg |= tex_format << V3D71_TFU_ICFG_OTYPE_SHIFT;
+
+ if (last_level != base_level)
+ tfu.v71.ioc |= V3D71_TFU_IOC_DIMTW;
+
+ tfu.v71.ioc |= ((V3D71_TFU_IOC_FORMAT_LINEARTILE +
+ (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
+ V3D71_TFU_IOC_FORMAT_SHIFT);
+
+ switch (dst_base_slice->tiling) {
+ case V3D_TILING_UIF_NO_XOR:
+ case V3D_TILING_UIF_XOR:
+ tfu.v71.ioc |=
+ (dst_base_slice->padded_height / (2 * v3d_utile_height(dst->cpp))) <<
+ V3D71_TFU_IOC_STRIDE_SHIFT;
+ break;
+ case V3D_TILING_RASTER:
+ tfu.v71.ioc |= (dst_base_slice->padded_height / dst->cpp) <<
+ V3D71_TFU_IOC_STRIDE_SHIFT;
+ break;
+ default:
+ break;
+ }
+
+ tfu.v71.ioc |= (last_level - base_level) << V3D71_TFU_IOC_NUMMM_SHIFT;
+#endif /* V3D_VERSION >= 71 */
+
+ int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu);
+ if (ret != 0) {
+ fprintf(stderr, "Failed to submit TFU job: %d\n", ret);
+ return false;
+ }
+
+ dst->writes++;
+
+ return true;
+}
+
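As a usage sketch (not part of the patch; the fallback helper is hypothetical), a mipmap-generation path would try the TFU first and fall back to a rendered blit whenever v3dX(tfu) returns false, e.g. for formats or tilings the TFU rejects:

#include "v3d_context.h"

/* Hypothetical caller: try the TFU for mipmap generation, fall back to
 * a shader blit when the TFU declines the job.
 */
static void
generate_mipmaps_tfu_first(struct pipe_context *pctx,
                           struct pipe_resource *prsc)
{
        bool ok = v3dX(tfu)(pctx, prsc, prsc,
                            0,                /* src_level */
                            1,                /* base_level */
                            prsc->last_level, /* last_level */
                            0, 0,             /* src_layer, dst_layer */
                            true);            /* for_mipmap */
        if (!ok)
                fallback_blit_mipmaps(pctx, prsc); /* hypothetical fallback */
}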
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 1c3f77f6588..9bdefb55194 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -610,6 +610,10 @@ struct pipe_resource
unsigned bind; /**< bitmask of PIPE_BIND_x */
unsigned flags; /**< bitmask of PIPE_RESOURCE_FLAG_x */
+ /* Hack for avoiding sync on v3d */
+ unsigned sync_condition;
+ unsigned sync_status;
+
/**
* For planar images, ie. YUV EGLImage external, etc, pointer to the
* next plane.
diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c
index 32135770e9d..2534c817dcc 100644
--- a/src/loader/loader_dri3_helper.c
+++ b/src/loader/loader_dri3_helper.c
@@ -275,7 +275,7 @@ dri3_update_max_num_back(struct loader_dri3_drawable *draw)
if (draw->swap_interval == 0)
draw->max_num_back = 4;
else
- draw->max_num_back = 3;
+ draw->max_num_back = 2;
assert(draw->max_num_back <= LOADER_DRI3_MAX_BACK);
break;
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 77c38bf48d5..1eb2dac8018 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1058,6 +1058,9 @@ struct gl_texture_object
* the pipe_resource *pt above.
*/
bool needs_validation;
+
+ /* Hack for avoiding sync on v3d */
+ GLboolean SyncCondition;
};
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index d8fb1ed4317..048deaa02f6 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -273,6 +273,13 @@ set_tex_parameteri(struct gl_context *ctx,
}
switch (pname) {
+ case GL_SYNC_CONDITION:
+ if (!!texObj->SyncCondition == !!params[0])
+ return GL_FALSE;
+ texObj->SyncCondition = !!params[0];
+ return GL_TRUE;
+ case GL_SYNC_STATUS:
+ return GL_TRUE;
case GL_TEXTURE_MIN_FILTER:
if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target))
goto invalid_dsa;
@@ -930,6 +937,17 @@ _mesa_texture_parameter_invalidate(struct gl_context *ctx,
{
if (texparam_invalidates_sampler_views(pname))
st_texture_release_all_sampler_views(st_context(ctx), texObj);
+
+ switch (pname) {
+ case GL_SYNC_CONDITION:
+ texObj->pt->sync_condition = texObj->SyncCondition;
+ break;
+ case GL_SYNC_STATUS:
+ texObj->pt->sync_status = 1;
+ break;
+ default:
+ ; /* nothing */
+ }
}
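For illustration only: this downstream hack overloads the fence-sync enums as texture parameters, which is not standard GL and only meaningful with this patched Mesa. A client such as a compositor could tag a texture roughly like this:

#include <GLES3/gl3.h>

/* Downstream-only hack: reuse GL_SYNC_CONDITION as a texture parameter
 * so the v3d driver can skip implicit sync on this resource. Not
 * standard GL; relies on the set_tex_parameteri() change above.
 */
static void
disable_implicit_sync(GLuint tex)
{
        glBindTexture(GL_TEXTURE_2D, tex);
        glTexParameteri(GL_TEXTURE_2D, GL_SYNC_CONDITION, 1);
}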
void
diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf
index 24cc2888755..2bc2748e7fe 100644
--- a/src/util/00-mesa-defaults.conf
+++ b/src/util/00-mesa-defaults.conf
@@ -77,6 +77,7 @@ TODO: document the other workarounds.
<!-- using vulkan wsi for xservers causes deadlocks -->
<application name="Xwayland" executable="Xwayland">
<option name="disable_xcb_surface" value="true" />
+ <option name="v3d_is_xserver_process" value="true" />
</application>
<application name="Unigine Heaven (32-bit)" executable="heaven_x86">
@@ -750,6 +751,7 @@ TODO: document the other workarounds.
<application name="mutter" executable="mutter">
<option name="adaptive_sync" value="false" />
<option name="v3d_nonmsaa_texture_size_limit" value="true" />
+ <option name="v3d_maintain_ignorable_scanout" value="true" />
</application>
<application name="muffin" executable="muffin">
<option name="adaptive_sync" value="false" />
@@ -801,6 +803,7 @@ TODO: document the other workarounds.
</application>
<application name="Xorg" executable="Xorg">
<option name="v3d_nonmsaa_texture_size_limit" value="true" />
+ <option name="v3d_is_xserver_process" value="true" />
</application>
<application name="gfxbench" executable="testfw_app">
diff --git a/src/util/driconf.h b/src/util/driconf.h
index ab7aa2c6553..70fa9f7b41b 100644
--- a/src/util/driconf.h
+++ b/src/util/driconf.h
@@ -517,6 +517,14 @@
DRI_CONF_OPT_B(v3d_nonmsaa_texture_size_limit, def, \
"Report the non-MSAA-only texture size limit")
+#define DRI_CONF_V3D_IS_XSERVER_PROCESS(def) \
+ DRI_CONF_OPT_B(v3d_is_xserver_process, def, \
+ "Identifies if the application is the Xserver.")
+
+#define DRI_CONF_V3D_MAINTAIN_IGNORABLE_SCANOUT(def) \
+ DRI_CONF_OPT_B(v3d_maintain_ignorable_scanout, def, \
+ "Maintain SCANOUT usage on resource allocations when the environment allows ignoring SCANOUT usage.")
+
/**
* \brief virgl specific configuration options
*/